def run_ag_and_verify_results(cmd_info):
    """Run ag for one test case and compare its output with the expectation.

    The command line is the path returned by ``ag_exe_path()`` followed by
    ``cmd_info.cmd`` split on whitespace. ``fatal`` is called when the exit
    code is non-zero, when stderr is not empty, or when the normalized stdout
    differs from the normalized ``cmd_info.expected``.
    """
    command = [ag_exe_path()]
    command += cmd_info.cmd.split()
    out, err, exit_code = util.run_cmd(*command)

    if exit_code != 0:
        fatal("Error %d. Stdout:\n'%s'\n Stderr:\n'%s'\n" % (exit_code, out, err))
    if err != "":
        fatal("Non-empty stderr. Stdout:\n'%s'\n Stderr:\n'%s'\n" % (out, err))

    # TODO: don't know why there's 0 at the end of stdout, so strip it
    if out[-1:] == "\x00":
        out = out[:-1]

    actual = util.normalize_str(out)
    # A single trailing newline is not significant for the comparison.
    if actual[-1:] == '\n':
        actual = actual[:-1]

    wanted = util.normalize_str(cmd_info.expected)
    if actual != wanted:
        fatal("Unexpected value. Stdout:\n'%s'\nExpected:\n'%s'\n" % (actual, wanted))
def get_members(self):
    """Get all known parliament members within this session.

    The members are loaded from ``data/composition/{session}.json`` the
    first time this is called (i.e. while ``self.members`` is empty); later
    calls return the already populated list.

    After loading, every ``replaces`` entry has its ``name`` key swapped for
    a ``member`` key holding the uuid of the referenced member, and the
    ``_members_fn_ln`` lookup (normalized "{last_name} {first_name}" ->
    member) is rebuilt.

    Returns:
        list(Member): list of all known members within the session
    """
    if not self.members:
        # Pin the encoding: without it the file is decoded with the platform
        # locale, which corrupts non-ASCII names on non-UTF-8 systems.
        with open(f'data/composition/{self.session}.json',
                  encoding='utf-8') as json_file:
            data = json.load(json_file)

        for entry in data:
            self.members.append(Member.from_json(entry))

        # Now that we have all members, link them
        for member, entry in zip(self.members, data):
            if 'replaces' in entry:
                replaces = entry['replaces']
                for replacement in replaces:
                    # NOTE(review): find_member() yields None for an unknown
                    # name, in which case the ``.uuid`` access below raises
                    # AttributeError - confirm the data never triggers this.
                    referenced_member = self.find_member(
                        replacement['name'])
                    del replacement['name']
                    replacement['member'] = referenced_member.uuid
                member.set_replaces(replaces)

        self._members_fn_ln = {
            normalize_str(f'{member.last_name} {member.first_name}'): member
            for member in self.members
        }
    return self.members
def get_members_dict(self):
    """Return the lookup table from formatted member names to members.

    The table is built from ``self.members`` whenever ``self.members_dict``
    is empty and is stored back on ``self.members_dict``. Every member is
    registered under "{first}, {last}" (also with '-' replaced by a space)
    and under that name followed by the party, with and without a comma.
    Members of "Vooruit" are additionally registered with "sp.a" as party.
    """
    if self.members_dict:
        return self.members_dict

    self.members_dict = {}
    lookup = self.members_dict
    for member in self.members:
        first = normalize_str(member.first_name).decode()
        last = normalize_str(member.last_name).decode()
        base = f'{first}, {last}'

        keys = [
            base,
            f'{base} {member.party}',
            f'{base}, {member.party}',
            base.replace('-', ' '),
        ]
        if member.party == "Vooruit":
            keys.append(f'{base}, sp.a')
            keys.append(f'{base} sp.a')

        # Later members overwrite earlier ones on identical keys.
        for key in keys:
            lookup[key] = member
    return self.members_dict
def _initialize(self, retry=0):
    """Fetch this document's description page and populate its fields.

    Reads the page at ``self.description_uri()``. Nothing is changed when
    the ``Story`` div is missing or its text contains "not found". When the
    relevant cells are present, ``date``, ``descriptor``, ``keywords`` and
    ``title`` are set and recognised authors are appended to
    ``self.authors``; unrecognised author names are printed with a "D:"
    prefix. ``document_type`` is read unconditionally, so a page without a
    'Document type' row raises ``IndexError``.

    Args:
        retry (int): Number of attempts already made. While the page shows
            "Er heeft zich een fout voorgedaan" (Dutch for "an error has
            occurred") the fetch is repeated, giving up once ``retry``
            reaches 10.
    """
    response = self.session.requests_session.get(self.description_uri())
    soup = BeautifulSoup(
        response.content, 'lxml', from_encoding=response.encoding)

    content = soup.find('div', {'id': 'Story'})
    if not content or "not found" in content.get_text():
        return

    if "Er heeft zich een fout voorgedaan" in content.get_text():
        if retry < 10:
            self._initialize(retry=retry + 1)
        else:
            print('Gave up on', self.description_uri())
        return

    def value_text(label_cell):
        # The value is the text of the last <td> in the label's row.
        return label_cell.parent.find_all('td')[-1].get_text()

    def labelled_cells(label):
        # All "td1x" cells whose text mentions the given label.
        return [cell for cell in soup.find_all('td', {'class': "td1x"})
                if label in cell.get_text()]

    # Date: prefer the labelled row, otherwise any cell that looks like a date.
    date_cell = soup.find('td', text=re.compile('Indieningsdatum'))
    if date_cell:
        self.date = dateparser.parse(value_text(date_cell), languages=['nl'])
    else:
        date_cell = soup.find(
            'td', text=re.compile('[0-9]+/[0-9]+/[0-9]+'))
        if date_cell:
            self.date = dateparser.parse(
                date_cell.get_text(), languages=['nl'])

    descriptor_cell = soup.find(
        'td', text=re.compile('Eurovoc-hoofddescriptor'))
    if descriptor_cell:
        self.descriptor = value_text(descriptor_cell).split(' | ')

    keywords_cell = soup.find('td', text=re.compile('Eurovoc descriptoren'))
    if keywords_cell:
        self.keywords = value_text(keywords_cell).split(' | ')

    heading = content.find('h4')
    if heading:
        self.title = heading.get_text().strip()

    type_cells = labelled_cells('Document type')
    type_value = type_cells[0].parent.find('td', {'class': 'td0x'})
    # The first three characters of the first text node are dropped.
    self.document_type = type_value.find_all(text=True)[0][3:]

    author_cells = labelled_cells('Auteur(s)')
    if author_cells:
        value_cell = author_cells[0].parent.find('td', {'class': 'td0x'})
        names = [
            text.strip()
            for text in value_cell.find_all(text=True)
            if (not str(text).isspace()) and ', ' in text
        ]
        for raw_name in names:
            name = normalize_str(raw_name).decode()
            members = self.session.get_members_dict()
            if name in members:
                self.authors.append(members[name])
                continue
            extracted = extract_name(name)
            if extracted in members:
                self.authors.append(members[extracted])
            else:
                print("D:" + name)
def has_name(self, query: str):
    """Tell whether ``query`` names this member.

    All comparisons are made on ``normalize_str`` output. The query matches
    when it equals one of the member's alternative names, the last name on
    its own, "{last_name} {first_name}" or "{first_name} {last_name}".

    Args:
        query (str): Name as seen in the meeting notes of the parliament.

    Returns:
        bool: Is this the name of this member
    """
    wanted = normalize_str(query)
    last_first = normalize_str("%s %s" % (self.last_name, self.first_name))

    # Fallback for alternative names
    if self.alternative_names and any(
            wanted == normalize_str(alias)
            for alias in self.alternative_names):
        return True

    # Fallback for meetings in session 52, < 90
    if wanted == normalize_str(self.last_name):
        return True

    if wanted == last_first:
        return True
    return wanted == normalize_str(f'{self.first_name} {self.last_name}')
def _initialize(self, retry=0):
    """Fetch this question's description page and populate its fields.

    Reads the page at ``self.description_uri()``. Nothing is changed when
    the page has no ``<body>`` or its text contains "does not exist". When
    the relevant cells are present, recognised authors are appended to
    ``self.authors`` (unrecognised names are printed with a "Q:" prefix) and
    ``responding_minister``, ``responding_department``, ``title`` and
    ``date`` are set.

    Args:
        retry (int): Number of attempts already made. While the page shows
            "Er heeft zich een fout voorgedaan" (Dutch for "an error has
            occurred") the fetch is repeated, giving up once ``retry``
            reaches 10.
    """
    response = self.session.requests_session.get(self.description_uri())
    soup = BeautifulSoup(
        response.content, 'lxml', from_encoding=response.encoding)

    body = soup.find('body')
    if not body or "does not exist" in body.get_text():
        return

    if "Er heeft zich een fout voorgedaan" in body.get_text():
        if retry < 10:
            self._initialize(retry=retry + 1)
        else:
            print('Gave up on', self.description_uri())
        return

    author_cells = [
        cell for cell in soup.find_all('td')
        if 'Auteur(s)' in cell.get_text()
    ]
    if author_cells:
        lines = author_cells[0].parent.find_all('td')[1].get_text().split('\n')
        # Keep only "x, y" style lines and drop everything from the last
        # comma onwards.
        names = [
            line.strip().rsplit(',', 1)[0]
            for line in lines
            if (not str(line).isspace()) and ', ' in line
        ]
        for raw_name in names:
            name = normalize_str(raw_name).decode()
            members = self.session.get_members_dict()
            if name in members:
                self.authors.append(members[name])
                continue
            extracted = extract_name(name)
            if extracted in members:
                self.authors.append(members[extracted])
            else:
                print("Q:" + name)

    minister_label = soup.find(
        'i', text=re.compile('Antwoordende minister'))
    if minister_label:
        minister_row = minister_label.find_parent('tr')
        # The last character of the minister cell is dropped.
        self.responding_minister = (
            minister_row.find_all('td')[1].get_text().strip()[:-1])
        self.responding_department = (
            minister_label.find_parent('tr').find_next('tr')
            .get_text().strip())

    title_label = soup.find('i', text=re.compile('Titel'))
    if title_label:
        raw_title = title_label.find_parent('tr').find_all('td')[
            1].get_text().strip()
        # Trim every line and drop the blank ones.
        self.title = "\n".join(
            part.strip() for part in raw_title.split('\n') if part.strip())

    date_label = soup.find('i', text=re.compile('Datum bespreking'))
    if date_label:
        date_text = date_label.find_parent('tr').find_all('td')[
            1].get_text().strip()
        self.date = dateparser.parse(date_text, languages=['nl'])
def parse_topics(language):
    """Register the topics listed in the notes for one language.

    Title paragraphs (class ``classes[1]`` of the language mapping) are
    consumed from last to first. Paragraphs that do not start with
    "<number> <text>" are prepended to the title being assembled; the
    numbered paragraph completes it and supplies the topic number. The
    section is the preceding sibling paragraph with class ``classes[0]``,
    falling back to "Algemeen" (NL) or "Generale" (otherwise).

    For ``Language.NL`` a ``TopicActivity`` is posted to every member whose
    normalized name occurs in the normalized, lower-cased title.

    Raises:
        IndexError: when the paragraphs run out while a title is still
            waiting for its numbered first line.
    """
    numbered = re.compile("([0-9]+) (.*)")
    classes = Meeting.language_mapping[language]
    pending = soup.find_all('p', {'class': classes[1]})

    title_lines = ""
    while pending:
        paragraph = pending.pop()
        if not clean_string(paragraph.text):
            continue

        # Walk backwards until the numbered first line of the title.
        match = numbered.match(clean_string(paragraph.text))
        while not match:
            title_lines = clean_string(paragraph.text) + '\n' + title_lines
            paragraph = pending.pop()
            match = numbered.match(clean_string(paragraph.text))
        title_lines = match.group(2) + '\n' + title_lines

        section = paragraph.find_previous_sibling(
            "p", {"class": classes[0]})
        number = int(match.group(1))
        if number not in self.topics:
            self.topics[number] = MeetingTopic(
                self.parliamentary_session, self, number)

        full_title = title_lines.rstrip()
        self.topics[number].set_title(language, full_title)

        if section:
            section_name = clean_string(section.text)
        elif language == Language.NL:
            section_name = "Algemeen"
        else:
            section_name = "Generale"
        self.topics[number].set_section(language, section_name)
        self.topics[number].complete_type()

        if language == Language.NL:
            searchable = normalize_str(full_title.lower()).decode()
            for member in self.parliamentary_session.get_members():
                if member.normalized_name() in searchable:
                    member.post_activity(
                        TopicActivity(member, self, self.topics[number]))

        title_lines = ""
def find_member(self, query: str):
    """Find the Member that a name from the meeting notes refers to.

    The member list is loaded first if it is still empty. The normalized
    query is looked up in the ``_members_fn_ln`` index; on a miss every
    member is asked through ``has_name``. When nobody matches, a message is
    printed.

    Args:
        query (str): String formatted as is typical in the meeting notes
            ("{last_name} {first_name}")

    Returns:
        Member: the related Member if one is found, otherwise ``None``.
    """
    if not self.members:
        self.get_members()

    key = normalize_str(query)
    if key in self._members_fn_ln:
        return self._members_fn_ln[key]

    # Slow path: alternative names, last name only, "first last" order.
    found = next(
        (candidate for candidate in self.members
         if candidate.has_name(query)),
        None)
    if found is not None:
        return found

    print(f'Undefined member: {query}')
    return None
def get_members(self):
    """Get all known parliament members within this session.

    The members are loaded from ``data/composition/<session>.json`` the
    first time this is called (i.e. while ``self.members`` is empty); later
    calls return the already populated list.

    After loading, every ``replaces`` entry has its ``name`` key swapped for
    a ``member`` key holding the uuid of the referenced member, and the
    ``_members_fn_ln`` lookup (normalized "{last_name} {first_name}" ->
    member) is rebuilt.

    Returns:
        list(Member): list of all known members within the session
    """
    if not self.members:
        # Pin the encoding: without it the file is decoded with the platform
        # locale, which corrupts non-ASCII names on non-UTF-8 systems.
        with open('data/composition/%d.json' % self.session,
                  encoding='utf-8') as json_file:
            data = json.load(json_file)

        for entry in data:
            # TODO: member should probably take entry at construction time
            # instead of using these setters
            member = Member(entry['first_name'], entry['last_name'],
                            entry['party'], entry['province'],
                            entry['language'], entry['wiki'])
            # Optional keys are applied only when present in the entry.
            if 'alternative_names' in entry:
                member.set_alternative_names(entry['alternative_names'])
            member.set_gender(entry['gender'])
            member.set_date_of_birth(entry['date_of_birth'])
            if 'photo_url' in entry:
                member.set_photo_url(entry['photo_url'])
            self.members.append(member)

        # Now that we have all members, link them
        for member, entry in zip(self.members, data):
            if 'replaces' in entry:
                replaces = entry['replaces']
                for replacement in replaces:
                    # NOTE(review): find_member() yields None for an unknown
                    # name, in which case the ``.uuid`` access below raises
                    # AttributeError - confirm the data never triggers this.
                    referenced_member = self.find_member(
                        replacement['name'])
                    del replacement['name']
                    replacement['member'] = referenced_member.uuid
                member.set_replaces(replaces)

        self._members_fn_ln = {
            normalize_str(f'{member.last_name} {member.first_name}'): member
            for member in self.members
        }
    return self.members
publication_doi = None if 'group-title' in data: subject = data['group-title'] else: subject = None authors = [] author_names = [] author_surnames = [] if 'author' in data: for author in data['author']: for key in author.keys(): if key in ['family', 'given', 'suffix', 'name']: author[key] = util.normalize_str(author[key]) author_name = '' if 'given' in author: author_name = author['given'] if 'family' in author: author_name = author_name + ' ' + author['family'] author_surnames.append(author['family']) if 'suffix' in author: author_name = author_name + ' ' + author['suffix'] # it seems that if name is present, there are no other name fields (e.g. given, family) if 'name' in author: author_name = author_name + ' ' + author['name']
def normalized_name(self):
    """Return the lower-cased "{first_name} {last_name}" after normalization.

    The lower-cased full name is passed through ``normalize_str`` and the
    result is decoded before being returned.
    """
    full_name = f"{self.first_name} {self.last_name}"
    return normalize_str(full_name.lower()).decode()
def saveSheet(species):
    """Write the name-validation results to ``plant_flora_valid_names.xlsx``.

    The workbook is created inside ``config.dirname``. Row 0 holds the red
    column titles; every entry of ``species`` then fills one row with its
    name, the Flora and Plantlist status/name, a remark on which part of the
    name differs (genus, species or author) and a "diferente" flag when the
    first two words of the Flora and Plantlist names disagree.

    Any exception raised while handling an entry is printed and the row
    counter is not advanced, so the next entry reuses that row.
    """
    target = os.path.join(config.dirname, "plant_flora_valid_names.xlsx")
    workbook = xlsxwriter.Workbook(target)
    worksheet = workbook.add_worksheet()
    title_format = workbook.add_format(properties={'font_color': 'red'})

    # put title on row 0
    headers = (
        "Nome Especie",
        "Status Flora",
        "Nome Flora",
        "Observacao",
        "Status Plantlist",
        "Nome Plantlist",
        "Observacao",
        "Flora x Plantlist",
    )
    for column, header in enumerate(headers):
        worksheet.write(0, column, header, title_format)

    def binomial(full_name):
        # First two space-separated words; IndexError when there is only one.
        words = full_name.split(" ")
        return words[0] + " " + words[1]

    def describe_difference(name_words, other_words, key=lambda word: word):
        # Report the first name part (genus, species, author) that differs.
        if key(name_words[0]) != key(other_words[0]):
            return "Genero Diferente"
        if key(name_words[1]) != key(other_words[1]):
            return "Especie Diferente"
        return "Autor Diferente"

    row = 1
    for specie in species:
        try:
            has_flora = "florabrasil" in specie
            has_plantlist = "plantlist" in specie

            # check flora x plant
            flora_plant = ""
            if (has_flora and has_plantlist
                    and specie["florabrasil"] != ""
                    and specie["plantlist"] != ""):
                plantlist_name = binomial(specie["plantlist"])
                flora_name = binomial(specie["florabrasil"])
                if plantlist_name != flora_name:
                    flora_plant = "diferente"

            obs_flora = ""
            if (has_flora and specie["florabrasil"] != ""
                    and normalize_str(specie["florabrasil"])
                    != normalize_str(specie["nome"])):
                obs_flora = describe_difference(
                    specie["nome"].split(" "),
                    specie["florabrasil"].split(" "),
                    key=normalize_str)

            obs_plantlist = ""
            if (has_plantlist and specie["plantlist"] != ""
                    and specie["plantlist"] != specie["nome"]):
                obs_plantlist = describe_difference(
                    specie["nome"].split(" "),
                    specie["plantlist"].split(" "))

            # Cells are evaluated and written one at a time, left to right.
            worksheet.write(row, 0, specie["nome"])
            flora_status = specie["status_florabrasil"]
            worksheet.write(
                row, 1,
                "" if flora_status == "nao_encontrado" else flora_status)
            worksheet.write(
                row, 2, specie["florabrasil"] if has_flora else "")
            worksheet.write(row, 3, obs_flora)
            plantlist_status = specie["status_plantlist"]
            worksheet.write(
                row, 4,
                "" if plantlist_status == "nao_encontrado"
                else plantlist_status)
            worksheet.write(
                row, 5, specie["plantlist"] if has_plantlist else "")
            worksheet.write(row, 6, obs_plantlist)
            worksheet.write(row, 7, flora_plant)
            row += 1
        except Exception as e:
            print(e)

    workbook.close()
if first_author_str == '': first_author_str = first_author['full_name'] break if first_author_str != '': break except: pass osf_records[data['id']]['first_author_str'] = first_author_str new_records_processed = 0 with open(OUTPUT_FILE, 'a') as o: for id_, record in osf_records.items(): result = crossref.search_title_author(util.normalize_str(record['title']), util.normalize_str(record['first_author_str']), record['doi']) result['id'] = id_ new_records_processed += 1 json.dump(result, o) o.write('\n') o.flush() print('Search results for {} records collected.'.format(new_records_processed)) ########################################################## # input(osf_records[data['DOI'].split('/')[2]])
if author['sequence'] == 'first': if 'given' in author: first_author_str = author['given'] if 'family' in author: first_author_str = first_author_str + ' ' + author[ 'family'] if 'suffix' in author: first_author_str = first_author_str + ' ' + author[ 'suffix'] # it seems that if name is present, there are no other name fields (e.g. given, family) if 'name' in author: first_author_str = first_author_str + ' ' + author[ 'name'] break result = crossref.search_title_author( util.normalize_str(title), util.normalize_str(first_author_str), record['DOI']) result['id'] = record['DOI'] new_records_processed += 1 json.dump(result, o) o.write('\n') o.flush() print('Search results for {} records collected.'.format(new_records_processed))