def sort_name_to_display_name(sort_name):
    """Convert a "Last Name, First Name"-style sort_name into a
    "First Name Last Name" display name for catalog listings.

    Name recognition is only a first-line approximation; curated
    human-checked names from the Metadata Wrangler are preferred
    when available.

    :param sort_name: e.g. "Doe, Jane"
    :return: e.g. "Jane Doe", or None when sort_name is empty
    """
    if not sort_name:
        return None

    parsed = HumanName(sort_name)
    # parsed exposes title, first, middle, last, suffix, nickname.
    # Wrap any nickname in parentheses for display.
    if parsed.nickname:
        parsed.nickname = '(' + parsed.nickname + ')'

    parts = [parsed.title, parsed.first, parsed.nickname,
             parsed.middle, parsed.last, parsed.suffix]
    return name_tidy(u' '.join(parts))
def clean_authors(authors):
    """Split a raw author string into a list of "Last, First"-formatted
    unicode names.

    Raises ValueError when a parsed name lacks a first or last component.
    """
    authors = authors.lower()
    # Strip commas before suffixes (Jr., III) and drop degree/role noise,
    # plus a couple of known special cases.
    for old, new in ((", jr.", " jr."),
                     (", iii", " iii"),
                     (", ph.d", ""),
                     ("organizer:", ""),
                     ("roel m,", "roel m.")):
        authors = authors.replace(old, new)

    # This one known name contains a comma and must not be split on it.
    if authors == 'kozue miyashiro, etsuko harada, t.':
        author_list = ['kozue miyashiro', 'etsuko harada, t.']
    else:
        author_list = authors.split(",")

    cleaned_authors = []
    for raw in author_list:
        parsed = HumanName(raw.lower())
        if parsed.first == '' or parsed.last == '':
            raise ValueError("invalid author name: {}".format(parsed))
        parsed.capitalize()
        parsed.string_format = u"{last}, {title} {first} {middle}, {suffix}"
        cleaned_authors.append(unicode(parsed))
    return cleaned_authors
def course_event_title_and_contact(course):
    """Look up a course's SWS section and return a dict with the long course
    title plus the first instructor's display name, uwnetid and email.

    Missing section/meeting/instructor attributes fall back to empty
    strings; a 404 from SWS yields all-empty values.  Any other
    DataFailureException is re-raised.
    """
    try:
        section = get_sws_section(course)
        meeting = section.meetings[0] if hasattr(
            section, 'meetings') and len(section.meetings) else None
        instructor = meeting.instructors[0] if hasattr(
            meeting, 'instructors') and len(meeting.instructors) else None
        first_name = instructor.first_name if hasattr(
            instructor, 'first_name') else ''
        surname = instructor.surname if hasattr(instructor, 'surname') else ''
        uwnetid = instructor.uwnetid if hasattr(instructor, 'uwnetid') else ''
        email = instructor.email1 if hasattr(instructor, 'email1') else ''
        name = HumanName(' '.join([first_name, surname]))
        name.capitalize()
    except DataFailureException as err:
        if err.status == 404:
            section = None
            name = None
            email = None
            uwnetid = None
        else:
            raise
    return {
        'title_long': section.course_title_long if section else '',
        'name': '%s %s' % (name.first, name.last) if name else '',
        'uwnetid': uwnetid if uwnetid else '',
        # BUG FIX: the original applied "%" to a literal with no "%s"
        # placeholder, raising TypeError whenever email was empty but a
        # uwnetid was available.  Domain appears redacted in this source --
        # confirm the real fallback domain before shipping.
        'email': email if email and len(email) else (
            "%s@*****.**" % uwnetid if uwnetid else '')
    }
def set_parsed_name(self):
    """Parse self.name into a dict of name parts on self.parsed_name,
    or set it to None when self.name is unset/empty."""
    if self.name:
        self.parsed_name = HumanName(self.name).as_dict()
    else:
        self.parsed_name = None
def normalize(name):
    """Lower-case, whitespace-collapse, ASCII-fold and de-markup a name,
    then return only its first/middle/last parts with every non-letter,
    non-space character removed."""
    collapsed = " ".join(name.strip().lower().split())
    folded = unidecode(collapsed).replace("-", " ")
    # Drop any HTML tags before handing the string to the parser.
    cleaned = re.sub(r'<[^>]+>', r'', folded)
    parsed = HumanName(cleaned)
    parsed.string_format = '{first} {middle} {last}'
    return re.sub(r'[^a-z\s]', r'', str(parsed))
def parse_name(item):
    """
    Parse a name string (possibly containing an email) into an Author.

    :param item: String containing author name and possibly email
    :return: Author, or None when no usable name could be parsed
    """
    # Pull out the first email address, if any.
    match = emailRegex.search(item)
    email = match.group(0) if match else None

    # Strip the email, honorific prefixes and suffixes from the string.
    item = emailWithBracketsRegex.sub("", item)
    item = prefixRegex.sub("", item)
    item = suffixRegex.sub("", item)

    # Drop parenthesised words, unless the whole string is parenthesised.
    if not bracketRegex.fullmatch(item):
        item = bracketRegex.sub(" ", item)

    # Strip extraneous characters, then parse what remains.
    item = item.strip(strip_chars)
    name = HumanName(item)
    if not name.full_name:
        app.logger.warning(
            "Unable to parse name string %s: no full_name returned in name %s",
            item,
            name,
        )
        return None

    # Only force capitalization of names with mixed capitalization above
    # a certain percentage.
    name.capitalize(force=force_capitalization(name.full_name))

    author = Author()
    if name.first:
        author.givenname = name.first
    if name.last:
        author.familyname = name.last
    if name.middle:
        author.middlename = name.middle
    if email:
        author.email = email
    if not name.first and not name.last:
        author.name = name.full_name
    else:
        author.create_full_name()
    return author
def test_assignment_to_full_name(self):
    """Reassigning full_name must reparse every name component."""
    hn = HumanName("John A. Kenneth Doe, Jr.")
    for attr, expected in (("first", "John"),
                           ("last", "Doe"),
                           ("middle", "A. Kenneth"),
                           ("suffix", "Jr.")):
        self.m(getattr(hn, attr), expected, hn)

    hn.full_name = "Juan Velasquez y Garcia III"
    for attr, expected in (("first", "Juan"),
                           ("last", "Velasquez y Garcia"),
                           ("suffix", "III")):
        self.m(getattr(hn, attr), expected, hn)
def display_full_name_with_correct_capitalization(full_name):
    """
    Capitalize a full name using nameparser's HumanName.

    See documentation here:
    https://github.com/derek73/python-nameparser

    :param full_name: raw full-name string
    :return: the name with standard capitalization applied
    """
    # BUG FIX: str.strip() returns a new string; the original call
    # discarded the result, leaving surrounding whitespace in place.
    full_name = full_name.strip()
    full_name_parsed = HumanName(full_name)
    full_name_parsed.capitalize()
    full_name_capitalized = str(full_name_parsed)
    return full_name_capitalized
def get_display_name(self):
    """Return the stored display name, or a capitalized "First Last"
    derived from the first/last name fields."""
    if self.has_display_name():
        return self.display_name

    if self.has_first_name():
        raw = "%s %s" % (self.first_name, self.last_name)
    else:
        raw = self.last_name
    name = HumanName(raw)
    name.capitalize()
    name.string_format = "{first} {last}"
    return str(name)
def user_fullname(user):
    """Best-effort display name for a user object.

    Prefers display_name; rebuilds a capitalized "First Last" from
    first_name/surname when display_name is missing, empty or all-caps.
    Falls back to the email local part (CanvasUser), otherwise raises
    UserPolicyException.
    """
    if hasattr(user, 'display_name'):
        needs_rebuild = (user.display_name is None or
                         not len(user.display_name) or
                         user.display_name.isupper())
        if needs_rebuild and hasattr(user, 'first_name'):
            fullname = HumanName('%s %s' % (user.first_name, user.surname))
            fullname.capitalize()
            fullname.string_format = '{first} {last}'
            return str(fullname)
        return user.display_name
    if hasattr(user, 'email'):
        # CanvasUser
        return user.email.split('@')[0]
    raise UserPolicyException('Invalid user')
def extractFirstName(name, order):
    """Heuristically extract a lower-cased first name from a raw name string.

    Splits on dots/dashes/underscores, strips digits and '?', then tries
    the nameparser library, falling back to positional heuristics and
    CamelCase splitting.  Returns '' when no plausible first name is found.

    :param name: raw name string
    :param order: name-part ordering hint passed through to the helpers
    """
    # Treat dots and dashes as word separators.
    name = ' '.join(name.split('.'))
    name = ' '.join(name.split('-'))

    # Remove digits; if that empties the string, keep them as underscores.
    # BUG FIX: regex patterns are now raw strings -- "\d" and "\?" are
    # invalid escape sequences in non-raw literals on modern Python.
    oldname = name
    name = re.sub(r"\d+", "", name)
    if not len(name):
        name = re.sub(r"\d+", "_", oldname)
    oldname = name

    # Same treatment for question marks.
    name = re.sub(r"\?", "", name)
    if not len(name):
        name = re.sub(r"\?", "_", oldname)
    name = ' '.join(name.split('_'))

    # Use the Python name parser first.
    try:
        firstName = getFirstNameFromHumanName(HumanName(name), order)
    except Exception:  # narrowed from bare except: don't trap SystemExit
        firstName = getFirstNameFromSplitName(name.split(), order)

    # If the parser failed to split the name, fall back to heuristics.
    if firstName.strip() == name.strip():
        # firstName('Ben Voigt') == 'Ben Voigt'!!!
        if len(name.split()) == 2:
            firstName = getFirstNameFromSplitName(name.split(), order)
        else:
            # Try CamelCase.
            uncamel = ' '.join(splitCamelCase(name).split('_'))
            if uncamel != name:
                try:
                    firstName = HumanName(uncamel).first
                    if len(firstName.split()) == 2:
                        firstName = getFirstNameFromSplitName(
                            firstName.split(), order)
                except Exception:
                    firstName = getFirstNameFromSplitName(
                        uncamel.split(), order)

    # 'Mc' and single letters are name fragments, not first names.
    if firstName == 'Mc':
        firstName = ''
    if len(firstName) == 1:
        firstName = ''
    return firstName.lower()
def catogorize_by_instructor(data):
    """Group FCE records into an {"First Last": Instructor} mapping,
    appending a Course for each record under its instructor."""
    instructorsDict = {}
    for fce in data:
        # Skip records without a usable instructor string.
        if 'instructor' not in fce or len(fce['instructor']) <= 2:
            continue
        name = HumanName(fce['instructor'])
        name.capitalize()
        instructor = "{} {}".format(name.first, name.last).strip()
        if len(instructor) > 2:
            course = Course(fce)
            if instructor not in instructorsDict:
                instructorsDict[instructor] = Instructor(str(name))
            instructorsDict[instructor]['courses'].append(course)
    return instructorsDict
def display_name_to_sort_name(display_name):
    """Convert a "First Name Last Name" display name into a
    "Last Name, First Name" sort name.

    Names matching known corporate-entity patterns are returned whole.
    Otherwise HumanName parses the name into parts, which are reassembled
    in sorting order and tidied.
    """
    if not display_name:
        return None

    # TODO: to humanname: PhD, Ph.D. Sister, Queen are titles and suffixes

    # Corporate names are used as-is, without rearrangement.
    if is_corporate_name(display_name):
        return display_name

    # Tidy the common PhD and MD suffixes so HumanName recognizes them
    # better, then parse into title/first/middle/last/suffix/nickname.
    name = HumanName(name_tidy(display_name))
    if name.nickname:
        name.nickname = '(' + name.nickname + ')'

    # Note: When first/middle initials arrive with a space between them,
    # they keep that space, consistent with period-free initials.  So:
    # 'Classy, A. B.' and 'Classy Abe B.' and 'Classy A. Barney' and
    # 'Classy, Abe Barney' and 'Classy, A B'.
    if name.last:
        tail = u' '.join([name.first, name.middle, name.suffix,
                          name.nickname, name.title])
        sort_name = u''.join([name.last, ", ", tail])
    else:
        # Examples: 'Pope Francis', 'Prince'.
        sort_name = u' '.join([name.first, name.middle, name.suffix,
                               name.nickname])
        if name.title:
            sort_name = u''.join([name.title, ", ", sort_name])

    return name_tidy(sort_name)
def show_representatives():
    """Address-lookup view: store a submitted address in the session,
    fetch elected officials for it, and render them with parsed names."""
    form = AddressLookup()
    if form.validate_on_submit():
        session['address'] = form.address.data
        # Redirect so a refresh does not resubmit the form (PRG pattern).
        return redirect(url_for('main.show_representatives'))

    address = session.get('address')
    form.address.data = address
    representatives = {}
    if address:
        representatives = get_representativeinfo(
            session.get('address'),
            current_app.config['ELECTION_API_KEY'])
        # BUG FIX: guard the 'officials' key -- the API response may omit
        # it, and iterating representatives['officials'] unconditionally
        # raised KeyError.
        for representative in representatives.get('officials', []):
            human_name = HumanName(representative['name'])
            logging.debug(human_name.as_dict())
            representative['first_name'] = human_name.first
            representative['last_name'] = human_name.last
    return render_template('show_representativeinfo.html',
                           representatives=representatives,
                           lookupform=form)
def add():
    # Register a new person: validate the uploaded photo, save it under
    # folder_name/images/, and append a row to the folder's CSV manifest.
    # Returns a dict with a "status" key ("ok" or "photo_invalid").
    global initialised
    if not initialised:
        initialise()
    # Parse the submitted full name into first/last components.
    name = HumanName(request.forms.get('name'))
    first_name = name.first.upper()
    last_name = name.last.upper()
    department = request.forms.get('department').upper()
    designation = request.forms.get('designation').upper()
    photo = request.files.get('photo')
    # `name` is reused here: now the photo's base filename (sans extension).
    name, ext = os.path.splitext(photo.filename)
    if ext not in ('.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG'):
        rv = {"status": "photo_invalid"}
        return dict(rv)
    if not os.path.exists(folder_name + "/images/"):
        os.makedirs(folder_name + "/images/")
    # Build a canonical photo filename from the person's details.
    name = first_name + "_" + last_name + "_" + department + "_" + designation
    photo.filename = name.replace(" ", "_") + ext
    photo.save(folder_name + "/images/")
    # NOTE(review): 'ab+' binary-append mode implies Python 2 csv usage;
    # Python 3 would need text mode with newline='' -- confirm runtime.
    with open(folder_name + "/" + folder_name + ".csv", 'ab+') as csv_file:
        fieldnames = ['firstname', 'lastname', 'designation', 'department',
                      'photo']
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        csv_writer.writerow({
            'firstname': first_name,
            'lastname': last_name,
            'department': department,
            'designation': designation,
            'photo': "images/"+photo.filename
        })
    rv = {"status": "ok"}
    return dict(rv)
def test_assignment_to_attribute(self):
    """Each name attribute must accept direct assignment."""
    hn = HumanName("John A. Kenneth Doe, Jr.")
    hn.last = "de la Vega"
    self.m(hn.last, "de la Vega", hn)
    for attr in ("title", "first", "middle", "suffix"):
        setattr(hn, attr, "test")
        self.m(getattr(hn, attr), "test", hn)
def standardize(res):
    """Normalize a result dict in place: capitalize names, lowercase the
    email and strip the 'Faculty - ' prefix from the affiliation."""
    def _capitalized(raw):
        parsed = HumanName(raw)
        parsed.capitalize()
        return str(parsed)

    # Standardize name(s)
    res['list_name'] = _capitalized(res['list_name'])
    if 'detail_name' in res:
        res['detail_name'] = _capitalized(res['detail_name'])

    # Lowercase email
    if 'list_email' in res:
        res['list_email'] = res['list_email'].lower()

    # Remove `Faculty - ` from affiliation
    res['list_affiliation'] = res['list_affiliation'].replace('Faculty - ', '')
    return res
def HumanNameFmXML(self, ell):
    """Build a HumanName from child XML elements (First, Middle, Last,
    Title, Suffix, NickName); unknown tags are ignored."""
    tag_to_attr = {
        'First': 'first',
        'Middle': 'middle',
        'Last': 'last',
        'Title': 'title',
        'Suffix': 'suffix',
        'NickName': 'nickname',
    }
    hn = HumanName()
    for el in ell:
        attr = tag_to_attr.get(el.tag)
        if attr is not None:
            setattr(hn, attr, el.text)
    return hn
def person_name_from_xml(self, ell):
    '''Create a person name from an XML element's children; tags other
    than First/Middle/Last/Title/Suffix/NickName are ignored.'''
    lookup = dict((('First', 'first'), ('Middle', 'middle'),
                   ('Last', 'last'), ('Title', 'title'),
                   ('Suffix', 'suffix'), ('NickName', 'nickname')))
    hname = HumanName()
    for elm in ell:
        field = lookup.get(elm.tag)
        if field is not None:
            setattr(hname, field, elm.text)
    return hname
def parse_rss(message):
    """ Parse Feeds into the CMS Module """
    db = current.db
    s3db = current.s3db
    table = s3db.msg_rss
    # Load the stored RSS record for this message id.
    record = db(table.message_id == message.message_id).select(
        table.channel_id,
        table.title,
        table.from_address,
        table.body,
        table.date,
        table.location_id,
        table.tags,
        table.author,
        limitby=(0, 1)).first()
    if not record:
        return
    post_table = s3db.cms_post
    # Is this an Update or a Create?
    body = record.body or record.title
    url = record.from_address
    if url:
        # Match an existing post via its source-document URL.
        doc_table = s3db.doc_document
        exists = db(doc_table.url == url).select(doc_table.doc_id,
                                                 limitby=(0, 1)).first()
        if exists:
            exists = db(post_table.doc_id == exists.doc_id).select(
                post_table.id,
                limitby=(0, 1)).first()
    else:
        # Use Body
        exists = db(post_table.body == body).select(post_table.id,
                                                    limitby=(0, 1)).first()
    channel_id = record.channel_id
    tags = record.tags
    author = record.author
    if author:
        # Link the post to a pr_person record, creating one if needed.
        ptable = s3db.pr_person
        # https://code.google.com/p/python-nameparser/
        from nameparser import HumanName
        name = HumanName(author)
        first_name = name.first
        middle_name = name.middle
        last_name = name.last
        query = (ptable.first_name == first_name) & \
                (ptable.middle_name == middle_name) & \
                (ptable.last_name == last_name)
        pexists = db(query).select(ptable.id,
                                   limitby=(0, 1)).first()
        if pexists:
            person_id = pexists.id
        else:
            person_id = ptable.insert(first_name=first_name,
                                      middle_name=middle_name,
                                      last_name=last_name)
            s3db.update_super(ptable, {"id": person_id})
    else:
        person_id = None
    if exists:
        # Update the existing post in place.
        post_id = exists.id
        db(post_table.id == post_id).update(
            title=record.title,
            body=body,
            # @ToDo: Remove created_on when we know not used in rendering
            created_on=record.date,
            date=record.date,
            location_id=record.location_id,
            person_id=person_id,
            )
        # Read existing Tags (which came from remote)
        ttable = db.cms_tag
        ltable = db.cms_tag_post
        query = (ltable.post_id == post_id) & \
                (ltable.mci == 1) & \
                (ltable.tag_id == ttable.id)
        rows = db(query).select(ttable.name)
        # Compare these to tags in current version of post
        old_tags = [r.name for r in rows]
        new_tags = []
        delete_tags = []
        for tag in tags:
            if tag not in old_tags:
                new_tags.append(tag)
        for tag in old_tags:
            if tag not in tags:
                delete_tags.append(tag)
        if new_tags or delete_tags:
            # Resolve all affected tag names to ids in a single query.
            lookup_tags = []
            lookup_tags.extend(new_tags)
            lookup_tags.extend(delete_tags)
            _tags = db(ttable.name.belongs(lookup_tags)).select(
                ttable.id,
                ttable.name,
                ).as_dict(key="name")
            for t in new_tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(post_id=post_id,
                              tag_id=tag_id,
                              mci=1,  # This is an imported record, not added natively
                              )
            for t in delete_tags:
                tag = _tags.get(t, None)
                if tag:
                    query = (ltable.post_id == post_id) & \
                            (ltable.tag_id == tag["id"]) & \
                            (ltable.mci == 1) & \
                            (ltable.deleted == False)
                    db(query).delete()
    else:
        # Default to 'News' series
        table = db.cms_series
        # NOTE(review): raises AttributeError if no 'News' series exists --
        # confirm it is guaranteed by setup (cf. the guarded variant in
        # parse_rss_2_cms).
        series_id = db(table.name == "News").select(table.id,
                                                    cache=s3db.cache,
                                                    limitby=(0, 1)).first().id
        post_id = post_table.insert(
            title=record.title,
            body=body,
            # @ToDo: Remove created_on when we know not used in rendering
            created_on=record.date,
            date=record.date,
            location_id=record.location_id,
            person_id=person_id,
            series_id=series_id,
            mci=1,  # This is an imported record, not added natively
            )
        record = {"id": post_id}
        # update_super fills in the super-entity keys (e.g. doc_id) on record.
        s3db.update_super(post_table, record)
        # Source link
        if url:
            doc_table.insert(doc_id=record["doc_id"],
                             url=url,
                             )
        # Is this feed associated with an Org/Network?
        def lookup_pe(channel_id):
            # Resolve the channel's URL to a pr_contact/pr_pentity pair and
            # return (instance type, concrete record id), or (None, None).
            ctable = s3db.msg_rss_channel
            channel_url = db(ctable.channel_id == channel_id).select(
                ctable.url,
                limitby=(0, 1)).first().url
            ctable = s3db.pr_contact
            ptable = s3db.pr_pentity
            query = (ctable.contact_method == "RSS") & \
                    (ctable.value == channel_url) & \
                    (ctable.pe_id == ptable.pe_id)
            pe = db(query).select(ptable.pe_id,
                                  ptable.instance_type,
                                  limitby=(0, 1)).first()
            if pe:
                pe_type = pe.instance_type
                otable = s3db[pe_type]
                org_id = db(otable.pe_id == pe.pe_id).select(
                    otable.id,
                    limitby=(0, 1),
                    ).first().id
                return pe_type, org_id
            else:
                return None, None
        # Cache the channel->org lookup for 120 seconds.
        pe_type, org_id = current.cache.ram("pe_channel_%s" % channel_id,
                                            lambda: lookup_pe(channel_id),
                                            time_expire=120)
        if pe_type == "org_organisation":
            s3db.cms_post_organisation.insert(post_id=post_id,
                                              organisation_id=org_id,
                                              )
        elif pe_type == "org_group":
            s3db.cms_post_organisation_group.insert(post_id=post_id,
                                                    group_id=org_id,
                                                    )
        if tags:
            ttable = db.cms_tag
            ltable = db.cms_tag_post
            _tags = db(ttable.name.belongs(tags)).select(
                ttable.id,
                ttable.name,
                ).as_dict(key="name")
            for t in tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(post_id=post_id,
                              tag_id=tag_id,
                              mci=1,  # This is an imported record, not added natively
                              )
    # No Reply
    return
def execute(self, obj):
    """Parse the given string into a HumanName instance and return it."""
    parsed = HumanName(obj)
    return parsed
def extract_last_name(self):
    "Extracts last name from name feature using nameparser."
    def _last(raw):
        return HumanName(raw).last

    self.Xy["last_name"] = self.Xy.name.apply(_last)
def parse_rss_2_cap(message):
    """ Parse RSS Feeds into the CAP Module """
    db = current.db
    s3db = current.s3db
    table = s3db.msg_rss
    # Load the stored RSS record for this message id.
    record = db(table.message_id == message.message_id).select(
        table.channel_id,
        table.title,
        table.from_address,
        table.body,
        table.date,
        table.location_id,
        table.author,
        limitby=(0, 1)).first()
    if not record:
        return
    channel_id = record.channel_id
    alert_table = s3db.cap_alert
    info_table = s3db.cap_info
    # Is this an Update or a Create?
    # @ToDo: Use guid?
    # Use Body
    body = record.body or record.title
    query = (info_table.description == body)
    exists = db(query).select(info_table.id,
                              limitby=(0, 1)).first()
    author = record.author
    if author:
        # Link the alert to a pr_person record, creating one if needed.
        ptable = s3db.pr_person
        # https://code.google.com/p/python-nameparser/
        from nameparser import HumanName
        name = HumanName(author)
        first_name = name.first
        middle_name = name.middle
        last_name = name.last
        query = (ptable.first_name == first_name) & \
                (ptable.middle_name == middle_name) & \
                (ptable.last_name == last_name)
        pexists = db(query).select(ptable.id,
                                   limitby=(0, 1)).first()
        if pexists:
            person_id = pexists.id
        else:
            person_id = ptable.insert(first_name=first_name,
                                      middle_name=middle_name,
                                      last_name=last_name)
            s3db.update_super(ptable, dict(id=person_id))
    else:
        person_id = None
    if exists:
        # @ToDo: Use XSLT
        info_id = exists.id
        db(info_table.id == info_id).update(headline=record.title,
                                            description=body,
                                            created_on=record.date,
                                            #location_id = record.location_id,
                                            #person_id = person_id,
                                            )
    else:
        # Embedded link
        url = record.from_address
        try:
            file = fetch(url)
        except urllib2.URLError:
            # NOTE(review): `response` and `output` are not defined in this
            # function -- presumably web2py environment globals; verify.
            response.error = str(sys.exc_info()[1])
            return output
        except urllib2.HTTPError:
            response.error = str(sys.exc_info()[1])
            return output
        File = StringIO(file)
        # Import via XSLT
        resource = s3db.resource("cap_alert")
        stylesheet = os.path.join(current.request.folder, "static", "formats",
                                  "cap", "import.xsl")
        success = resource.import_xml(File, stylesheet=stylesheet)
    # No Reply
    return
def parse_rss_2_cms(message):
    """ Parse Feeds into the CMS Module """
    db = current.db
    s3db = current.s3db
    table = s3db.msg_rss
    # Load the stored RSS record for this message id.
    record = db(table.message_id == message.message_id).select(
        table.channel_id,
        table.title,
        table.from_address,
        table.body,
        table.date,
        table.location_id,
        table.tags,
        table.author,
        limitby=(0, 1)).first()
    if not record or not record.body:
        return
    post_table = s3db.cms_post
    # Is this an Update or a Create?
    body = record.body or record.title
    url = record.from_address
    if url:
        # Match an existing post via its source-document URL.
        doc_table = s3db.doc_document
        exists = db(doc_table.url == url).select(doc_table.doc_id,
                                                 limitby=(0, 1)).first()
        if exists:
            exists = db(post_table.doc_id == exists.doc_id).select(
                post_table.id,
                limitby=(0, 1)).first()
    else:
        # Use Body
        exists = db(post_table.body == body).select(post_table.id,
                                                    limitby=(0, 1)).first()
    channel_id = record.channel_id
    tags = record.tags
    author = record.author
    if author:
        # Link the post to a pr_person record, creating one if needed.
        ptable = s3db.pr_person
        # https://code.google.com/p/python-nameparser/
        from nameparser import HumanName
        name = HumanName(author)
        first_name = name.first
        middle_name = name.middle
        last_name = name.last
        query = (ptable.first_name == first_name) & \
                (ptable.middle_name == middle_name) & \
                (ptable.last_name == last_name)
        pexists = db(query).select(ptable.id,
                                   limitby=(0, 1)).first()
        if pexists:
            person_id = pexists.id
        else:
            person_id = ptable.insert(first_name=first_name,
                                      middle_name=middle_name,
                                      last_name=last_name)
            s3db.update_super(ptable, dict(id=person_id))
    else:
        person_id = None
    if exists:
        # Update the existing post in place.
        post_id = exists.id
        db(post_table.id == post_id).update(title=record.title,
                                            body=body,
                                            created_on=record.date,
                                            location_id=record.location_id,
                                            person_id=person_id,
                                            )
        # Read existing Tags (which came from remote)
        ttable = db.cms_tag
        ltable = db.cms_tag_post
        query = (ltable.post_id == post_id) & \
                (ltable.mci == 1) & \
                (ltable.tag_id == ttable.id)
        rows = db(query).select(ttable.name)
        # Compare these to tags in current version of post
        old_tags = [r.name for r in rows]
        new_tags = []
        delete_tags = []
        for tag in tags:
            if tag not in old_tags:
                new_tags.append(tag)
        for tag in old_tags:
            if tag not in tags:
                delete_tags.append(tag)
        if new_tags or delete_tags:
            # Resolve all affected tag names to ids in a single query.
            lookup_tags = []
            lookup_tags.extend(new_tags)
            lookup_tags.extend(delete_tags)
            _tags = db(ttable.name.belongs(lookup_tags)).select(
                ttable.id,
                ttable.name,
                ).as_dict(key="name")
            for t in new_tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(post_id=post_id,
                              tag_id=tag_id,
                              mci=1,  # This is an imported record, not added natively
                              )
            for t in delete_tags:
                tag = _tags.get(t, None)
                if tag:
                    query = (ltable.post_id == post_id) & \
                            (ltable.tag_id == tag["id"]) & \
                            (ltable.mci == 1) & \
                            (ltable.deleted == False)
                    db(query).delete()
    else:
        # Default to 'News' series
        table = db.cms_series
        series = db(table.name == "News").select(table.id,
                                                 cache=s3db.cache,
                                                 limitby=(0, 1)).first()
        try:
            series_id = series.id
        except:
            raise KeyError("News Series not present in CMS module")
        post_id = post_table.insert(title=record.title,
                                    body=body,
                                    created_on=record.date,
                                    location_id=record.location_id,
                                    person_id=person_id,
                                    series_id=series_id,
                                    mci=1,  # This is an imported record, not added natively
                                    )
        record = dict(id=post_id)
        # update_super fills in the super-entity keys (e.g. doc_id) on record.
        s3db.update_super(post_table, record)
        # Source link
        if url:
            doc_table.insert(doc_id=record["doc_id"],
                             url=url,
                             )
        if tags:
            ttable = db.cms_tag
            ltable = db.cms_tag_post
            _tags = db(ttable.name.belongs(tags)).select(
                ttable.id,
                ttable.name,
                ).as_dict(key="name")
            for t in tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(post_id=post_id,
                              tag_id=tag_id,
                              mci=1,  # This is an imported record, not added natively
                              )
    # No Reply
    return
def parse_raw_name(name_raw: str, count: int, extract_orgs=True) -> (str, str, str, Counter):
    """
    Parses a (usually messy) raw name and returns first, middle, last names
    and a Counter of extracted positions.

    extract_orgs tries to extract organizations from the name. Defaults to
    True. Only set to False to be able to check if a name is valid (it
    prevents an infinite loop because, by default, extracting organizations
    is part of the initialization of a Person).

    :param name_raw: str
    :param count: int
    :param extract_orgs: bool
    :return: str, str, str, Counter (first, middle, last, positions Counter)
    """
    name_raw = Person.remove_privlog_info(name_raw)
    # remove JR, SR, or III if it follows this pattern: 'Chumney-RD-Jr'
    name_raw = Person.remove_jr_sr_iii(name_raw)

    # position is often attached with a dash,
    # e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
    if name_raw.find(" - ") > -1 and len(name_raw.split(' - ')) == 2:
        name_raw, extracted_position = name_raw.split(" - ")
        extracted_positions = [extracted_position.strip()]
    else:
        extracted_positions = []

    # extract positions in parens e.g. Henson, A (Chadbourne & Park)
    paren_positions = re.findall(r'\([^(]+\)', name_raw)
    for position in paren_positions:
        extracted_positions.append(position.strip(',#() '))
        name_raw = name_raw.replace(position, '')

    # Search for known raw_org strings in name_raw, extract them as
    # positions if necessary
    if extract_orgs:
        name_raw, new_positions = Person.extract_raw_org_names_from_name(name_raw)
        extracted_positions += new_positions

    # delete any leftover hashtags
    name_raw = name_raw.strip(' #')

    # Delete dashes between last name and initials
    # DUNN-W -> Dunn W
    # BUG FIX: guard the negative index -- after all the stripping above
    # name_raw can be empty or a single character, and name_raw[-2]
    # raised IndexError (the [-3] check below was already guarded).
    if len(name_raw) >= 2 and name_raw[-2] == '-':
        name_raw = name_raw[:-2] + " " + name_raw[-1:]
    # DUNN-WL -> DUNN WL
    if len(name_raw) > 2 and name_raw[-3] == '-':
        name_raw = name_raw[:-3] + " " + name_raw[-2:]

    # Parse current string using HumanName
    name = HumanName(name_raw)

    # e.g. Dunn W -> parsed as last name W. -> switch first/last
    if len(name.last) <= 2 < len(name.first):
        name.first, name.last = name.last, name.first

    # remove periods from initials
    if len(name.first) == 2 and name.first[1] == '.':
        name.first = name.first[0]
    if len(name.middle) == 2 and name.middle[1] == '.':
        name.middle = name.middle[0]

    # If first name is length 2 (Teague, CE), the two letters are most
    # likely initials.
    if len(name.middle) == 0 and len(name.first) == 2:
        name.middle = name.first[1].upper()
        name.first = name.first[0].upper()

    # If first and middle initials have periods but not spaces -> separate,
    # e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.capitalize()
    name.middle = name.middle.capitalize()

    # if multiple names are passed, they often end up in the middle name
    # e.g. 'Holtzman, A., Murray, J., Henson, A.' -> only allow one comma
    # or set to empty
    if name.middle.count(',') > 1:
        name.middle = ''
    if len(name.suffix) > 20 and name.suffix.count('.') > 2:
        name.suffix = ''
    if name.suffix:
        extracted_positions.append(name.suffix)

    # map organization names to clean official names (if they are in the
    # dict) using RAW_ORG_TO_CLEAN_ORG_DICT
    clean_orgs = []
    for raw_org in extracted_positions:
        if raw_org in RAW_ORG_TO_CLEAN_ORG_DICT:
            clean_org = RAW_ORG_TO_CLEAN_ORG_DICT[raw_org]
            if clean_org != '@skip@':
                clean_orgs.append(clean_org)
        else:
            clean_orgs.append(raw_org)
    extracted_positions = clean_orgs

    # convert mapped positions into a counter
    result_positions = Counter()
    for position in extracted_positions:
        cleaned = re.sub(r'\.', '', position)
        result_positions[cleaned.upper()] += count

    return name.first, name.middle, name.last, result_positions
# Ensure every name-component key exists so the empty-checks below are safe.
# BUG FIX: the original's `if 'firstName' not in row:` branch wrote to
# row['salutation'] (wrong key), clobbering an existing salutation whenever
# firstName was absent.
for _key in ('salutation', 'firstName', 'middleName', 'lastName',
             'nameSuffix', 'nickName'):
    if _key not in row:
        row[_key] = ''
# parse the full text name and full text address into their components and
# add them to the row. For each field, check the destination value is empty,
# as users may import component values instead of full text values.
parsedName = HumanName(row['fullName'])
ap = AddressParser()
parsedAddress = ap.parse_address(row['fullTextAddress'])
if not row['salutation']:
    row['salutation'] = parsedName.title
if not row['firstName']:
    # BUG FIX: the original assigned People.Name.first (a class attribute)
    # instead of the row, leaving firstName unfilled; the commented-out
    # line next to it showed the intended behavior.
    row['firstName'] = parsedName.first
if not row['middleName']:
    row['middleName'] = parsedName.middle
if not row['lastName']:
    row['lastName'] = parsedName.last
if not row['nameSuffix']:
    row['nameSuffix'] = parsedName.suffix
if not row['nickName']:
    row['nickName'] = parsedName.nickname
def ProcessScan(self, evt):
    # Handle a license-scan event: decode the AAMVA data, open the lookup
    # page in the browser, and populate the form fields from the scan.
    # NOTE(review): Python 2 code (print statement, urllib.urlencode).
    print "Got a scan!"
    try:
        license = self.parser.decode(evt.data)
    except aamva.ReadError as e:
        #GUI interaction must be done in a thread-safe way
        wx.CallAfter(self.ErrorMessage, 'Invalid data.\n{0}'.format(e))
        return
    # Normalize the holder's name capitalization via nameparser.
    name = HumanName("{} {}".format(xstr(license['first']).lower(),
                                    xstr(license['last']).lower()))
    name.capitalize()
    # Query string for the patron-lookup web page.
    query = {
        'firstName': name.first,
        'lastName': name.last,
        'address1': license['address'],
        'address2': xstr(license['address2']),
        'city': license['city'],
        'state': license['state'],
        'postalCode': xstr(license['ZIP'])[0:5]+"-"+xstr(license['ZIP'])[5:],
        'country': license['country'],
        'birthdate': license['dob']
    }
    params = urllib.urlencode(query)
    # Expired IDs get a warning dialog and open without stealing focus.
    if license['expiry'] <= date.today():
        wx.CallAfter(self.InfoMessage,
                     str('ID expired {}'.format(license['expiry'])))
        webbrowser.open(BASE_URL + "?" + params, new=0, autoraise=False)
    else:
        webbrowser.open(BASE_URL + "?" + params, new=0, autoraise=True)
    #clear form
    self.clearForm()
    #set the fields
    self.NameText.SetValue(license['first'])
    if license['middle'] is not None:
        self.MiddleText.SetValue(license['middle'])
    self.SurnameText.SetValue(license['last'])
    self.DOBText.SetValue(xstr(license['dob']))
    self.AddressText.SetValue(license['address'])
    self.Address2Text.SetValue(xstr(license['address2']))
    self.CityText.SetValue(license['city'])
    self.StateText.SetValue(license['state'])
    self.ZIPText.SetValue(xstr(license['ZIP'])[0:5]+"-"+xstr(license['ZIP'])[5:])
    self.IINText.SetValue(license['IIN'])
    self.LicenseNoText.SetValue(license['license_number'])
    self.IssuedText.SetValue(xstr(license['issued']))
    self.ExpiresText.SetValue(xstr(license['expiry']))
    try:
        self.CountryText.SetValue(license['country'])
    except KeyError:
        self.CountryText.SetValue("???")
    if license['sex'] == aamva.MALE:
        self.MaleRadio.SetValue(True)
    elif license['sex'] == aamva.FEMALE:
        self.FemaleRadio.SetValue(True)
    self.HeightText.SetValue(xstr(license['height']))
    self.WeightText.SetValue(xstr(license['weight']))
    # "???" stands in for hair/eye color when the scan omits them.
    if license['hair'] is None:
        self.HairText.SetValue("???")
    else:
        self.HairText.SetValue(license['hair'])
    if license['eyes'] is None:
        self.EyesText.SetValue("???")
    else:
        self.EyesText.SetValue(license['eyes'])
    self.EndorsementsText.SetValue(license['endorsements'])
    self.RestrictionsText.SetValue(license['restrictions'])
class ParsedName(object):
    """Class for representing a name.

    After construction, the instance exposes the fields exposed by `HumanName`
    instance, i.e. `title`, `first`, `middle`, `last`, `suffix`.
    """
    constants = _prepare_nameparser_constants()
    """The default constants configuration for `HumanName` to use for parsing all names."""

    def __init__(self, name, constants=None):
        """Create a ParsedName instance.

        Args:
            name (Union[str, HumanName]): The name to be parsed (must be non empty nor None).
            constants (:class:`nameparser.config.Constants`): Configuration for `HumanName`
                instantiation. (Can be None, if provided it overwrites the default one
                generated in :method:`prepare_nameparser_constants`.)
        """
        if not constants:
            constants = ParsedName.constants

        if isinstance(name, HumanName):
            self._parsed_name = name
        else:
            self._parsed_name = HumanName(name, constants=constants)
        self._parsed_name.capitalize()

    def __iter__(self):
        # BUGFIX: __iter__ must return an *iterator*.  The original returned
        # the HumanName instance itself, which is iterable but not an iterator
        # (no __next__), so ``for part in parsed_name`` raised TypeError.
        return iter(self._parsed_name)

    def __len__(self):
        return len(self._parsed_name)

    def __repr__(self):
        return repr(self._parsed_name)

    def __str__(self):
        return str(self._parsed_name)

    @property
    def title(self):
        return self._parsed_name.title

    @property
    def first(self):
        return self._parsed_name.first

    @property
    def first_list(self):
        return self._parsed_name.first_list

    @property
    def middle(self):
        return self._parsed_name.middle

    @property
    def middle_list(self):
        return self._parsed_name.middle_list

    @property
    def last(self):
        return self._parsed_name.last

    @property
    def last_list(self):
        return self._parsed_name.last_list

    @property
    def suffix(self):
        return self._parsed_name.suffix

    @property
    def suffix_list(self):
        return self._parsed_name.suffix_list

    @classmethod
    def loads(cls, name):
        """Load a parsed name from a string.

        Raises:
            TypeError: when name isn't a type of `six.string_types`.
            ValueError: when name is empty or None.
        """
        if not isinstance(name, six.string_types):
            raise TypeError(
                u'arguments to {classname} must be of type {string_types}'.
                format(classname=cls.__name__,
                       string_types=repr(six.string_types)))
        if not name or name.isspace():
            raise ValueError('name must not be empty')
        return cls(name)

    def dumps(self):
        """Dump the name to string, after normalizing it."""
        def _is_initial(author_name):
            return len(author_name) == 1 or u'.' in author_name

        def _ensure_dotted_initials(author_name):
            # "J" -> "J." ; leave multi-letter or already-dotted parts alone
            if _is_initial(author_name) \
                    and u'.' not in author_name:
                seq = (author_name, u'.')
                author_name = u''.join(seq)
            return author_name

        def _ensure_dotted_suffixes(author_suffix):
            if u'.' not in author_suffix:
                seq = (author_suffix, u'.')
                author_suffix = u''.join(seq)
            return author_suffix

        def _is_roman_numeral(suffix):
            """Controls that the user's input only contains valid roman numerals"""
            valid_roman_numerals = [
                u'M', u'D', u'C', u'L', u'X', u'V', u'I', u'(', u')'
            ]
            return all(letters in valid_roman_numerals
                       for letters in suffix.upper())

        # Create first and middle
        first_name = _ensure_dotted_initials(self.first)
        middle_name = _ensure_dotted_initials(self.middle)

        # Two initials are glued together ("J.K."), otherwise space-separated
        if _is_initial(first_name) and _is_initial(middle_name):
            normalized_names = u'{first_name}{middle_name}'
        else:
            normalized_names = u'{first_name} {middle_name}'

        normalized_names = normalized_names.format(
            first_name=first_name,
            middle_name=middle_name,
        )

        # Roman-numeral suffixes are upper-cased, anything else gets a dot
        if _is_roman_numeral(self.suffix):
            suffix = self.suffix.upper()
        else:
            suffix = _ensure_dotted_suffixes(self.suffix)

        final_name = u', '.join(
            part for part in (self.last, normalized_names.strip(), suffix)
            if part)

        # Replace unicode curly apostrophe to normal apostrophe.
        final_name = final_name.replace(u'’', '\'')

        return final_name

    @classmethod
    def from_parts(cls, first=None, last=None, middle=None, suffix=None, title=None):
        """Build a ParsedName from already-separated name parts."""
        name = HumanName()
        name.first = first
        name.middle = middle
        name.last = last
        name.suffix = suffix
        name.title = title
        return ParsedName(name)
def resolve(self, match_entity=None):
    """
    Associate each person name of self.person_name_list to an entity

    :param match_entity: working variable only; it is overwritten before use
        in every step.  NOTE(review): presumably should be a local, not a
        parameter — confirm no caller passes it.
    :return: dict idx_of_person_name -> entity, list of indexes that have to be discarded
        because matched entity is None
    """
    # PRE-PROCESSING STEP :
    # each person name is parsed using human name parser
    # each time we succeed to associate a human_name to an entity, we will remove it from this list
    human_name_list = [
        (idx, self.name_preprocessing(person_name))
        for idx, person_name in enumerate(self.person_name_list)
    ]
    # some name will contain just a title. For instance 'Sir' alone. It will be detected as a character name
    # by BERT NER but we won't try to associate it with an entity.
    # by default, we will associate such terms with a unique "NONE" entity
    remaining_list = []
    empty_entity = Entity(HumanName("NONE"))
    for idx, human_name in human_name_list:
        if human_name.first == "" and human_name.last == "":
            self.entities_match[idx] = empty_entity
        else:
            remaining_list.append((idx, human_name))
            # "``" is a tokenization artifact; clear it
            if human_name.first == "``":
                human_name.first = ""
                # NOTE(review): every other branch stores an Entity in
                # entities_match, but this stores the HumanName itself —
                # looks like a bug, confirm against Entity's consumers.
                self.entities_match[idx] = human_name
    human_name_list = remaining_list
    # STEP 1 :
    # for each human_name that are complets ie: that contains a title, a first name and last name
    # -> for instance: Miss Elizabeth Bennet
    # if there already exists an entity which has this first and last name: associate the human_name to this entity
    # else : create a new entity
    print(
        "Co-ref step 1 : associate character name that have title, first name and last name to entity"
    )
    remaining_list = [
    ]  # to store the human name we have not succeed to bind to an entity
    for idx, human_name in tqdm(human_name_list):
        if human_name.title != "" and human_name.first != "" and human_name.last != "":
            try:
                match_entity = [
                    entity for entity in self.entity_set
                    if human_name.first == entity.human_name.first and
                    human_name.last == entity.human_name.last
                ][0]
            except IndexError:
                # no entity with this exact first+last name yet
                match_entity = None
            if match_entity is None:
                self.create_entity(idx, human_name)
            else:
                self.entities_match[idx] = match_entity
        else:
            remaining_list.append((idx, human_name))
    human_name_list = remaining_list
    # STEP 2 :
    # for each remaining human_names that contain at least first name and last name
    # -> for instance : Elizabeth Bennet
    # if there already exists an entity which has this first and last name: associate the human_name to this entity
    # else : create a new entity
    print(
        "Co-ref step 2 : associate character name that have just first name and last name to entity"
    )
    remaining_list = []
    for idx, human_name in tqdm(human_name_list):
        if human_name.first != "" and human_name.last != "":
            try:
                match_entity = [
                    entity for entity in self.entity_set
                    if human_name.first == entity.human_name.first and
                    human_name.last == entity.human_name.last
                ][0]
            except IndexError:
                match_entity = None
            if match_entity is None:
                self.create_entity(idx, human_name)
            else:
                self.entities_match[idx] = match_entity
        else:
            remaining_list.append((idx, human_name))
    human_name_list = remaining_list
    # STEP 3 :
    # for each remaining human_names that contain a title and first name
    # -> for instance : Miss Bennet
    # if there already exists entities which contains this first name and has the same genre (ie: Elizabeth Bennet)
    # associate the human_name to the most common entity among those entities
    # else : create a new entity
    print(
        "Co-ref step 3 : associate character name that have just title and first name to entity"
    )
    remaining_list = []
    for idx, human_name in tqdm(human_name_list):
        if human_name.title != "" and human_name.first != "":
            possible_entities = []
            for entity in self.entity_set:
                if entity.human_name.first == human_name.first:
                    # an unknown genre on either side is treated as compatible
                    if self.genre_of(
                            human_name
                    ) == Genre.UKN or entity.genre == Genre.UKN:
                        possible_entities.append(entity)
                    else:
                        if entity.genre == self.genre_of(human_name):
                            possible_entities.append(entity)
            match_entity = self.most_frequent_entity(possible_entities)
            if match_entity is None:
                self.create_entity(idx, human_name)
            else:
                self.entities_match[idx] = match_entity
        else:
            remaining_list.append((idx, human_name))
    human_name_list = remaining_list
    # STEP 4 :
    # for each remaining human_names that contain a title and last name
    # -> for instance : Mrs. Bennet
    # if there already exists entities which contains this last name and has the same genre (ie: Elizabeth Bennet)
    # associate the human_name to the most common entity among those entities
    # else : create a new entity
    print(
        "Co-ref step 4 : associate character name that have just title and last name to entity"
    )
    remaining_list = []
    for idx, human_name in tqdm(human_name_list):
        if human_name.title != "" and human_name.last != "":
            possible_entities = []
            for entity in self.entity_set:
                if entity.human_name.last == human_name.last:
                    if self.genre_of(
                            human_name
                    ) == Genre.UKN or entity.genre == Genre.UKN:
                        possible_entities.append(entity)
                    else:
                        if entity.genre == self.genre_of(human_name):
                            possible_entities.append(entity)
            match_entity = self.most_frequent_entity(possible_entities)
            if match_entity is None:
                self.create_entity(idx, human_name)
            else:
                self.entities_match[idx] = match_entity
        else:
            remaining_list.append((idx, human_name))
    human_name_list = remaining_list
    # STEP 5 :
    # At this step, the human_name_list only contain first name
    # Note that this first could also corresponding to last_name, indeed both Duval or Alexandre will be parsed as
    # HumanName(first='Duval') , HumanName(first='Alexandre') by the HumanParser
    #
    # so for each of this human_name we look in the list of entities for the most common entities which contain
    print(
        "Co-ref step 5 : associate character name that have just first name or last name to entity"
    )
    for idx, human_name in tqdm(human_name_list):
        # a lone name part may match either the first or the last name of an entity
        if human_name.first == "":
            possible_entities = [
                entity for entity in self.entity_set
                if entity.human_name.last == human_name.last or
                entity.human_name.first == human_name.last
            ]
        if human_name.last == "":
            possible_entities = [
                entity for entity in self.entity_set
                if entity.human_name.first == human_name.first or
                entity.human_name.last == human_name.first
            ]
        match_entity = self.most_frequent_entity(possible_entities)
        if match_entity is None:
            self.create_entity(idx, human_name)
        else:
            self.entities_match[idx] = match_entity
    return self.entities_match
def latex(self):
    """Render latex template.

    Builds a grant progress report for the single grant named in
    ``self.rc.grants``: collects the grant's projecta, publications,
    presentations, participants and collaborators for the reporting
    period, then renders ``grantreport.txt``.

    Raises:
        RuntimeError: if no grant, or more than one grant, is specified.
    """
    rc = self.rc
    if not rc.grants:
        raise RuntimeError(
            "Error: no grant specified. Please rerun specifying a grant")
    if isinstance(rc.grants, str):
        rc.grants = [rc.grants]
    if len(rc.grants) > 1:
        # BUGFIX: the two adjacent literals previously concatenated to
        # "...rerun withonly a single grant." — a space was missing.
        raise RuntimeError(
            "Error: more than one grant specified. Please rerun with "
            "only a single grant.")
    grant_id = rc.grants[0]
    grant = fuzzy_retrieval(self.gtx['grants'], ['_id', "alias", "name"],
                            grant_id)
    grant_dates = get_dates(grant)

    # Convert Date Strings to Datetime Objects
    if rc.from_date:
        rp_start_date = date_parser.parse(rc.from_date).date()
    else:
        rp_start_date = grant_dates.get("begin_date")
        print(
            f"INFO: no begin-date specified. running report from the beginning "
            f"of the grant period ({rp_start_date})")
    if rc.to_date:
        rp_end_date = date_parser.parse(rc.to_date).date()
    else:
        rp_end_date = min([date.today(), grant_dates.get("end_date")])
        print(
            "INFO: no end-date specified for the reporting period. Running "
            "report up to the earlier of the end of the grant, or today "
            f"({rp_end_date}).")
    report_dates = {'begin_date': rp_start_date, 'end_date': rp_end_date}
    # BUGFIX: missing space previously printed "...the periodfrom ..."
    print(f"INFO: generating report for grant {grant_id} for the period "
          f"from {rp_start_date} to {rp_end_date})")

    # Get prum associated to grant and active during reporting period
    # institutions_coll = [inst for inst in self.gtx["institutions"]]
    institutions_coll = self.gtx["institutions"]
    grant_prums = [
        prum for prum in self.gtx['projecta']
        if grant_id in prum.get('grants', []) and
        "checklist" not in prum.get("deliverable").get("scope")
    ]
    # for prum in self.gtx['projecta']:
    #     if grant_name in prum['grants']:
    #         begin_date = get_dates(prum).get('begin_date')
    #         due_date = get_due_date(prum['deliverable'])
    #         # if projectum was finished during reporting period or is still current
    #         # some projectum don't have an "end date", but all projecta have a deliverable
    #         # due_date
    #         if (rp_start_date <= due_date <= rp_end_date and prum['status'] is "finished") or is_current(prum):
    #             grant_prums.append(prum)
    # Get people associated with grant
    grant_prums_finished_this_period = [
        prum for prum in grant_prums
        if is_current(report_dates, get_dates(prum).get('end_date'))
    ]
    grant_prum_leads = list(set([prum['lead'] for prum in grant_prums]))
    grant_prum_collaborators = list(
        set([
            collab for prum in grant_prums
            for collab in prum.get('collaborators', [])
        ]))
    grant_prum_group_members = list(
        set([
            grp_mbr for prum in grant_prums
            for grp_mbr in prum.get('group_members', [])
        ]))
    grant_people = grant_prum_leads
    # Accomplishments
    major_activities = []
    significant_results = []
    for prum in grant_prums:
        if prum['status'] == "finished":
            continue
        else:
            major_activities.append(prum)
    for prum in grant_prums_finished_this_period:
        significant_results.append(prum)
    # Opportunities for Training and Professional Development
    training_and_professional_development = []
    # presentations
    for id in grant_people:
        training_and_professional_development.extend(
            filter_presentations(self.gtx["people"],
                                 self.gtx["presentations"],
                                 institutions_coll,
                                 id,
                                 types=["all"],
                                 since=rp_start_date,
                                 before=rp_end_date,
                                 statuses=["accepted"]))
    # thesis defendings
    # how do i access people.yml in rg-db-public vs the people.yml file in rg-db-group?
    # defended_theses = []
    # for id in grant_people:
    #     for prsn in self.gtx['people']:
    #         if prsn["_id"] != id:
    #             continue
    #         else:
    #             person = prsn
    #     for education in person['education']:
    #         edu_dates = get_dates(education)
    #         if 'phd' in education['degree'].lower() and 'columbia' in education['institution'].lower() and \
    #                 rp_start_date.year <= edu_dates.get('end_date', edu_dates['date']).year <= rp_end_date.year:
    #             defended_theses.append(id)

    # Products
    # need rg-db-public's citation.yml
    # publications = filter_publications(self.gtx["citations"],
    ##                                   set(grant_people),
    #                                    since=rp_start_date,
    #                                    before=rp_end_date)
    publications = [
        publ for publ in self.gtx["citations"]
        if grant_id in publ.get("grant", "")
    ]
    for publ in publications:
        formatted_authors = [
            HumanName(name).full_name for name in publ.get("authors", [])
        ]
        publ["authors"] = formatted_authors

    # Participants/Organizations
    participants = []
    for person in self.gtx["people"]:
        months_on_grant, months_left = self.months_on(
            grant_id, person, rp_start_date, rp_end_date)
        if months_on_grant > 0:
            participants.append({
                "name": person.get("name"),
                "email": person.get("email"),
                "position": person.get('position'),
                "months_on_grant": int(round(months_on_grant, 0))
            })

    collaborators = {}
    missing_contacts = []
    for id in grant_prum_collaborators:
        for contact in self.gtx["contacts"]:
            if contact["_id"] == id:
                name = contact.get("name")
                aka = contact.get("aka")
                institution_id = contact.get("institution")
                institution = fuzzy_retrieval(institutions_coll,
                                              ["name", "aka", "_id"],
                                              institution_id)
                if institution:
                    inst_name = institution.get("name")
                else:
                    print(
                        f"WARNING: institution {institution_id} not found "
                        f"in institutions collection")
                    inst_name = institution_id
                collaborators[id] = {
                    "aka": aka,
                    "name": name,
                    "institution": inst_name
                }
    missing_contacts = [
        id for id in grant_prum_collaborators if not collaborators.get(id)
    ]
    missing_contacts = list(set(missing_contacts))
    for person_id in missing_contacts:
        print(f"WARNING: contact "
              f"{person_id} not found in contacts collection")

    # Impacts
    begin_date_str = rp_start_date.isoformat()
    end_date_str = rp_end_date.isoformat()
    self.render(
        "grantreport.txt",
        f"{grant_id}_report_{begin_date_str}_{end_date_str}.txt",
        begin_date=begin_date_str,
        end_date=end_date_str,
        majorActivities=major_activities,
        significantResults=significant_results,
        trainingAndProfessionalDevelopment=
        training_and_professional_development,
        # defendedTheses=defended_theses,
        products=publications,
        grantPeople=grant_people,
        participants=participants,
        collaborators=collaborators,
        hline=
        "------------------------------------------------------------------------------"
    )
def scrub_name(name):
    """Strip commas and emoji from *name*, then parse it into a HumanName."""
    without_commas = name.replace(',', '')
    without_emoji = demoji.replace(without_commas, '')
    return HumanName(without_emoji)
def get_last_name(author):
    """Return the last name of the first author in an "A and B"-style string.

    :param author: author string, possibly several names joined by "and"
    :return: last name of the first listed author
    """
    import re
    # BUGFIX: the original used author.split("and"), which splits on the
    # substring "and" anywhere — e.g. "Sandra Smith" became "S". Split only
    # on the whole word "and" (the author separator).
    first_author = re.split(r"\band\b", author)[0]
    return HumanName(first_author).last
def last_first(self):
    """Return the name formatted as "Last, First Middle" ('' when unset)."""
    if not self.name:
        return ''
    parsed = HumanName(self.name)
    formatted = '{0.last}, {0.first} {0.middle}'.format(parsed)
    return formatted.strip()
def _generate_lastName(self):
    """Populate ``self.lastName`` from the first info node of ``self.sec``."""
    info_node = extract(RULES["info"], self.sec, multi=True)[0]
    markup = str(etree.tostring(info_node))
    full_name = extract("//strong/a/text()", markup)
    self.lastName = HumanName(full_name).last
def test_capitalize_diacritics(self):
    # capitalize() must uppercase initial letters that carry diacritics
    # (U+00E4 LATIN SMALL LETTER A WITH DIAERESIS -> U+00E4 stays, leading
    # 'm' -> 'M').
    hn = HumanName(u'matth\xe4us schmidt')
    hn.capitalize()
    self.m(unicode(hn), u'Matth\xe4us Schmidt', hn)
def _generate_lastName(self):
    """Populate ``self.lastName`` from the parsed "name" field, if present.

    Leaves ``self.lastName`` untouched when "name" is missing or falsy,
    exactly like the original nested key/truthiness checks.
    """
    # dict.get collapses the redundant `"name" in self.parse_data.keys()`
    # membership test and the truthiness check into one lookup.
    name = self.parse_data.get("name")
    if name:
        self.lastName = HumanName(name).last
def test123(self):
    # capitalize() must not lowercase a name that is already correctly
    # capitalized ("Maclaine" stays "Maclaine", not "MacLaine").
    hn = HumanName('Shirley Maclaine')
    hn.capitalize()
    self.m(str(hn), 'Shirley Maclaine', hn)
def extract_raw_org_names_from_name(name_raw):
    """
    Finds raw org names like "B&W" in a name string, standarizes them (e.g. to
    "Brown & Williamson") and returns the name without that raw org name + extracted positions

    :param name_raw: str
    :return: str (name_raw without the raw org name), list of str (extracted clean organization names)
    """
    extracted_positions = []
    for raw_org, clean_org in RAW_ORG_TO_CLEAN_ORG_DICT.items():
        # keep deleting occurrences of raw_org until none are left
        while True:
            search_hit = None
            # this is a bit of an ugly hack to get the last (rather than the first) search hit
            # for a string: we iterate over all matches and the last one gets stored in
            # search_hit
            for search_hit in re.finditer(r'\b' + raw_org + r'\b', name_raw):
                pass
            if not search_hit:
                break
            if len(raw_org) >= 3:
                # 3+ character org tokens are unambiguous: always remove
                name_raw = name_raw[0:search_hit.start()] + name_raw[search_hit.end():]
                if not clean_org == "@skip@":
                    extracted_positions.append(clean_org)
            elif len(raw_org) == 2:
                name_raw_test = name_raw[0:search_hit.start()] + name_raw[search_hit.end():]
                # test if deleted, there exists first & middle name
                name = HumanName(name_raw_test)
                # if first & middle name do not exist after deletion, the deleted org might
                # actually be initials, so ignore the match
                if not name.first and not name.middle:
                    break
                # last names without middle names ("TEMKO") get interpreted as first names
                # without last names. Skip those cases
                if not name.last:
                    break
                # if not, do extract raw_org
                extracted_positions.append(clean_org)
                name_raw = name_raw_test
    name_raw = name_raw.strip(', ')

    # more adventurous: try to extract organizations we don't have in the dictionary
    # do this only if a) the name is currently not valid (i.e. it has strange characters like
    # commas in the last name) and b) extracting an org makes it valid,
    # e.g. 'HOLMAN RT, DEUEL CONFERENCE ON LIPIDS'
    if len(name_raw) > 0:
        first, middle, last, _ = Person.parse_raw_name(name_raw, 0, extract_orgs=False)
        if not Person(last=last, middle=middle, first=first).check_if_this_person_looks_valid():
            # treat everything after the first comma as a candidate org name
            search_hit = re.search(',.+$', name_raw)
            if search_hit:
                extracted_position = name_raw[search_hit.start():].strip(', ')
                name_raw_without_org = name_raw[0:search_hit.start()] + name_raw[
                    search_hit.end():]
                # if raw name becomes valid after extracting the org, then we add it to the orgs
                # otherwise, we skip it
                first, middle, last, _ = Person.parse_raw_name(name_raw_without_org, 0,
                                                               extract_orgs=False)
                if Person(last=last, middle=middle, first=first).check_if_this_person_looks_valid():
                    extracted_positions.append(extracted_position)
                    name_raw = name_raw_without_org
    name_raw = name_raw.strip(', ')
    return name_raw, extracted_positions
def get_names_and_genders_from_journals():
    """
    Creates a csv that identifies names that we need to clean by hand.

    Scans post-1950 English research articles in the general journals,
    counts (first, last) author pairs, guesses each author's gender with
    two independent guessers, and writes rows (with a human_check_necessary
    flag) to data/ambiguous_author_gender.csv.

    :return: None
    """
    authors_counter = Counter()
    journalc = Counter()
    db = sqlite3.connect(Path('data', 'JSTOR_full_cleaned.db'))
    cur = db.cursor()
    cur2 = db.cursor()
    cur.execute('''select journal, ID_doi, ID_jstor from article_pub_info
                              where article_type="research article" and year > 1950
                              and language="eng";''')
    rows = cur.fetchall()
    for article_id, (journal, ID_doi, ID_jstor) in enumerate(rows):
        print(article_id, len(rows))
        if journal in GENERAL_JOURNALS:
            journalc[journal] += 1
            # SECURITY/ROBUSTNESS FIX: use parameterized queries instead of
            # f-string interpolation — ids containing quotes would break (or
            # inject into) a hand-built SQL string.
            if ID_doi:
                cur2.execute(
                    'SELECT name, surname, role FROM contributors WHERE ID_doi = ?',
                    (ID_doi,))
            elif ID_jstor:
                cur2.execute(
                    'SELECT name, surname, role FROM contributors WHERE ID_jstor = ?',
                    (ID_jstor,))
            else:
                raise ValueError("NO id for ", journal)
            article_authors = cur2.fetchall()
            for first_name, last_name, role in article_authors:
                # some last names contain commas, which trip up the gender guesser
                last_name = last_name.strip(',')
                authors_counter[(first_name, last_name)] += 1

    authors = []
    for author in authors_counter:
        first_name, last_name = author
        human_name = HumanName(f'{last_name}, {first_name}')
        guess_census = guess_gender_census(human_name)
        guess_first_middle_name_international = guess_gender_with_middle_name_and_international_names(
            human_name)
        # only skip the manual check when both guessers agree on a
        # definite male/female answer
        human_check_necessary = True
        if (guess_census == guess_first_middle_name_international and
                (guess_census == 'male' or guess_census == 'female')):
            human_check_necessary = False
        authors.append({
            'first_name': first_name,
            'last_name': last_name,
            'count': authors_counter[author],
            'prob_male_census': guess_gender_census(human_name,
                                                    return_type='probability_male'),
            'guess_census': guess_census,
            # 'guess_first_name_usa': guess_first_name_usa,
            'guess_first_middle_name_international': guess_first_middle_name_international,
            'human_check_necessary': human_check_necessary
        })
    df = pd.DataFrame(authors)
    df.to_csv(Path('data', 'ambiguous_author_gender.csv'), encoding='utf8')
'suffix', ] for field in new_field_list: sql_query = ''' ALTER TABLE `%s_raw_contribs` ADD %s TEXT ''' % ( date_stamp, field) c.execute(sql_query) except: print 'fields already exist' sql_select_command = ''' SELECT rowid, name_of_contributor FROM `%s_raw_contribs` WHERE employer <> '' AND occupation <> '' ''' % date_stamp # every sqlite table automatically has a unique row id you can use execute_query = c.execute(sql_select_command) rows = execute_query.fetchall() for row in rows: row_id = row[0] name = row[ 1] # python lists start counting at 0 and the full name of the contributor is the second column in the sql select statement parsed_name = HumanName(name) sql_command = ''' UPDATE `%s_raw_contribs` SET title = '%s', fname = '%s', middle = '%s', lname = '%s', suffix = '%s' WHERE rowid = %s ''' % ( date_stamp, parsed_name.title, parsed_name.first, parsed_name.middle, parsed_name.last, parsed_name.suffix, row_id, ) c.execute(sql_command) sql_command = "SELECT rowid, name_of_contributor, title, fname, middle, lname, suffix FROM `%s_raw_contribs`;" % ( date_stamp)
def human_to_csl(name):
    """Convert HumanName to CSL-formatted JSON.

    Args:
        name : HumanName or str / unicode
    Returns:
        CSL-formatted JSON
    Examples:
        >>> csl = human_to_csl('Rafael Nadal')
        >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
        True
        >>> csl = human_to_csl(HumanName('Rafael Nadal'))
        >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
        True
        >>> csl = human_to_csl(HumanName('George HW de Bush'))
        >>> csl == {'given' : 'George H. W.', 'family' : 'de Bush'}
        True
        >>> csl = human_to_csl('Eisenhower, I')
        >>> csl == {'given' : 'I.', 'family' : 'Eisenhower'}
        True
        >>> csl = human_to_csl('Eisenhower, V')
        >>> csl == {'given' : 'V.', 'family' : 'Eisenhower'}
        True
    """
    # Accept plain strings as well as HumanName instances
    if not isinstance(name, HumanName):
        name = HumanName(name)

    # Fix: nameparser treats HumanName('Eisenhower, I') as
    # {first : 'Eisenhower', suffix : 'I'} — swap the parts back when the
    # suffix is a lone roman-numeral-style initial.
    if re.search('^[IV]\.*$', name.suffix):
        name.last, name.first, name.suffix = name.first, name.suffix, ''

    # Fold the middle name into the given name
    if name.middle:
        name.first += ' ' + name.middle

    # Map HumanName fields onto CSL keys, applying any per-field transform
    csl_data = {}
    for lookup, spec in human_to_csl_map.items():
        value = getattr(name, spec['field'])
        if not value:
            continue
        transform = spec.get('fun', I)
        csl_data[lookup] = transform(value)
    return csl_data
def parse_rss_2_cap(message):
    """
    Parse RSS Feeds into the CAP Module

    :param message: msg_rss row reference carrying ``message_id``
    :return: None (imports/updates CAP alert records as a side effect)
    """
    db = current.db
    s3db = current.s3db
    table = s3db.msg_rss
    message_id = message.message_id
    record = db(table.message_id == message_id).select(
        table.id,
        table.channel_id,
        table.title,
        table.from_address,
        table.body,
        table.date,
        table.location_id,
        table.author,
        limitby=(0, 1)).first()
    if not record:
        return

    pstable = s3db.msg_parsing_status
    # not adding (pstable.channel_id == record.channel_id) to query
    # because two channels (http://host.domain/eden/cap/public.rss and
    # (http://host.domain/eden/cap/alert.rss) may contain common url
    # eg. http://host.domain/eden/cap/public/xx.cap
    pquery = (pstable.message_id == message_id)
    prows = db(pquery).select(pstable.id, pstable.is_parsed)
    for prow in prows:
        if prow.is_parsed:
            # already parsed by another channel — nothing to do
            return

    alert_table = s3db.cap_alert
    info_table = s3db.cap_info

    # Is this an Update or a Create?
    # @ToDo: Use guid?
    # Use Body
    body = record.body or record.title
    query = (info_table.description == body)
    exists = db(query).select(info_table.id, limitby=(0, 1)).first()

    author = record.author
    if author:
        ptable = s3db.pr_person
        # https://code.google.com/p/python-nameparser/
        from nameparser import HumanName
        name = HumanName(author)
        first_name = name.first
        middle_name = name.middle
        last_name = name.last
        # reuse an existing person record when the parsed name matches
        query = (ptable.first_name == first_name) & \
                (ptable.middle_name == middle_name) & \
                (ptable.last_name == last_name)
        pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
        if pexists:
            person_id = pexists.id
        else:
            person_id = ptable.insert(first_name=first_name,
                                      middle_name=middle_name,
                                      last_name=last_name)
            s3db.update_super(ptable, dict(id=person_id))
    else:
        person_id = None

    if exists:
        # @ToDo: Use XSLT
        info_id = exists.id
        db(info_table.id == info_id).update(
            headline=record.title,
            description=body,
            created_on=record.date,
            #location_id = record.location_id,
            #person_id = person_id,
        )
    else:
        # Embedded link
        url = record.from_address
        import_xml = s3db.resource("cap_alert").import_xml
        stylesheet = os.path.join(current.request.folder, "static", "formats",
                                  "cap", "import.xsl")
        try:
            file = fetch(url)
        except HTTPError as e:
            # fetch failed — retry with HTTP Basic auth from the channel config
            import base64
            rss_table = s3db.msg_rss_channel
            query = (rss_table.channel_id == record.channel_id)
            channel = db(query).select(rss_table.date,
                                       rss_table.etag,
                                       rss_table.url,
                                       rss_table.username,
                                       rss_table.password,
                                       limitby=(0, 1)).first()
            username = channel.username
            password = channel.password
            if e.code == 401 and username and password:
                request = urllib2.Request(url)
                base64string = base64.encodestring("%s:%s" % (username,
                                                              password))
                request.add_header("Authorization",
                                   "Basic %s" % base64string)
            else:
                request = None
            try:
                file = urlopen(request).read() if request else fetch(url)
            except HTTPError as e:
                # Check if there are links to look into
                ltable = s3db.msg_rss_link
                query_ = (ltable.rss_id == record.id) & (ltable.deleted != True)
                rows_ = db(query_).select(ltable.type, ltable.url)
                url_format = "{uri.scheme}://{uri.netloc}/".format
                url_domain = url_format(uri=urlparse.urlparse(url))
                for row_ in rows_:
                    url = row_.url
                    if url and row_.type == "application/cap+xml" and \
                       url_domain == url_format(uri=urlparse.urlparse(url)):
                        # Same domain, so okey to use same username/pwd combination
                        if e.code == 401 and username and password:
                            request = urllib2.Request(url)
                            request.add_header("Authorization",
                                               "Basic %s" % base64string)
                        else:
                            request = None
                        try:
                            file = urlopen(
                                request).read() if request else fetch(url)
                        except HTTPError as e:
                            current.log.error(
                                "Getting content from link failed: %s" % e)
                        else:
                            # Import via XSLT
                            import_xml(StringIO(file),
                                       stylesheet=stylesheet,
                                       ignore_errors=True)
            else:
                # Import via XSLT
                import_xml(StringIO(file),
                           stylesheet=stylesheet,
                           ignore_errors=True)
        else:
            # Public Alerts
            # eg. http://host.domain/eden/cap/public/xx.cap
            # Import via XSLT
            import_xml(StringIO(file),
                       stylesheet=stylesheet,
                       ignore_errors=True)

    # No Reply
    return
def latex(self):
    """Render latex template.

    For every group, merges proposals and grants, filters them down to the
    PI's current and pending awards, and renders/compiles a
    ``current-pending-<group>-<lastname>`` report.
    """
    for group in self.gtx["groups"]:
        grp = group["_id"]
        pi = fuzzy_retrieval(self.gtx["people"], ["aka", "name"],
                             group["pi_name"])
        pinames = pi["name"].split()
        piinitialslist = [i[0] for i in pinames]
        pi['initials'] = "".join(piinitialslist).upper()

        grants = merge_collections(self.gtx["proposals"], self.gtx["grants"],
                                   "proposal_id")
        for g in grants:
            print(g["_id"])
            g['year'] = None
            g['month'] = None
            # grants missing a begin date get a far-past sentinel
            g['end_date'] = get_dates(g).get('end_date')
            g['begin_date'] = get_dates(g).get('begin_date',
                                               dt.date(1900, 1, 2))
            for person in g["team"]:
                rperson = fuzzy_retrieval(self.gtx["people"], ["aka", "name"],
                                          person["name"])
                if rperson:
                    person["name"] = rperson["name"]
            if g.get('budget'):
                amounts = [i.get('amount') for i in g.get('budget')]
                g['subaward_amount'] = sum(amounts)

        current_grants = [dict(g) for g in grants if is_current(g)]
        current_grants, _, _ = filter_grants(current_grants, {pi["name"]},
                                             pi=False, multi_pi=True)
        for g in current_grants:
            if g.get('budget'):
                amounts = [i.get('amount') for i in g.get('budget')]
                g['subaward_amount'] = sum(amounts)

        pending_grants = [
            g for g in self.gtx["proposals"] if is_pending(g["status"])
        ]
        for g in pending_grants:
            for person in g["team"]:
                rperson = fuzzy_retrieval(self.gtx["people"], ["aka", "name"],
                                          person["name"])
                if rperson:
                    person["name"] = rperson["name"]
        pending_grants, _, _ = filter_grants(pending_grants, {pi["name"]},
                                             pi=False, multi_pi=True)
        print([g.get('begin_date') for g in pending_grants])
        grants = pending_grants + current_grants
        for grant in grants:
            # template expects US-style M/D/Y strings
            grant.update(
                award_start_date="{}/{}/{}".format(
                    grant.get("begin_date").month,
                    grant.get("begin_date").day,
                    grant.get("begin_date").year,
                ),
                award_end_date="{}/{}/{}".format(
                    grant.get("end_date").month,
                    grant.get("end_date").day,
                    grant.get("end_date").year,
                ),
            )
        # drop current grants whose cpp_info does not flag them for inclusion
        badids = [
            i["_id"] for i in current_grants
            if not i.get('cpp_info').get('cppflag', "")
        ]
        # NOTE(review): `iter` shadows the builtin; the copy exists so we can
        # remove from current_grants while iterating.
        iter = copy(current_grants)
        for grant in iter:
            if grant["_id"] in badids:
                current_grants.remove(grant)
        piname = HumanName(pi["name"])
        outfile = "current-pending-{}-{}".format(grp, piname.last.lower())
        print([grant["_id"] for grant in current_grants])
        self.render(
            "current_pending.tex",
            outfile + ".tex",
            pi=pi,
            pending=pending_grants,
            current=current_grants,
            pi_upper=pi["name"].upper(),
            group=group,
        )
        self.pdf(outfile)
def sender_name(self):
    """Return the sender's display name with the <address> part removed."""
    display_part = ANGLE_BRACKETS_REGEX.sub('', self.sender)
    cleaned = display_part.strip().replace('"', '')
    return unicode(HumanName(cleaned))
def test_formating(self):
    # string_format controls how str()/unicode() renders the parsed parts;
    # check both the natural order and the "Last, Title First Middle, Suffix"
    # sort-style order.
    hn = HumanName("Rev John A. Kenneth Doe III")
    hn.string_format = "{title} {first} {middle} {last} {suffix}"
    self.assertEqual(unicode(hn), "Rev John A. Kenneth Doe III")
    hn.string_format = "{last}, {title} {first} {middle}, {suffix}"
    self.assertEqual(unicode(hn), "Doe, Rev John A. Kenneth, III")
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Contact: [email protected] from nameparser import HumanName import codecs import textwrap authors = [] with codecs.open('authors.csv', 'r', encoding='utf-8') as namefile: for line in namefile: name, address = line.split(',') authors.append((HumanName(name), address)) authors = sorted(authors, key=lambda author: author[0].last) authors.insert(0, (HumanName("Michael R. Crusoe"), "*****@*****.**")) authors.append((HumanName("C. Titus Brown"), "*****@*****.**")) # print(authors) bibtex = ' author = \"' for tup in authors: name = tup[0] name.string_format = "{last}, {first} {middle} and" bibtex += str(name) + " " bibtex = bibtex[:-5] + '"' # remove last 'and' and close the quote
full_name_test = "" # Store problematic name: name = "Van Conway" # In original article, this is a first name and a last name. # let nameparser parse parsed = HumanName( name ) # look at how that turned out: print( "Parsed HumanName for " + name + ":" ) print( Person.HumanName_to_str( parsed ) ) # now, make a second HumanName instance. manual = HumanName() # look at how that turned out: print( "Empty HumanName?:" ) print( Person.HumanName_to_str( manual ) ) # override parsed values with correct name parts manual.first = "Van" manual.last = "Conway" # look at how that turned out: print( "after manual configuration:" ) print( Person.HumanName_to_str( manual ) ) # now, try some lookups
def extract_last_name(self):
    """Populate self.Xy["last_name"] with the surname parsed from each
    value of the ``name`` column."""
    def _surname(full_name):
        return HumanName(full_name).last

    self.Xy["last_name"] = self.Xy.name.apply(_surname)
# Titanic notebook fragment: explore missing Age values and derive a
# Title feature from passenger names to guide imputation. (Python 2.)
print "Now let's take care of those", train.Age.isnull().sum(), "null values"

# In[30]:

print "One idea would be to take the median age:", train.Age.median(), "or mean:", train.Age.mean(), "but I think we can get a clue from people's titles (ex Mr., Mrs.)"

# First let's see what titles we have.

# In[31]:

titles = []
for name in train.Name:
    titles.append(HumanName(name).title)
print set(titles)

# The titles look good, except there's an empty string; perhaps that's for the
# less common titles, but I feel pretty good about this range since it covers
# the basics.
# Now let's make a new feature for these titles.

# In[32]:

# NOTE(review): `train.Title = ...` sets an attribute on the DataFrame rather
# than creating a real column; pandas convention is train["Title"] = ... --
# confirm this is intentional before relying on it downstream.
train.Title = train.Name.map(lambda x: HumanName(x).title)

# In[33]:

print train[train.Title == ''].Name
print train[train.Title == ''].Survived

# These are the people with the 'empty' titles. Since there are only seven of
# them, and many of their titles are unique, I don't mind grouping them
# together into an 'uncommon title' group. Plus, they seem to follow the
# typical pattern of women survived and men died, so I do not expect any
# issues to arise in the machine learning section.
def test_capitalization_with_Mac_as_hyphenated_names(self):
    # "mc" prefixes must be re-cased correctly even inside hyphenated surnames.
    parsed = HumanName('donovan mcnabb-smith')
    parsed.capitalize()
    self.m(str(parsed), 'Donovan McNabb-Smith', parsed)
def extract_title(self):
    # Parse the honorific title (e.g. "Mr.", "Mrs.") from each name,
    # normalize it via TITLE_TRANSLATOR, and strip trailing periods.
    # NOTE(review): this reads and writes the module-level `Xy`, not
    # `self.Xy` as sibling extractors do -- confirm that is intentional.
    Xy['title'] = Xy.name.apply(lambda x: HumanName(x).title).replace(
        TITLE_TRANSLATOR).replace({'\.': ''}, regex=True)
def test_capitalize_title(self):
    # capitalize() should restore canonical casing for titles, name
    # parts, and the roman-numeral suffix.
    parsed = HumanName('lt. gen. john a. kenneth doe iv')
    parsed.capitalize()
    self.m(str(parsed), 'Lt. Gen. John A. Kenneth Doe IV', parsed)
def clean_name(self):
    """Normalize the submitted name field: lowercase it, parse it with
    HumanName, and re-capitalize to canonical casing."""
    parsed = HumanName(self.cleaned_data['name'].lower())
    parsed.capitalize()
    return unicode(parsed)
def extract_title(self):
    """Extract the honorific title (e.g. "Mr", "Mrs") from each name.

    Parses every value of the ``name`` column of ``self.Xy`` with
    nameparser's HumanName, maps raw titles through
    ``self.title_translator``, strips periods, and stores the result in
    a new ``title`` column of ``self.Xy``.
    """
    # Bug fix: the original assigned to self.Xy["title"] but read from a
    # bare `Xy`, which is undefined here (sibling extractors use self.Xy).
    self.Xy["title"] = (
        self.Xy.name.apply(lambda x: HumanName(x).title)
        .replace(self.title_translator)
        # raw string avoids the invalid-escape-sequence warning for "\."
        .replace({r"\.": ""}, regex=True)
    )
def parse_persname(persname, auth="", source=""):
    """Parse a personal-name heading into an ArchivesSpace-style dict.

    Extracts birth/death dates from the raw string, parses the remainder
    with nameparser's HumanName, applies a series of hand-coded cleanups
    for known-bad records, and returns a dict of non-empty name fields.

    :param persname: raw personal-name string, possibly with dates
    :param auth: authority id to record (optional)
    :param source: name-authority source to record (optional)
    :return: dict of unicode name components (empty fields removed)
    """
    name, birth_date, death_date = extract_birth_death_dates(persname)
    birth_date, death_date = validate_dates(birth_date, death_date)
    dates_string = make_date_string(birth_date, death_date)
    name = HumanName(name)
    # honorifics that sometimes end up parsed as suffixes
    titles = ["sir", "mr", "mrs", "baron", "dame", "madame", "viscount", "conte"]
    numbers = ["II", "III"]
    title = name.title
    suffix = name.suffix
    number = u""
    # check if the suffix should actually be a title
    if not title and any(suffix.lower().strip(". ") == title for title in titles):
        title = suffix.capitalize()
        # "Mr"/"Mrs" are conventionally written with a trailing period
        if "mr" in title.lower() and not title.endswith("."):
            title += "."
        suffix = u""
    # extract numbers from the suffix
    if suffix in numbers:
        number = suffix
        suffix = u""
    # special cases cleanup -- each branch repairs one known-bad record
    if name.title == u"Royal":
        name.title = ""
        title = ""
        name.middle = name.first if not name.middle else "{} {}".format(u"Royal", name.middle)
        name.first = u"Royal"
    if name.title == u"Queen of Great":
        title = name.title + u" Britain"
        name.first = u""
    if name.title == u"Lama":
        title = u"Dalai Lama XIV"
        name.first = u""
        name.middle = u""
    if name.title == u"Marquis":
        title = u""
        name.first = u"Marquis"
        name.middle = u"W."
    if suffix == u"1941":
        # a birth year was mis-parsed as a suffix
        birth_date = suffix
        suffix = u""
    if suffix in [u"18", u"b."]:
        suffix = u""
    if suffix == u"Jr":
        suffix += u"."
    if ", fl. 17th cent" in suffix:
        suffix = u"sieur de"
        dates_string = u"fl. 17th cent"
    rest_of_name = u"{0} {1}".format(name.first, name.middle).rstrip()
    if rest_of_name == u"Christella D. Personal journey through South Africa. 1991":
        rest_of_name = u"Christella D."
    # People with single-part names (like Keewaydinoquay) are mis-assigned:
    # the single name lands in first/middle, leaving last empty. Have to
    # fix those by promoting rest_of_name to the primary name.
    primary_name = name.last
    if rest_of_name and not primary_name:
        primary_name = rest_of_name
        rest_of_name = ""
    # create the parsed name dictionary
    name_parsed = {u"title": unicode(title),
                   u"primary_name": unicode(primary_name),
                   u"rest_of_name": rest_of_name,
                   u"suffix": unicode(suffix),
                   u"fuller_form": unicode(name.nickname),
                   u"numbers": unicode(number),
                   u"birth_date": unicode(birth_date),
                   u"death_date": unicode(death_date),
                   u"date_string": unicode(dates_string),
                   u"authority_id": unicode(auth),
                   u"source": unicode(source),
                   u"name_order": u"inverted",
                   u"sort_name_auto_generate": True}
    # remove empty fields (Python 2: .items() returns a list, so deleting
    # while iterating is safe here)
    for key, value in name_parsed.items():
        if not value:
            del name_parsed[key]
    return name_parsed
csv_writer.writeheader() #read the extracted author-names from the clean data and convert data from list to string with open("author-names_extracted_modified.csv", 'r', encoding="latin") as f: for lines in f: list_string = lines #converting author list in string def listExtractedString(list_string): str1 = "" return (str1.join(list_string)) #saved the result of the author list to string convertion authorListtoString = listExtractedString(list_string) #print(authorListtoString) #CONSTANTS.string_format = "{first} {middle} {last} ({suffix})" name = HumanName(authorListtoString) first = name.as_dict()["first"] middle = name.as_dict()["middle"] last = name.as_dict()["last"] data = [{ 'first_name1': first.strip("'['").strip("]"), 'middle_name1': middle, 'last_name1': last.strip("']") }] for row in data: #print(row) csv_writer.writerow(row) csvfile.close() # applied nameparser on the metadata (ground-truth) and saved the result in metadata_author.csv file
def get_lname(somename):
    """Return the surname component parsed from a full-name string."""
    return HumanName(somename).last
def namer(field):
    """Normalize a raw donor/author name field to a (first, last) tuple.

    Accepts either a string or a tuple of strings (joined with ", "),
    strips control characters and non-ASCII bytes, uppercases, then
    applies a sequence of hand-tuned regex repairs to nameparser's
    output. The numbered comments (6A, 7A, ...) appear to refer to
    externally documented cleanup cases -- their catalog is not visible
    here. NOTE(review): indentation was reconstructed from statement
    order; confirm against the original file.
    """
    # pre: flatten, strip tabs/newlines, force ASCII, uppercase
    if type(field) == tuple:
        w_name = re.sub('[\t\r\n]', '', ", ".join([x.encode('ascii', 'ignore') for x in field])).upper()
    else:
        w_name = re.sub('[\t\r\n]', '', field.encode('ascii', 'ignore')).upper()
    if 'ANONYMOUS' not in w_name:
        # keep the current name when a "; FORMER ..." alias is present
        if ' FORMER ' not in w_name:
            w_name = re.split(";", w_name)[0]
        else:
            w_name = re.split(";", w_name)[1]
        # drop spaces adjacent to backticks/quotes/slashes/plus signs
        w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name)
        # 6A, 4A-C
        out = HumanName(w_name)
        # strip a leading single-initial token (with or without period)
        out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
        if " " in out.last:
            out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
        # if the first name is only an initial, promote the middle name
        if re.sub("^[A-Z]\.|^[A-Z]", '', out.first) == '' and len(out.middle) != 0:
            out.first, out.middle = out.middle, ""
        else:
            out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)
        # post
        if out.middle.startswith("FOR ") or out.middle.startswith("- "):
            # 7A, 1B, 3E
            out.middle = ""
        if " FOR " in out.last:
            out.last = re.sub(" FOR .*", '', out.last)
        if len(out.last) == 0 and len(out.title) != 0:
            # 9A: title swallowed the whole name; reparse or shift first->last
            if " " in out.first:
                out = HumanName(out.first)
            else:
                out.first, out.last = "", out.first
        # joint names ("X AND Y"): keep only the part before the conjunction
        if " AND " in out.middle or " & " in out.middle:
            out.last = re.split("( AND )|( & )", out.middle)[0]
            out.middle = ""
        if "AND" in out.last or "&" in out.last:
            if out.last.startswith("AND ") or out.last.startswith("& "):
                # 3F
                out.last = HumanName(out.last).last
            elif " AND " in out.last or " & " in out.last:
                out.last = re.sub("( AND ).*|( & ).*", '', out.last)
        out.first = re.split("( AND )|&|/|\+", out.first)[0]
        out.last = re.split("/", out.last)[0].strip()
        # single-initial first name with a two-word last name: re-split
        if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
            out.first = out.last.split(" ")[0]
            out.last = out.last.split(" ")[1]
        out.capitalize()
        first, last = out.first, out.last
        if len(out.middle) > 0:
            # drop bare-initial middles; merge hyphen-adjacent ones
            if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                out.middle = ""
            elif first.endswith("-") or out.middle.startswith("-"):
                first += out.middle
            else:
                first += " %s" % out.middle
        # 8A-B
        if len(out.suffix) > 0:
            last += " %s" % out.suffix
        # 2A
        return (first, last)
    else:
        name = HumanName(w_name)
        return (name.first, name.last)
def get_author_info(ID_doi, ID_jstor):
    """
    Gets authors by doi or jstor id. Returns the names of all authors as a
    joined string as well as the overall author genders

    overall author gender will be male, female, mixed, unknown

    :param ID_doi: DOI-based id, or a falsy value to look up by jstor id
    :param ID_jstor: JSTOR id, used only when ID_doi is falsy
    :return: (joined author names, combined gender)

    >>> get_author_info('10.2307_1857439', None)
    ('Walter Goffart', 'male')
    """
    # Validate ids before opening the database.
    if ID_doi:
        id_column, id_value = 'ID_doi', ID_doi
    elif ID_jstor:
        id_column, id_value = 'ID_jstor', ID_jstor
    else:
        raise ValueError(f"NO id for doi: {ID_doi}, jstor id: {ID_jstor}.")

    db = sqlite3.connect(str(Path('data', 'JSTOR_full_cleaned.db')))
    try:
        cur = db.cursor()
        # Parameterized query: the original interpolated the id into the SQL
        # string with an f-string, which is injection-prone and breaks on
        # ids containing quotes. Only the column name (one of two literals
        # chosen above) is formatted in.
        cur.execute(
            f'SELECT name, surname, role FROM contributors WHERE {id_column} = ?',
            (id_value,)
        )
        rows = cur.fetchall()
    finally:
        # The original leaked the connection; close it even on error.
        db.close()

    genders = set()
    names = []
    for first_name, last_name, role in rows:
        last_name = last_name.strip(',')
        human_name = HumanName(f'{last_name}, {first_name}')
        gen = get_hand_coded_gender(human_name)
        genders.add(gen)
        if gen == 'n/a':
            # track names we could not classify (module-level accumulator)
            NAS.append(f'{first_name} {last_name}')
        names.append(f'{first_name} {last_name}')

    # Combine per-author genders into one label.
    if 'unknown' in genders or 'n/a' in genders:
        combined_gender = 'unknown'
    elif 'male' in genders and 'female' in genders:
        combined_gender = 'mixed'
    elif genders == {'male'}:
        combined_gender = 'male'
    elif genders == {'female'}:
        combined_gender = 'female'
    elif len(genders) == 0:
        # if no authors, return None for author names and unknown for gender
        return 'None', 'unknown'
    else:
        raise ValueError("How did you get here?", names, genders)

    combined_names = "; ".join(names)
    return combined_names, combined_gender