def get_pp_names(fn_field):
    """
    Use probablepeople to extract firstname/surname from vCard 'fn' field.

    :param fn_field: the input vCard 'fn' field.
    :return: a namedtuple containing the first name and surname; either
        component is None when probablepeople could not extract it.

    >>> get_pp_names('John Smith')
    Names(first_name='John', surname='Smith')
    """
    # BUG FIX: the doctest previously called get_names() and showed an
    # "Extracting data for ..." line that this function never prints.
    first_name = None
    surname = None
    try:
        import probablepeople as pp  # not python 2.6 compatible

        # Use probablepeople to tag the parts of the name; [0] is the
        # OrderedDict of labelled components.
        full_name_dict = pp.tag(fn_field)[0]
        # .get() leaves the default None in place when the label is absent,
        # matching the previous 'in'-check behaviour.
        first_name = full_name_dict.get('GivenName')
        surname = full_name_dict.get('Surname')
    except (ImportError, SyntaxError, TypeError) as error:
        # Best-effort: report the problem and fall through with whatever
        # was extracted so far (possibly nothing).
        print(error)
    return NAMES(first_name, surname)
def read(fn_in):
    # Read a CSV of names and tag each row's 'name' column with
    # probablepeople, accumulating results into the module-level globals
    # `tags` (set of all column names seen), `tagged_rows` (list of row
    # dicts) and `initial_keys` (original CSV field order).
    # NOTE(review): Python 2 code — print statements, .viewitems(), and
    # byte-mode CSV reading with a manual utf-8 decode.
    global tags, tagged_rows, initial_keys
    print 'Reading and parsing: %s' % fn_in
    with open(fn_in, 'rb') as incsvfile:
        reader = csv.DictReader(incsvfile, delimiter=',')
        counter = 0
        for row in reader:
            # stop early for development
            #if counter > 1000:
            #    break
            try:
                row['name'] = row['name'].decode('utf8')
                tagged = pp.tag(row['name'])
                # add t_ prefix to distinguish from original columns
                tagged0 = OrderedDict(
                    ('t_' + k, v) for k, v in tagged[0].viewitems())
                # add type
                tagged0['tag_type'] = tagged[1]
                # add to dictionary of input values
                row.update(tagged0)
            except pp.RepeatedLabelError:
                # parser saw the same label twice; record the failure and
                # keep the untagged row
                row['tag_type'] = 'RepeatedLabelError'
            except UnicodeEncodeError:
                #https://github.com/datamade/probablepeople/issues/54
                row['tag_type'] = 'UnicodeEncodeError'
            # collect every column name seen across all rows
            [tags.add(t) for t in row.keys()]
            tagged_rows.append(row)
            counter = counter + 1
    print 'Read {:,} rows'.format(counter)
    # preserve the original order of field names
    initial_keys = reader.fieldnames
def parse_name(string):
    """Parse a free-form name into (given, middle, sur) name parts.

    First tries probablepeople's pretrained CRF model; when that fails, or
    the string is not tagged as a Person, falls back to the rule-based
    nameparser.HumanName parser.

    :param string: raw name text.
    :return: tuple ``(given_name, middle_name, surname)``; missing
        components default to '' on the CRF path.
    """
    # First try the pretrained CRF model
    try:
        results, class_type = probablepeople.tag(string)
        if class_type != 'Person':
            # Not a person name -- jump to the rule-based fallback below.
            raise ValueError("Skipping ...")
        # (the old unreachable `return None` after the raise was removed)

        # Form full name
        given_name = ""
        if 'GivenName' in results:
            given_name = results['GivenName']
        elif 'FirstInitial' in results:
            given_name = results['FirstInitial']

        surname = ""
        if 'Surname' in results:
            surname = results['Surname']
        elif 'LastInitial' in results:
            # BUG FIX: previously read results['Surname'] here, which raised
            # KeyError whenever only LastInitial was tagged.
            surname = results['LastInitial']

        middle_name = ""
        if 'MiddleName' in results:  # BUG FIX: key was misspelled 'MiddelName'
            middle_name = results['MiddleName']
        elif 'MiddleInitial' in results:
            middle_name = results['MiddleInitial']

        full_name = (given_name, middle_name, surname)
    except Exception as e:
        # If there are errors, try some rule-based models:
        # BUG FIX: the message used '+' on a '{}' format string, printing the
        # literal braces instead of the name.
        print('CRF models cannot process this name {}'.format(string))
        results = HumanName(string)
        given_name, middle_name, surname = results.first, results.middle, results.last
        full_name = (given_name, middle_name, surname)
    return full_name
def parse_business_name(row, name_cols, strict=False, type='generic'):
    """Parses a Company name with probablepeople library

    Concatenates all the company name columns into a single string and then
    attempts to parse it into standardized name components and return a
    subset of the name parts that are useful when comparing Contacts and
    Accounts.  This process eliminates notes and other non-name text from
    dirty data.

    Args:
        row (pd.Series): A record
        name_cols (list): A list of column names in the record, in order,
            that when concatenated comprise a company/business name
        strict (boolean, optional): Whether or not to raise a
            RepeatedLabelError when parsing, if False, the last value of the
            repeated labels will be used for the parse
        type (str): Which probableparser to use: 'generic', 'person' or
            'company'

    Returns:
        biz_name (str or np.nan): Filtered and standardized company name

    Example:
        Add a new column to a pd.DataFrame with a clean company name

        >>> from mergepurge import clean
        >>> df['clean_company'] = \
        ...     df.apply(clean.parse_business_name, axis=1,
        ...              name_cols=['Account Name'], strict=False)
    """
    row = row.fillna('')
    concat = ' '.join(str(row.get(col, '')) for col in name_cols)
    # pre-processing (no category for non-names to train parserator with)
    cleaned = re.sub(r'(not\s*available|not\s*provided|n/a)', '', concat,
                     flags=re.IGNORECASE)
    try:
        # type='company'? the general parser works better in practice
        parsed = probablepeople.tag(cleaned, type)
    except probablepeople.RepeatedLabelError:
        if strict:
            # BUG FIX: was `raise e`, which rewrites the traceback; a bare
            # raise re-raises with the original traceback intact.
            raise
        problem_key, problem_vals, parsed = find_repeated_label(cleaned)
        parsed = (parsed, '')
    # filter out other name components that are bad for matching
    # (e.g. too generic: llc)
    rebuilt_name = [v for (k, v) in parsed[0].items() if k in KEEPERS]
    biz_name = ' '.join(rebuilt_name)
    if pd.isnull(biz_name):
        return np.nan
    return biz_name
def clean_name(full_name):
    """
    Parse a person/company name string with probablepeople and return a
    dict of tagged name components plus 'name_type' and 'original_name'
    keys.  On any error the exception object itself is returned, so callers
    must check the result type.

    (The previous docstring described a usaddress-based address cleaner and
    did not match this code.)
    """
    # NOTE(review): Python 2 code — print statements below.
    try:
        # Upper-case and insert a comma after the first token before tagging.
        original_name = full_name.upper().replace(' ',', ')
        # Make edits to original_name before calling probablepeople
        probablepeople_name = probablepeople.tag(original_name)
        #print 'probable name:', probablepeople_name
        name_dict = dict(probablepeople_name[0])
        name_type = probablepeople_name[1]
        name_dict['name_type'] = name_type
        name_dict['original_name'] = original_name
        # Remove trailing '.' from any abbreviations
        for field in people_fields_to_strip_periods:
            if field in name_dict:
                name_dict[field] = name_dict[field].strip('.')
        #print 'probable name dict:', name_dict
        return name_dict
    except Exception as e:
        # Broad catch: any failure is reported and the exception is handed
        # back to the caller instead of a dict.
        print 'probablepeople ERROR:'
        print e
        return e
def parse_contact_name(row, name_cols, strict=False, type='person'):
    """Parses a person's name with probablepeople library

    Concatenates all the contact name columns into a single string and then
    attempts to parse it into standardized name components and return a
    subset of the name parts that are useful for comparing contacts.  This
    process eliminates notes and other non-name text from dirty data.

    Args:
        row (pd.Series): A record
        name_cols (list): A list of column names in the record, in order,
            that when concatenated comprise a person's name
        strict (boolean, optional): Whether or not to raise a
            RepeatedLabelError when parsing, if False, the last value of the
            repeated labels will be used for the parse
        type (str): Which probableparser to use: 'generic', 'person' or
            'company'

    Returns:
        A subset (tuple of str, or np.nan) of the standardized name
        components, namely: (title, first, last, full_name)
    """
    row = row.fillna('')
    concat = ' '.join(row.get(col, '') for col in name_cols)
    # strip common "no data" placeholders before parsing
    cleaned = re.sub(r'(not\s*available|not\s*provided|n/a)', '', concat,
                     flags=re.IGNORECASE)
    try:
        parsed = probablepeople.tag(cleaned, type)
    except probablepeople.RepeatedLabelError:
        if strict:
            # BUG FIX: was `raise e`; a bare raise preserves the original
            # traceback for the caller.
            raise
        problem_key, problem_vals, parsed = find_repeated_label(cleaned)
        parsed = (parsed, '')
    title = parsed[0].get('PrefixOther', np.nan)
    first = parsed[0].get('GivenName', np.nan)
    last = parsed[0].get('Surname', np.nan)
    try:
        full_name = first + ' ' + last
    except TypeError:
        # one of the parts is np.nan (a float) -- no usable full name
        full_name = np.nan
    return title, first, last, full_name
def get_names(fn): """ Extract the first name and surname from a vCard 'fn' field. :param fn: the input vCard 'fn' field. :return: a namedtuple containing the first name and surname. >>> get_names('John Smith') Names(first_name='John', surname='Smith') """ first_name = None surname = None try: import probablepeople as pp # not python 2.6 compatible # Use probablepeople to tag the parts of the name. full_name_dict = pp.tag(fn)[0] if 'GivenName' in full_name_dict: # If probablepeople has successfully extracted the first name, # use it. first_name = full_name_dict['GivenName'] if 'Surname' in full_name_dict: # If probablepeople has successfully extracted the surname, use it. surname = full_name_dict['Surname'] except ImportError: pass except SyntaxError: pass fn_split = fn.split(" ") if first_name is None: # If we can't get first name from probablepeople, assume it's the # first part of the string. first_name = fn_split[0] if surname is None: # If we can't get surname from probablepeople, assume it's the # second part of the string, if that exists. if len(fn_split) > 1: surname = fn_split[1] else: surname = "" names = Names(first_name, surname) return names
def _lookup_rsvps(self, rsvp):
    """Yield a header row followed by one export row per RSVP.

    Private exports include user id, username and email columns; public
    exports carry only name and guest columns.  Missing names are parsed
    out of the display name with probablepeople on a best-effort basis.
    """
    if self.private:
        yield [
            "User Id",
            "Username",
            "Full Name",
            "First Name",
            "Last Name",
            "Email",
            "Guests",
        ]
    else:
        yield [
            "Full Name",
            "First Name",
            "Last Name",
            "Guests",
        ]
    for item in rsvp:
        full_name = first_name = last_name = ""
        if item.name:
            full_name = item.name
            try:
                # best-effort split of the display name into first/last
                tagged = probablepeople.tag(full_name)[0]
                first_name = tagged.get("GivenName")
                last_name = tagged.get("Surname")
            except Exception:
                logger.exception("unable to parse person %s", full_name)
        elif item.user:
            # no explicit name on the RSVP; use the user's db record
            first_name = item.user.first_name
            last_name = item.user.last_name
            try:
                full_name = item.user.profile.display_name
            except Exception:
                logger.exception("Unable to access user profile.")
        if self.private:
            uid = item.user.id if item.user else ""
            uname = item.user.username if item.user else ""
            row = [uid, uname, full_name, first_name, last_name,
                   item.email, item.guests]
        else:
            row = [full_name, first_name, last_name, item.guests]
        yield [unicode_convert(cell) for cell in row]
def display_name_to_rsvps(apps, schema_editor):
    """Backfill RSVP.first_name / RSVP.last_name from each user's display
    name, lower-cased.  No-op when probablepeople is unavailable."""
    if not has_probablepeople:
        return None
    UserProfile = apps.get_model('profiles', 'UserProfile')
    RSVP = apps.get_model('meetings', 'RSVP')
    for user in UserProfile.objects.all():
        # tag() returns (OrderedDict of labelled parts, category)
        tagged = probablepeople.tag(user.display_name)[0]
        first = tagged.get('GivenName', '').lower()
        last = tagged.get('Surname', '').lower()
        for rsvp in RSVP.objects.filter(user=user.pk):
            rsvp.first_name = first
            rsvp.last_name = last
            rsvp.save()
def post(self):
    """Classify a submitted party name as Person or Corporation."""
    args = partyTypeParser.parse_args()
    party_name = args['partyName']
    # tag() returns (parts, category); only the category matters here.
    entity_type = pp.tag(party_name)[-1]
    if entity_type in ('Person', 'Corporation'):
        # The parser is confident about these two categories.
        return jsonify(partyName=party_name,
                       partyTypeConfidence='High',
                       partyType=entity_type)
    # Anything else: default to Person with low confidence.
    return jsonify(partyName=party_name,
                   partyTypeConfidence='Low',
                   partyType='Person')
def get_names(name_str: str) -> Tuple[List[Name], Optional[str]]:
    """Normalize and tag a raw name string.

    Returns a list of Name records plus the probablepeople category, or
    (names, None) when tagging failed and the parse-based fallback ran.
    """
    if not name_str:
        return [], None
    # Title-case each token and drop tokens from the remove list.
    name_str = " ".join(
        token.title()
        for token in name_str.split()
        if token.lower() not in remove_list
    )
    try:
        tagged, category = tag(name_str)
        if category == CORP:
            return [Name(entity_name=name_str)], category
        if category == PERSON:
            return tagged_name_retrieve(tagged), PERSON
        # Household (ie: multiple names) should be the only other option
        return parsed_name_retrieve(parse(name_str)), category
    except RepeatedLabelError as e:
        # e.parsed_string is actually the same as parse(<name>)
        return parsed_name_retrieve(e.parsed_string), None
def _lookup_rsvps(self, rsvp):
    """Generate export rows for an RSVP list: a header row first, then one
    row per attendee.  Private exports add user id, username and email."""
    if self.private:
        header = ["User Id", "Username", "Full Name", "First Name",
                  "Last Name", "Email", "Guests"]
    else:
        header = ["Full Name", "First Name", "Last Name", "Guests"]
    yield header
    for item in rsvp:
        first_name = last_name = full_name = ""
        if not item.name:
            # No explicit name on the RSVP; fall back to the user record.
            if item.user:
                first_name = item.user.first_name
                last_name = item.user.last_name
                try:
                    full_name = item.user.profile.display_name
                except Exception:
                    logger.exception("Unable to access user profile.")
        else:
            full_name = item.name
            try:
                # try to parse out the user first/last name
                parts = probablepeople.tag(full_name)[0]
                first_name = parts.get("GivenName")
                last_name = parts.get("Surname")
            except Exception:
                logger.exception("unable to parse person %s", full_name)
        if self.private:
            row = [
                item.user.id if item.user else "",
                item.user.username if item.user else "",
                full_name,
                first_name,
                last_name,
                item.email,
                item.guests,
            ]
        else:
            row = [full_name, first_name, last_name, item.guests]
        yield [unicode_convert(x) for x in row]
def get_candidate_fields(self, raw_result):
    # Build the candidate-field dict for one raw election result row.
    # Special cases: 'yes'/'no' rows are judge retention votes; placeholder
    # names mean no candidate.  Otherwise the name is split into parts with
    # probablepeople.  NOTE(review): Python 2 print statements below.
    fields = self._get_fields(raw_result, candidate_fields)
    full_name = raw_result.full_name.strip()
    if full_name.lower() in ['yes', 'no']:
        # retention-vote rows are handled by the judge-specific path
        fields = self.get_judge_candidate_fields(raw_result)
        return fields
    if full_name.lower() in ['no candidate', 'candidate withdrew']:
        fields['full_name'] = None
        return fields
    try:
        name_parts, name_type = pp.tag(full_name)
        if name_type != 'Person':
            # keep the raw name and loudly flag the unexpected category
            print "***************************"
            print "NOT A PERSON:", fields['full_name']
            print "fields:", fields
            print "tagged name:", name_parts
            print "***************************"
            fields['full_name'] = full_name
            return fields
        fields['given_name'] = name_parts.get('GivenName')
        fields['family_name'] = name_parts.get('Surname')
        if 'SuffixGenerational' in name_parts:
            fields['suffix'] = name_parts['SuffixGenerational']
        if 'Nickname' in name_parts:
            fields['additional_name'] = name_parts['Nickname']
        fields['full_name'] = full_name
    except pp.RepeatedLabelError:
        # the parser saw the same label twice; keep the raw name as-is
        print "***************************"
        print "UNABLE TO TAG:", full_name
        print "***************************"
        fields['full_name'] = full_name
    return fields
def _valid_name(self, possible_name):
    """Check with probablepeople whether *possible_name* is a person name.

    :param possible_name: candidate name string.
    :return: tuple ``(is_valid, normalized_name)``; ``normalized_name`` is
        '' unless the string is tagged as a Person with both a given name
        and a surname, in which case it is the capitalized fragments
        re-joined in tag order.
    """
    try:
        name_dict, name_type = pp.tag(possible_name)
    except (pp.RepeatedLabelError, UnicodeEncodeError):
        return False, ''  # just ignore the name on errors
    if name_type != 'Person':
        return False, ''
    # Require both a given name and a surname to call it valid.
    if not ('GivenName' in name_dict and 'Surname' in name_dict):
        return False, ''
    # Rebuild the name from its capitalized fragments (the label names
    # themselves were never used, so iterate values directly).
    fragments = [fragment.capitalize() for fragment in name_dict.values()]
    return True, ' '.join(fragments)
def post(self):
    """Parse a party name and report its parsed parts plus a probable type
    with a confidence level."""
    args = partyName.parse_args()
    party_name_text = args['partyName']
    tagged = pp.tag(party_name_text)
    entity_type = tagged[-1]
    parsed_party = dict(tagged[0])
    # Map parser categories to confidence levels; anything unrecognized is
    # reported as a low-confidence Person.
    if entity_type in ('Person', 'Corporation'):
        party_type = {'partyTypeConfidence': 'High',
                      'partyType': entity_type}
    elif entity_type == 'Household':
        party_type = {'partyTypeConfidence': 'Medium',
                      'partyType': entity_type}
    else:
        party_type = {'partyTypeConfidence': 'Low',
                      'partyType': 'Person'}
    return jsonify(submittedPartyName=party_name_text,
                   parsedParty=parsed_party,
                   partyType=party_type)
def normalize_name(name):
    """
    Normalize a name for sorting.

    This uses two powerful python libraries for differing reasons.
    `probablepeople` contains a discriminator between company and person
    names.  This is used to determine whether to parse into last, first,
    middle or to leave the name alone.  However, the actual name parser in
    `probablepeople` is unnecessarily complex, so that strings that it
    determines to be human names are parsed instead by `nameparser`.
    """
    # BUG FIX: previously `sname = name.encode('utf-8').strip()`, which on
    # Python 3 produces *bytes* that were then returned to callers or fed to
    # HumanName.  Both libraries want text, so strip the text directly and
    # tag the stripped string (matching the corrected sibling version of
    # this function elsewhere in the file).
    sname = name.strip()  # remove leading/trailing spaces
    try:
        _, name_type = probablepeople.tag(sname)  # discard parser result
    except probablepeople.RepeatedLabelError:
        # if it can't understand the name, punt
        return sname
    if name_type == 'Corporation':
        return sname  # do not parse and reorder company names
    # treat anything else as a human name
    nameparts = HumanName(sname)
    normalized = nameparts.last.capitalize()
    if nameparts.suffix:
        normalized = normalized + ' ' + nameparts.suffix
    normalized = normalized + ','
    if nameparts.title:
        normalized = normalized + ' ' + nameparts.title
    if nameparts.first:
        normalized = normalized + ' ' + nameparts.first.capitalize()
    if nameparts.middle:
        normalized = ' ' + normalized + ' ' + nameparts.middle.capitalize()
    return normalized.strip()
def normalize_name(name):
    """
    Normalize a name for sorting.

    This uses two powerful python libraries for differing reasons.
    `probablepeople` contains a discriminator between company and person
    names.  This is used to determine whether to parse into last, first,
    middle or to leave the name alone.  However, the actual name parser in
    `probablepeople` is unnecessarily complex, so that strings that it
    determines to be human names are parsed instead by `nameparser`.
    """
    sname = name.strip()  # remove spaces
    try:
        # Renamed from `type` -- don't shadow the builtin.
        _, name_type = probablepeople.tag(sname)  # discard parser result
    except probablepeople.RepeatedLabelError:
        # if it can't understand the name, punt
        return sname
    if name_type == 'Corporation':
        return sname  # do not parse and reorder company names
    # treat anything else as a human name; assemble "Last Suffix, Title
    # First Middle" from the parsed parts
    nameparts = HumanName(sname)
    normalized = nameparts.last.capitalize()
    if nameparts.suffix:
        normalized = normalized + ' ' + nameparts.suffix
    normalized = normalized + ','
    if nameparts.title:
        normalized = normalized + ' ' + nameparts.title
    if nameparts.first:
        normalized = normalized + ' ' + nameparts.first.capitalize()
    if nameparts.middle:
        normalized = ' ' + normalized + ' ' + nameparts.middle.capitalize()
    return normalized.strip()
def extract_entity(text):
    """Tag *text* with probablepeople and map it onto a simple schema.

    Output shapes:
      Person:       {'type': 'Person', 'data': {'prefix', 'first_name',
                     'last_name', 'nick_name'}}
      Organization: {'type': 'Organization', 'data': {'name', 'legal_type'}}
    Only labels actually present in the tagging appear in 'data'.
    """
    (tags, entity_type) = pp.tag(text)
    output = {'type': '', 'data': {}}
    # probablepeople label -> output field name, per entity category
    corp_fields = {
        'CorporationName': 'name',
        'CorporationLegalType': 'legal_type',
    }
    person_fields = {
        'PrefixMarital': 'prefix',
        'GivenName': 'first_name',
        'Surname': 'last_name',
        'Nickname': 'nick_name',
    }
    for key in tags:
        if entity_type == 'Corporation' and key in corp_fields:
            output['type'] = 'Organization'
            output['data'][corp_fields[key]] = tags[key]
        elif entity_type == 'Person' and key in person_fields:
            output['type'] = 'Person'
            output['data'][person_fields[key]] = tags[key]
    return output
def export(self, property_state):
    """
    Export HPXML file from an existing HPXML file (from import) merging in
    the data from property_state.

    :param property_state: object, PropertyState to merge into HPXMLs
    :return: string, as XML (bytes from BytesIO)
    """
    # With no property_state, just serialize the previously-imported tree.
    if not property_state:
        f = BytesIO()
        self.tree.write(f, encoding='utf-8', pretty_print=True,
                        xml_declaration=True)
        return f.getvalue()
    # Start from the blank schema when nothing was imported, otherwise work
    # on a copy so the imported tree is never mutated.
    if self.tree is None:
        tree = objectify.parse(os.path.join(here, 'schemas', 'blank.xml'),
                               parser=hpxml_parser)
        root = tree.getroot()
    else:
        root = deepcopy(self.root)
    bldg = self._get_building(
        property_state.extra_data.get('hpxml_building_id'), start_from=root)
    # Copy simple scalar fields into their mapped XML locations.
    for pskey, xml_loc in self.HPXML_STRUCT.items():
        value = getattr(property_state, pskey)
        el = self.xpath(xml_loc['path'], start_from=bldg, only_one=True)
        if pskey == 'energy_score':
            # handled separately in the Energy Score section below
            continue
        # NOTE(review): if el is None while value is None and self.tree is
        # None, el.getparent() here raises AttributeError -- confirm the
        # blank schema guarantees every mapped element exists.
        if value is None and self.tree is None:
            el.getparent().remove(el)
        if value is None or el is None:
            continue
        # set the value to magnitude if it is a quantity
        if isinstance(value, ureg.Quantity):
            value = value.magnitude
        # NOTE(review): `basestring` implies Python 2 (or a compat shim).
        setattr(el.getparent(), el.tag[el.tag.index('}') + 1:],
                str(value) if not isinstance(value, basestring) else value)
    E = objectify.ElementMaker(annotate=False, namespace=self.NS,
                               nsmap={None: self.NS})
    # Owner Information: find (or create) the owner Person element.
    owner = self.xpath((
        '//h:Customer/h:CustomerDetails/h:Person'
        '[not(h:IndividualType) or h:IndividualType = "owner-occupant" or h:IndividualType = "owner-non-occupant"]'
    ), start_from=root)
    if len(owner) > 0:
        owner = owner[0]
    else:
        customer = E.Customer(
            E.CustomerDetails(
                E.Person(E.SystemIdentifier(id='person1'), E.Name())))
        root.Building.addprevious(customer)
        owner = customer.CustomerDetails.Person
    # Owner Name: parse with probablepeople and rebuild the Name element.
    if property_state.owner is not None:
        try:
            owner_name, name_type = pp.tag(property_state.owner,
                                           type='person')
        except pp.RepeatedLabelError:
            # unparseable name -> leave the existing Name untouched
            pass
        else:
            if name_type.lower() == 'person':
                owner.Name.clear()
                if 'PrefixMarital' in owner_name or 'PrefixOther' in owner_name:
                    owner.Name.append(
                        E.PrefixName(' '.join([
                            owner_name.get('Prefix' + x, '')
                            for x in ('Marital', 'Other')
                        ]).strip()))
                # FirstName / LastName are required by the schema, so empty
                # elements are appended when no component was tagged.
                if 'GivenName' in owner_name:
                    owner.Name.append(E.FirstName(owner_name['GivenName']))
                elif 'FirstInitial' in owner_name:
                    owner.Name.append(
                        E.FirstName(owner_name['FirstInitial']))
                else:
                    owner.Name.append(E.FirstName())
                if 'MiddleName' in owner_name:
                    owner.Name.append(
                        E.MiddleName(owner_name['MiddleName']))
                elif 'MiddleInitial' in owner_name:
                    owner.Name.append(
                        E.MiddleName(owner_name['MiddleInitial']))
                if 'Surname' in owner_name:
                    owner.Name.append(E.LastName(owner_name['Surname']))
                elif 'LastInitial' in owner_name:
                    owner.Name.append(E.LastName(
                        owner_name['LastInitial']))
                else:
                    owner.Name.append(E.LastName())
                if 'SuffixGenerational' in owner_name or 'SuffixOther' in owner_name:
                    owner.Name.append(
                        E.SuffixName(' '.join([
                            owner_name.get('Suffix' + x, '')
                            for x in ('Generational', 'Other')
                        ]).strip()))
    # Owner Email: append only when not already present.
    if property_state.owner_email is not None:
        new_email = E.Email(E.EmailAddress(property_state.owner_email),
                            E.PreferredContactMethod(True))
        if hasattr(owner, 'Email'):
            if property_state.owner_email not in owner.Email:
                owner.append(new_email)
        else:
            owner.append(new_email)
    # Owner Telephone: insert before Email/extension to respect schema order.
    if property_state.owner_telephone is not None:
        insert_phone_number = False
        if hasattr(owner, 'Telephone'):
            if property_state.owner_telephone not in owner.Telephone:
                insert_phone_number = True
        else:
            insert_phone_number = True
        if insert_phone_number:
            new_phone = E.Telephone(
                E.TelephoneNumber(property_state.owner_telephone),
                E.PreferredContactMethod(True))
            inserted_phone_number = False
            for elname in ('Email', 'extension'):
                if hasattr(owner, elname):
                    getattr(owner, elname).addprevious(new_phone)
                    inserted_phone_number = True
                    break
            if not inserted_phone_number:
                owner.append(new_phone)
    # Owner Address: (re)build the MailingAddress from scratch.
    try:
        address = owner.getparent().MailingAddress
    except AttributeError:
        owner.getparent().Person[-1].addnext(E.MailingAddress())
        address = owner.getparent().MailingAddress
    address.clear()
    if property_state.owner_address is not None:
        address.append(E.Address1(property_state.owner_address))
    if property_state.owner_city_state is not None:
        # usaddress splits "City, ST" into PlaceName / StateName
        city_state, _ = usadd.tag(property_state.owner_city_state)
        address.append(E.CityMunicipality(city_state.get('PlaceName', '')))
        address.append(E.StateCode(city_state.get('StateName', '')))
    if property_state.owner_postal_code is not None:
        address.append(E.ZipCode(property_state.owner_postal_code))
    # Building Certification / Program Certificate
    program_certificate_options = [
        'Home Performance with Energy Star', 'LEED Certified', 'LEED Silver',
        'LEED Gold', 'LEED Platinum', 'other'
    ]
    if property_state.building_certification is not None:
        # make sure a Project element exists
        try:
            root.Project
        except AttributeError:
            root.Building[-1].addnext(
                E.Project(
                    E.BuildingID(id=bldg.BuildingID.get('id')),
                    E.ProjectDetails(
                        E.ProjectSystemIdentifiers(
                            id=bldg.BuildingID.get('id')))))
        # unknown certifications are exported as 'other'
        new_prog_cert = E.ProgramCertificate(
            property_state.building_certification if property_state.
            building_certification in program_certificate_options else
            'other')
        try:
            root.Project.ProjectDetails.ProgramCertificate
        except AttributeError:
            # no certificate yet: insert after the first schema-ordered
            # sibling that exists
            for elname in ('YearCertified', 'CertifyingOrganizationURL',
                           'CertifyingOrganization', 'ProgramSponsor',
                           'ContractorSystemIdentifiers', 'ProgramName',
                           'ProjectSystemIdentifiers'):
                if hasattr(root.Project.ProjectDetails, elname):
                    getattr(root.Project.ProjectDetails,
                            elname).addnext(new_prog_cert)
                    break
        else:
            if property_state.building_certification not in root.Project.ProjectDetails.ProgramCertificate:
                root.Project.ProjectDetails.ProgramCertificate[-1].addnext(
                    new_prog_cert)
    # Energy Score
    energy_score_type_options = ['US DOE Home Energy Score', 'RESNET HERS']
    bldg_const = bldg.BuildingDetails.BuildingSummary.BuildingConstruction
    if property_state.energy_score:
        energy_score_type = property_state.extra_data.get(
            'energy_score_type')
        try:
            # reuse a matching EnergyScore element when one exists
            found_energy_score = False
            for energy_score_el in bldg_const.EnergyScore:
                if energy_score_type in (energy_score_el.ScoreType,
                                         getattr(energy_score_el,
                                                 'OtherScoreType', None)):
                    found_energy_score = True
                    break
            if not found_energy_score:
                energy_score_el = E.EnergyScore()
                bldg_const.EnergyScore[-1].addnext(energy_score_el)
        except AttributeError:
            # no EnergyScore at all yet; keep it before any extension
            energy_score_el = E.EnergyScore()
            try:
                bldg_const.extension.addprevious(energy_score_el)
            except AttributeError:
                bldg_const.append(energy_score_el)
        if energy_score_type in energy_score_type_options:
            energy_score_el.ScoreType = energy_score_type
        else:
            energy_score_el.ScoreType = 'other'
            energy_score_el.OtherScoreType = energy_score_type
        energy_score_el.Score = property_state.energy_score
    # Serialize
    tree = etree.ElementTree(root)
    objectify.deannotate(tree, cleanup_namespaces=True)
    f = BytesIO()
    tree.write(f, encoding='utf-8', pretty_print=True, xml_declaration=True)
    return f.getvalue()
def test_basic(self):
    # "Bob Belcher" should be recognized as a person with the expected
    # given name and surname.
    tagged, name_type = tag("Bob Belcher")
    assert name_type == "Person"
    assert tagged["GivenName"] == "Bob"
    assert tagged["Surname"] == "Belcher"
def get_contest_args(self, chicago_args, position, seen_ballot_measure):
    # Classify an election "position" string against known office names,
    # offices to skip, judge retention races, and ballot measures.
    # Returns (is_ballot_measure, chicago_args), or (None, None) for rows
    # that should be skipped or could not be tagged.
    # NOTE(review): Python 2 print statement in the error branch.
    # load known offices
    # detect judge races & ballot initiatives
    known_offices = [
        # national
        'president of the united states',
        'president and vice president of the united states',
        'pres and vice pres',
        'president, u.s.',
        'senator, u.s.',
        'united states senator',
        'u.s. senator',
        'u.s. representative',
        'representative in congress',
        'rep. in congress',
        # state
        'governor',
        'lieutenant governor',
        'governor & lieutenant governor',
        'governor and lieutenant governor',
        'secretary of state',
        'attorney general',
        'state\'s attorney',
        'comptroller',
        'treasurer',
        'state senator',
        'state representative',
        'rep. in general assembly',
        'rep. in gen. assembly',
        # county
        'commissioner',
        'board president',
        'president cook county board comm',
        'clerk',
        'sheriff',
        'treasurer',
        'assessor',
        'commissioner, county board',
        'board of review',
        'recorder of deeds',
        'supreme court',
        'appellate court',
        'apellate court',
        'judge, cook county circuit',
        'circuit court',
        'circuit couut',
        'subcircuit',
        # city
        'mayor',
        'alderman',
        'committeeman',
    ]
    offices_to_skip = [
        'ballots cast',
        'registered voters',
        'amendment',
        'national convention',
        'natl. convention',
        'delegate natl',
        'delegates natl',
        'state central committeeman',
        'state central',
    ]
    chicago_args['office'] = position.lower()
    # skip rows that aren't contests at all
    for office_substring in offices_to_skip:
        if office_substring in position.lower():
            return None, None
    # known offices: never a ballot measure; 'retain' marks retention races
    for office_substring in known_offices:
        if office_substring in position.lower():
            is_ballot_measure = False
            if 'retain' in position.lower():
                chicago_args['is_retention'] = True
            return is_ballot_measure, chicago_args
    if not seen_ballot_measure:
        # at this point, an office is none of the above; if probablepeople
        # tags the text as a Person it is assumed to be a retention race,
        # otherwise a ballot measure
        try:
            tokens, name_type = pp.tag(position.lower())
            if name_type == 'Person':
                chicago_args['is_retention'] = True
                is_ballot_measure = False
                return is_ballot_measure, chicago_args
            else:
                chicago_args['is_ballot_measure'] = True
                is_ballot_measure = True
                return is_ballot_measure, chicago_args
        except pp.RepeatedLabelError:
            print "REPEATED LABEL ERROR"
            return None, None
    else:
        # once a ballot measure has been seen, everything unknown is one
        chicago_args['is_ballot_measure'] = True
        is_ballot_measure = True
        return is_ballot_measure, chicago_args
def normalize_name(name):
    """
    Normalize a name for sorting and indexing.

    This uses two powerful python libraries for differing reasons.
    `probablepeople` contains a discriminator between company and person
    names.  This is used to determine whether to parse into last, first,
    middle or to leave the name alone.  However, the actual name parser in
    `probablepeople` is unnecessarily complex, so that strings that it
    determines to be human names are parsed instead by the simpler
    `nameparser`.
    """
    sname = name.strip()  # remove leading and trailing spaces
    # Recognizer tends to mistake concatenated initials for Corporation name.
    # Pad potential initials with spaces before running recognizer:
    # for any character A-Z followed by "." and another character A-Z, add a
    # space after the first.  (?=[A-Z]) looks ahead without consuming.
    nname = re.sub("(?P<thing>[A-Z]\\.)(?=[A-Z])", "\\g<thing> ", sname)
    try:
        # Renamed from `type` -- don't shadow the builtin.
        _, name_type = probablepeople.tag(nname)  # discard parser result
    except probablepeople.RepeatedLabelError:
        # if it can't understand the name, it's foreign
        name_type = 'Unknown'
    if name_type == 'Corporation':
        return sname  # do not parse and reorder company names
    # special case for capitalization: flag as corporation
    if (adjacent_caps.match(sname)):
        return sname
    # treat anything else as a human name and assemble
    # "Last Suffix, Title First Middle", skipping missing parts
    nameparts = HumanName(nname)
    normalized = ""
    if nameparts.last:
        normalized = nameparts.last
    if nameparts.suffix:
        if not normalized:
            normalized = nameparts.suffix
        else:
            normalized = normalized + ' ' + nameparts.suffix
    if normalized:
        normalized = normalized + ','
    if nameparts.title:
        if not normalized:
            normalized = nameparts.title
        else:
            normalized = normalized + ' ' + nameparts.title
    if nameparts.first:
        if not normalized:
            normalized = nameparts.first
        else:
            normalized = normalized + ' ' + nameparts.first
    if nameparts.middle:
        if not normalized:
            normalized = nameparts.middle
        else:
            normalized = ' ' + normalized + ' ' + nameparts.middle
    return normalized.strip()
# Fragment of a contribution-row processing loop (enclosing loop and the
# definitions of row_dict / contributor_name_field are not shown here).
# Rows holding the aggregate "not exceeding $100" total are flagged as
# anonymous; every other row gets its contributor name parsed with
# probablepeople, falling back to a corporation-name dict on failure.
if row_dict[contributor_name_field] == 'Total of Contributions not exceeding $100':
    is_annonymous = 1
else:
    full_name = contributor_name = row_dict[contributor_name_field]
    bad_name = False
    bad_addy = False
    contributor_name = row_dict[contributor_name_field]
    #print('contributor_name_field:', contributor_name)
    try:
        probablepeople_name = probablepeople.tag(contributor_name)
        #print('probablepeople_name:', probablepeople_name)
        name_type = probablepeople_name[1]
        name_dict = dict(probablepeople_name[0])
        #print('name_dict:', name_dict)
    except Exception as e:
        # any parse failure: treat the whole string as a corporation name
        # of unknown type
        name_type = 'Unknown'
        name_dict = {'CorporationName': contributor_name}
    if type(name_dict) is not dict:
        # NOTE(review): this branch looks unreachable -- both paths above
        # assign a dict to name_dict; confirm before removing.
        print('name ERROR:')
        bad_name = True
def parse_name(self, nameString):
    """Split an (already cleaned) name string into org/person name parts.

    Names are forced upper case.  Garbage substrings are removed, tokens in
    parentheses become reference names, and split/ender tokens divide the
    string into primary/secondary/reference parts.  probablepeople (when
    available) decides whether the primary name is an organization; the
    home-grown is_organization_name() check is the fallback.

    :param nameString: raw name text (clean-name step already applied).
    :return: list of single-key dicts: IS_ORGANIZATION, PRIMARY_NAME_ORG,
        SECONDARY_NAME_ORG, PRIMARY_NAME_FULL, SECONDARY_NAME_FULL,
        REFERENCE_NAME.
    """
    #--remove garbage expressions in the string
    nameString = nameString.upper()
    for garbageValue in self.variantData['GARBAGE_VALUES']:
        if garbageValue in nameString:
            self.updateStat('GARBAGE_IN_NAMES', garbageValue, nameString)
            nameString = nameString.replace(garbageValue, '').strip()
    newString = nameString
    primaryNameTokens = []
    secondaryNameTokens = []
    referenceNameTokens = []
    #--remove tokens in parenthesis; they become reference names
    groupedStrings = re.findall('\(.*?\)', newString)
    for groupedString in groupedStrings:
        self.updateStat('GROUPED_STRINGS', '()',
                        newString + ' | ' + groupedString)
        newString = newString.replace(groupedString, '')
        referenceNameTokens.append(groupedString)
    #--split the name on split/ender tokens
    theToken = None
    split = 0
    for token in newString.replace('.', ' ').replace(',', ' ').replace(
            '-', ' - ').replace('/', ' / ').replace(';', ' ; ').upper().split():
        if split == 1:
            secondaryNameTokens.append(token)
        elif split == 2:
            referenceNameTokens.append(token)
        elif token in self.variantData['NAME_SPLIT_TOKENS']:
            #--split token itself is skipped
            split = 1
            theToken = token
        elif token in self.variantData['NAME_ENDER_TOKENS']:
            #--ender token stays in the primary name
            primaryNameTokens.append(token)
            split = 2
            theToken = token
        else:
            primaryNameTokens.append(token)
    primaryNameStr = ' '.join(primaryNameTokens)
    secondaryNameStr = ' '.join(secondaryNameTokens)
    referenceNameStr = ' '.join(referenceNameTokens)
    if secondaryNameStr:
        self.updateStat(
            'NAME_SPLITERS', theToken,
            nameString + ' -> ' + primaryNameStr + ' | ' + secondaryNameStr)
    if referenceNameStr and split == 2:
        self.updateStat(
            'NAME_ENDERS', theToken,
            nameString + ' -> ' + primaryNameStr + ' | ' + referenceNameStr)
    #--probable people parser (possible labels include PrefixMarital,
    #--GivenName, Surname, Nickname, CorporationName, CorporationLegalType,
    #--ShortForm, ProxyFor, AKA, ...)
    if pp:
        try:
            taggedName, nameType = pp.tag(primaryNameStr)
            isOrganization = nameType != 'Person'
            self.updateStat('ProbablePeople', nameType, primaryNameStr)
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt.
            isOrganization = self.is_organization_name(primaryNameStr)
    #--home grown parser
    else:
        isOrganization = self.is_organization_name(primaryNameStr)
    # route the name parts to the org or person fields
    if isOrganization:
        primaryNameOrg = primaryNameStr
        secondaryNameOrg = secondaryNameStr
        primaryNameFull = ''
        secondaryNameFull = ''
    else:
        primaryNameOrg = ''
        secondaryNameOrg = ''
        primaryNameFull = primaryNameStr
        secondaryNameFull = secondaryNameStr
    nameList = []
    # BUG FIX: IS_ORGANIZATION was hard-coded to True; it must reflect the
    # computed classification above.
    nameList.append({'IS_ORGANIZATION': isOrganization})
    nameList.append({'PRIMARY_NAME_ORG': primaryNameOrg})
    nameList.append({'SECONDARY_NAME_ORG': secondaryNameOrg})
    nameList.append({'PRIMARY_NAME_FULL': primaryNameFull})
    nameList.append({'SECONDARY_NAME_FULL': secondaryNameFull})
    nameList.append({'REFERENCE_NAME': referenceNameStr})
    return nameList
def tagger(self, field):
    """Delegate to probablepeople and return its (tagged parts, type) pair."""
    tagged_result = probablepeople.tag(field)
    return tagged_result
from sqlalchemy import create_engine
import csv
import probablepeople as pp

# NOTE(review): SQLAlchemy 1.4+ dropped the "postgres://" scheme; this URL
# may need to become "postgresql://" depending on the installed version.
conn = create_engine("postgres://brian@localhost:5432/acris")
c = conn.raw_connection().cursor()
c.execute("select distinct name from parties")

keys_of_interest = ['GivenName', 'Surname']
all_keys = set()

# setup output: a context manager guarantees the csv is flushed and closed
# even if tagging blows up mid-run (the original never closed the file)
with open("out.csv", "w+", newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for one in c:
        name, = one
        out = [name]
        try:
            parsed, t = pp.tag(name)
            out.append(t)
            for key in keys_of_interest:
                # blank cell when probablepeople did not produce the label
                out.append(parsed.get(key, ''))
            # track every label seen so we know what else we could extract
            all_keys.update(parsed)
        except Exception:  # was a bare except; pp.RepeatedLabelError is typical
            pass
        # best-effort: a failed parse still writes the name-only row
        csvwriter.writerow(out)

# now let's track what we might get out
print("Done! Here's what else we could have extracted")
print(all_keys)
out_fn = '../data/cms-physician-permutation-tagged.csv' tagged_rows = [] tags = set() print 'Reading and parsing' with open(in_fn, 'rb') as incsvfile: reader = csv.DictReader(incsvfile, delimiter=',') counter = 0 for row in reader: # stop early for development #if counter > 1000: # break try: tagged = pp.tag(row['name']) # add t_ prefix to distinguish tagged0 = OrderedDict( ('t_' + k, v) for k, v in tagged[0].viewitems()) # add type tagged0['tag_type'] = tagged[1] # add to dictionary of input values row.update(tagged0) except pp.RepeatedLabelError: row['tag_type'] = 'RepeatedLabelError' [tags.add(t) for t in row.keys()] tagged_rows.append(row) counter = counter + 1 print 'Read {:,} rows'.format(counter)
def tag_ppl(x):
    """Return probablepeople's classification for ``x`` (e.g. 'Person',
    'Corporation'), or NaN when ``x`` cannot be lowercased or tagged
    (e.g. a pandas NaN float instead of a string).

    :param x: value to classify; expected to be a string.
    :return: the probablepeople type string, or ``np.nan`` on failure.
    """
    try:
        return probablepeople.tag(x.lower())[1]
    except Exception:
        # was a bare except: that also swallowed KeyboardInterrupt/SystemExit
        return np.nan
# tokens that mark a name as corporate "by our definition"
_CORPORATE_TOKENS = ["CO", "LLC", "TRUST", "LL", "LP", "DEPARTMENT", "PLAN",
                     "OF", "INC", "FAMILY", "PROPERTIES", "REVOCABLE",
                     "ESTATES", "&", "INVESTMENTS"]


def _first_present(parsed_name, *keys):
    # Return the value of the first of *keys* present in parsed_name, else ''.
    for key in keys:
        if key in parsed_name:
            return parsed_name[key]
    return ''


def _classify_single(raw_name, tbd_as_dict):
    """Classify one delimited name part with probablepeople.

    Personal names arrive as "Last First [Middle]" and are reordered to
    "First [Middle] Last" before tagging. ``tbd_as_dict`` controls how a
    name that still tags as Corporation after reordering is reported
    (owners used a ``{"TBD": ...}`` dict, buyers a ``"TBD: ..."`` string).
    """
    tokens = raw_name.split(' ')
    if any(token in tokens for token in _CORPORATE_TOKENS):
        # corporate by our definition: keep probablepeople's raw parse
        try:
            parsed_name, _ = probablepeople.tag(raw_name)
            return parsed_name
        except probablepeople.RepeatedLabelError as e:
            return {'original name': e.original_string,
                    'parsed name': e.parsed_string}

    reordered = raw_name
    if len(tokens) > 1:
        if len(tokens[1]) == 1:
            # "Last M First" style (single-char second token): swap the ends
            tokens[0], tokens[-1] = tokens[-1], tokens[0]
            reordered = ' '.join(tokens)
        else:
            # "Last First [Middle]": rotate the surname to the end
            reordered = ' '.join(tokens[1:] + tokens[:1])

    try:
        parsed_name, parsed_type = probablepeople.tag(reordered)
    except probablepeople.RepeatedLabelError as e:
        return {'original name': e.original_string,
                'parsed name': e.parsed_string}

    if parsed_type == "Person":
        return {
            'LastName': _first_present(parsed_name,
                                       'LastName', 'Surname', 'LastInitial'),
            'MiddleName': _first_present(parsed_name,
                                         'MiddleName', 'MiddleInitial'),
            'FirstName': _first_present(parsed_name,
                                        'GivenName', 'FirstInitial'),
        }
    if parsed_type == "Household":
        return parsed_name
    # still categorized as "Corporation" after reordering:
    # report TBD (To Be Decided) with the original name string
    return {"TBD": raw_name} if tbd_as_dict else "TBD: " + raw_name


def _classify_names(names, delimiter, tbd_as_dict):
    """Classify a column of raw name strings, each split on *delimiter*.

    Empty cells produce a blank Last/Middle/First record; non-empty cells
    produce a list with one parsed entry per delimited part.
    """
    results = []
    for name in names:
        if not name:
            results.append({'Last Name': '',
                            'Middle Name': '',
                            'First Name': ''})
        else:
            results.append([_classify_single(part, tbd_as_dict)
                            for part in name.split(delimiter)])
    return results


def name_classifier(input_file, output_file):
    """Read the owner/buyer name columns from *input_file* (Excel), classify
    each name with probablepeople, and dump the results to *output_file* as
    JSON with 'Owners' and 'Buyers' lists.

    Bug fixed: the buyers loop appended its empty-name records to
    ``output['Owners']`` instead of ``output['Buyers']``; all buyer results
    now land under 'Buyers'.
    """
    data = pd.read_excel(input_file)
    data.fillna('', inplace=True)
    output = {
        # owners are comma-delimited; unresolved corporates -> {"TBD": name}
        'Owners': _classify_names(data['Owners Names'], ', ',
                                  tbd_as_dict=True),
        # buyers are slash-delimited; unresolved corporates -> "TBD: name"
        'Buyers': _classify_names(data['Last Buyers Names'], '/ ',
                                  tbd_as_dict=False),
    }
    with open(output_file, 'w') as outfile:
        json.dump(output, outfile)
def test_basic(self):
    """Sanity check: a plain two-token name tags as a Person."""
    tagged, name_type = tag("Bob Belcher")
    # consistency: use unittest assertions throughout (was a mix of a bare
    # assert and self.assertEqual, which loses the diff message on failure)
    self.assertEqual('Person', name_type)
    self.assertEqual("Bob", tagged['GivenName'])
    self.assertEqual("Belcher", tagged['Surname'])
def _consume_match(pattern, line_chunk):
    """Search *pattern* in *line_chunk* and cut the matched span out.

    Returns ``(match, remaining_chunk, fully_consumed)``: the match object
    (or None), the chunk with the matched span removed, and whether the
    match covered the entire chunk (in which case the chunk is returned
    unchanged — the caller stops processing it).
    """
    match = pattern.search(line_chunk)
    if not match:
        return None, line_chunk, False
    start, end = match.span()
    if start == 0 and end == len(line_chunk):
        return match, line_chunk, True
    return match, line_chunk[:start] + line_chunk[end:], False


def parse_lines_rules_based_version(lines):
    """
    Parse a sequence of text lines belonging to the "basics" section of a
    résumé to produce structured data in the form of
    :class:`schemas.ResumeBasicsSchema` using logical rules based on regular
    expressions, *PLUS* some pre-trained CRF parsers.

    Args:
        lines (List[str])

    Returns:
        Dict[str, obj]
    """
    data = {}
    # (key, regex, match -> stored value) specs replacing the four previously
    # duplicated email/phone/website/profiles stanzas
    simple_fields = (
        ("email", regexes.RE_EMAIL, lambda match: match.group()),
        ("phone", regexes.RE_PHONE_NUMBER, lambda match: match.group()),
        ("website", regexes.RE_URL, lambda match: match.group()),
        ("profiles", regexes.RE_USER_HANDLE,
         lambda match: [{"username": match.group()}]),
    )
    for line in lines:
        if not line:
            continue
        for line_chunk in regexes.RE_LINE_DELIM.split(line):
            if not line_chunk:
                continue
            chunk_exhausted = False
            for key, pattern, make_value in simple_fields:
                if key in data:
                    continue
                match, line_chunk, chunk_exhausted = _consume_match(
                    pattern, line_chunk)
                if match:
                    data[key] = make_value(match)
                    if chunk_exhausted:
                        # the whole chunk was this field; nothing left to parse
                        break
            if chunk_exhausted:
                continue
            if "location" not in data:
                try:
                    location, location_type = usaddress.tag(
                        line_chunk,
                        tag_mapping=basics.constants.LOCATION_TAG_MAPPING)
                except usaddress.RepeatedLabelError as e:
                    LOGGER.debug("'location' parsing error:\n%s", e)
                    continue
                if location_type == "Street Address":
                    location = dict(location)
                    # a "recipient" inside an address is really a person name
                    if "recipient" in location:
                        data["name"] = location.pop("recipient")
                    data["location"] = location
            if "name" not in data:
                try:
                    name, name_type = probablepeople.tag(line_chunk)
                except probablepeople.RepeatedLabelError as e:
                    LOGGER.debug("'name' parsing error:\n%s", e)
                    continue
                if name_type == "Person":
                    data["name"] = " ".join(name.values())
    return data
def test_basic(self):
    """A plain two-token name should tag as GivenName + Surname of a Person."""
    parsed, kind = tag("Bob Belcher")
    assert kind == 'Person'
    assert parsed['GivenName'] == "Bob"
    assert parsed['Surname'] == "Belcher"