def normalize_person(cls, node, graph):
    """Normalize a person node's name attributes in place, or drop the node.

    Picks the longest of (a) the name re-joined from the individual name-part
    attributes and (b) the raw 'name' attribute.  If the winner matches
    NULL_RE (no usable name), the node is removed from the graph instead.
    """
    # Longest candidate wins; the bare '' guards max() when both candidates are empty.
    name = max(strip_whitespace(' '.join(node.attrs[x] for x in NAME_PARTS.values() if node.attrs.get(x))), strip_whitespace(node.attrs.get('name', '')), '', key=len)
    if NULL_RE.match(name):
        logger.debug('Discarding unnamed agent "%s"', node.attrs.get('name', ''))
        return graph.remove(node)
    human = nameparser.HumanName(name)
    # Re-split the winning name into title-cased parts, dropping empty ones.
    parts = { v: strip_whitespace(human[k]).title() for k, v in NAME_PARTS.items() if strip_whitespace(human[k]) }
    # attrs is rebuilt from scratch: canonical joined 'name' plus the parts.
    # NOTE(review): this discards every other pre-existing attr, so the
    # 'location' check below can never be true afterwards — looks like
    # 'location' should be read/preserved before this reassignment; confirm.
    node.attrs = { 'name': ' '.join(parts[k] for k in NAME_PARTS.values() if k in parts), **parts }
    if node.attrs.get('location'):
        node.attrs['location'] = strip_whitespace(node.attrs['location'])
def get_first_name(legislator):
    """Return the legislator's first name, preferring an explicit given_name."""
    explicit = legislator.get('given_name')
    if explicit:
        return explicit
    return nameparser.HumanName(legislator['name']).first
def update_fullname(self):
    """Rebuild the cached full-name strings from this object's name parts."""
    human = nameparser.HumanName('')
    for part in name_parts:
        setattr(human, part, getattr(self, part) or '')
    self._full = unicode(human)
    self._lfull = self._full.lower()
def get_user_from_headers(message):
    """Best-effort lookup of the DebianUser matching a message's From header.

    Tries, in order: the forwarding email address, the parsed human name
    (cn/sn pair), and the @debian.org uid.  A candidate is accepted only
    when it matches exactly one user.  Returns None on no match, ambiguity,
    or any error during parsing/lookup.
    """
    try:
        humanName = None
        emailForward = None
        uid = None
        if 'From' in message:
            (x, y) = email.utils.parseaddr(message.get('From'))
            if x:
                humanName = nameparser.HumanName(x)
            if y:
                emailForward = y
                if y.endswith('@debian.org'):
                    uid = y.split('@')[0]
        if emailForward:
            result = DebianUser.objects.filter(emailForward=emailForward)
            if len(result) == 1:
                return result[0]
        if humanName:
            result = DebianUser.objects.filter(cn=humanName.first, sn=humanName.last)
            if len(result) == 1:
                return result[0]
        if uid:
            result = DebianUser.objects.filter(uid=uid)
            if len(result) == 1:
                return result[0]
    except Exception:
        # Was a bare `except:`; keep the deliberate best-effort behavior but
        # stop swallowing SystemExit/KeyboardInterrupt.
        pass
    return None
def map_name(oneline_name, name_object):
    """Parse a one-line name string and copy its parts onto name_object.

    first/last are always assigned; middle/suffix only when non-empty.
    """
    parsed = nameparser.HumanName(str(oneline_name))
    name_object.first = parsed['first']
    if parsed['middle']:
        name_object.middle = parsed['middle']
    name_object.last = parsed['last']
    if parsed['suffix']:
        name_object.suffix = parsed['suffix']
def get_creators(names):
    """Format each raw name as 'Last, First [Middle]' and return the list."""
    formatted = []
    for raw in names:
        parsed = nameparser.HumanName(raw)
        given = parsed.first
        if parsed.middle:
            given = given + ' ' + parsed.middle
        formatted.append('{}, {}'.format(parsed.last, given))
    return formatted
def make_person_name(person):
    """Assemble a display-name string from the person record's name fields."""
    name = nameparser.HumanName()
    field_map = [('title', 'title'), ('first', 'first name'),
                 ('middle', 'middle name'), ('last', 'last name'),
                 ('suffix', 'suffix')]
    for attr, key in field_map:
        setattr(name, attr, person[key])
    return unicode(name)
def _parse(self, *args, **kwargs):
    """Reconcile a person's 'name' attr with its individual name-part attrs.

    When 'name' is absent it is assembled from the parts; otherwise the
    parts are (re)derived by parsing 'name'.  No-op unless the 'parse'
    flag is set and the node is a person.
    """
    if not self.attrs.pop('parse') or self.type != 'person':
        return
    if self.attrs.get('name'):
        human = nameparser.HumanName(self.attrs['name'])
        for src, dest in [('first', 'given_name'), ('middle', 'additional_name'),
                          ('last', 'family_name'), ('suffix', 'suffix')]:
            if human[src]:
                self.attrs[dest] = human[src]
    else:
        part_keys = ['given_name', 'additional_name', 'family_name', 'suffix']
        self.attrs['name'] = ' '.join(self.attrs[key] for key in part_keys
                                      if self.attrs.get(key))
def parse_name(name):
    """Parse a raw name string into a Person; non-strings pass through as-is.

    Empty name parts become None; the original string is kept in `raw`.
    """
    if not isinstance(name, str):
        return name
    parsed = nameparser.HumanName(name)
    return Person(
        first=parsed.first or None,
        middle=parsed.middle or None,
        last=parsed.last or None,
        raw=name,
    )
def from_fullname(cls, session, name, email=None):
    """Create-or-fetch a unique record from a full-name string.

    Each parsed name part is stripped of trailing/leading periods before
    being handed to as_unique.
    """
    parsed = nameparser.HumanName(name)

    def clean(part):
        # One place for the shared ".".strip normalization.
        return getattr(parsed, part).strip('.')

    return cls.as_unique(
        session,
        fname=clean('first'),
        lname=clean('last'),
        mname=clean('middle'),
        title=clean('title'),
        suffix=clean('suffix'),
        nickname=clean('nickname'),
        email=email,
    )
def __init__(self, reference_text):
    """Parse a BibTeX reference string and prepare demographic lookups.

    Builds (or loads a cached pickle of) an ethnicity lookup table from the
    2010 Census surname CSV, then extracts author first/last names from the
    BibTeX entries into self.first_names / self.last_names / self.raw_results.

    :param reference_text: BibTeX source for the references to analyze.
    """
    self.gender_options = [
        'male', 'mostly_male', 'andy', 'mostly_female', 'female', 'unknown'
    ]
    self.gender_results = {key: 0 for key in self.gender_options}
    self.race_options = [
        'pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace',
        'pcthispanic', 'race_unknown'
    ]
    self.ethnicity_results = {key: 0 for key in self.race_options}
    self.raw_results = {}
    pickle_path = pathlib.Path(
        __file__).parent / 'data' / 'ethnicity_lookup.p'
    csv_path = pathlib.Path(
        __file__).parent / 'data' / 'Names_2010Census.csv'
    # Load the cached lookup if present, else build it from the census CSV
    # and cache it.  Fix: the original pickle load/dump used bare open()
    # calls that leaked file handles; use context managers.
    if os.path.isfile(pickle_path):
        with open(pickle_path, 'rb') as pickle_file:
            self.ethnicity_lookup = pickle.load(pickle_file)
    else:
        self.ethnicity_lookup = {}
        with open(csv_path) as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                self.ethnicity_lookup[row['name']] = {}
                # race_unknown (last option) has no CSV column, so skip it.
                for race in self.race_options[:-1]:
                    try:
                        value = float(row[race])
                    except ValueError:
                        # CSV uses non-numeric placeholders for suppressed data.
                        value = 0
                    self.ethnicity_lookup[row['name']][race] = value
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(self.ethnicity_lookup, pickle_file)
    # Parse author names from the BibTeX input.
    self.reference_text = reference_text
    self.references = bibtexparser.loads(reference_text)
    logging.info(self.reference_text)
    logging.info(self.references.entries)
    self.first_names = []
    self.last_names = []
    for paper in self.references.entries:
        if "author" in paper:
            # BibTeX separates multiple authors with ' and '.
            authors = paper["author"].split(' and ')
            for person in authors:
                name = nameparser.HumanName(person)
                self.first_names.append(name.first)
                self.last_names.append(name.last)
    self.raw_results['first_name'] = self.first_names
    self.raw_results['last_name'] = self.last_names
def build(gdb):
    """Populate the graph DB with Person/Event/Food nodes and relationships.

    No-op (with a notice) when labels already exist.  Node data comes from
    generate_data; relationship data comes from events.json.
    """
    if gdb.labels:
        print('Labels, Nodes, and Relationships already loaded')
        return
    person_label = gdb.labels.create('Person')
    people_by_name = {
        full_name: person_label.create(
            name=full_name, last_name=nameparser.HumanName(full_name).last)
        for full_name in generate_data.gen_names()
    }
    colors = generate_data.gen_colors()
    event_label = gdb.labels.create('Event')
    event_nodes_by_type = {
        etype: event_label.create(
            name=etype,
            colors=random.sample(colors, random.randint(1, 3)),
            user=json.dumps({
                'user_name': 'jhoweyusername',
                'password': '******'
            }))
        for etype in generate_data.gen_event_types()
    }
    food_label = gdb.labels.create('Food')
    food_nodes = {food: food_label.create(name=food)
                  for food in generate_data.gen_foods()}
    with open('events.json') as eventsfile:
        events_all = json.load(eventsfile)
    for event_type, events in events_all.items():
        for event in events:
            person = people_by_name[event['name']]
            # distinct local name so the loop variable `event` stays untouched
            event_node = event_nodes_by_type[event_type]
            food_node = food_nodes[event['food']]
            person.relationships.create('Attends', event_node,
                                        confirmed=event['confirmed'],
                                        brought=event['food'])
            person.relationships.create('Brings', food_node,
                                        signup_date=event['signup_date'],
                                        attended=event_type)
def clean_name(df):
    """Split the 'Name' column into title/first/middle/last/suffix/nickname.

    Titles outside the common four are bucketed as 'Other'.  Mutates and
    returns df.
    """
    constants = nameparser.config.Constants()
    constants.titles.add('Major', 'Jonkheer', 'Don')
    df['nmp_tag'] = df['Name'].apply(lambda raw: nameparser.HumanName(
        full_name=raw, constants=constants).as_dict(False))
    common_titles = ['Mr.', 'Miss.', 'Mrs.', 'Master']
    df['title'] = df['nmp_tag'].apply(lambda tag: tag.get('title'))
    df['title'] = df['title'].apply(
        lambda t: t if t in common_titles else 'Other')
    # Each remaining column is a straight lookup into the parsed dict;
    # bind `key` as a default arg to avoid late-binding surprises.
    for column, key in [('first_name', 'first'), ('middle_name', 'middle'),
                        ('last_name', 'last'), ('suffix', 'suffix'),
                        ('nickname', 'nickname')]:
        df[column] = df['nmp_tag'].apply(lambda tag, k=key: tag.get(k))
    df.drop(columns=['nmp_tag'], inplace=True)
    return df
def get_roster(self, team):
    """Return a roster DataFrame for `team` in season self.year.

    Primary path reads 'Rosters/<Team Year>.xlsx'; on ANY failure it falls
    back to the active-player MongoDB collection via PlayerNameMatcher.
    Result columns: Name, Last, Position, Jersey Number, Class, Player_id.
    """
    # assuming players are always addressed in the play-by-play by last names
    # school name in title form and space inbetween school and self.year
    try:
        # for games for which roster are stored in excel spreadsheets
        roster = pd.read_excel('Rosters/{}.xlsx'.format(' '.join([team.title(), str(self.year)])))
        # I also want to convert class (now: fr, so, jr, sr) into the year a player get into college'
        for i in roster.index:
            name = roster.at[i, 'Name']
            parsed_name = nameparser.HumanName(name)
            if parsed_name.last == "":
                # must have last name. if cannot find, use first name as last name
                parsed_name.last = parsed_name.first
                parsed_name.first = ""
            # roster.at[i, 'First'] = parsed_name.first
            # roster.at[i, 'Middle'] = parsed_name.middle
            roster.at[i, 'Last'] = parsed_name.last
            # roster.at[i, 'Suffix'] = parsed_name.suffix
        # Fake the jersey number: use the index of the player in the roster df as Jersey Number for the moment
        if roster['Jersey Number'].isnull().all() == True:
            roster['Jersey Number'] = [str(i) for i in roster.index.tolist()]
        roster['Jersey Number'] = [str(j) for j in roster['Jersey Number']]
        roster['Season'] = str(self.year)
        roster['Team'] = team
        # Player_id is "<Season>-<Team>-<Jersey Number>" per row.
        roster['Player_id'] = ['-'.join(roster.loc[i, ['Season', 'Team', 'Jersey Number']]) for i in roster.index]
        # find players in the same roster with the same last names
        # problematic_names = roster['Last'].value_counts().loc[roster['Last'].value_counts() > 1,].index
        # roster['Problems'] = ['player with same last name exists' * bool(n in problematic_names) for n in
        #                       roster['Last']]
        roster = roster[['Name', 'Last', 'Position', 'Jersey Number', 'Class', 'Player_id']]
    # NOTE(review): bare except — any error in the excel path (even a typo/
    # KeyError) silently switches to the MongoDB path; consider narrowing.
    except:
        # for the games with active roster MongoDB Collection available
        matcher = PlayerNameMatcher(team.title(), int(self.year), 'MFB')
        code = matcher._get_team_code()
        roster_dict = matcher._get_active_players(code, int(self.year), 'MFB')
        if roster_dict != {}:
            roster = pd.DataFrame()
            # Active-player records store names as "Last, First", hence the
            # split(",")[0] for the Last column.
            roster['Name'] = [roster_dict.get(k).player_name for k in roster_dict.keys()]
            roster['Last'] = [roster_dict.get(k).player_name.split(",")[0] for k in roster_dict.keys()]
            roster['Position'] = [roster_dict.get(k).pos for k in roster_dict.keys()]
            roster['Jersey Number'] = [roster_dict.get(k).jersey_number for k in roster_dict.keys()]
            roster['Class'] = [roster_dict.get(k).player_class for k in roster_dict.keys()]
            roster['Player_id'] = [roster_dict.get(k).player_uuid for k in roster_dict.keys()]
        else:
            raise NameError('Roster for {} {} is not found'.format(team.title(), self.year))
    return roster  # a df used in match_players function.
def _map_row_to_instructor_names(self, row):
    """Build 'LAST, F'-style strings from the row's ';'-separated Instructor field.

    Names missing either a first or last component are skipped.  Returns a
    dict with the primary instructor and the full list.
    """
    formatted = []
    for raw_name in row['Instructor'].split(';'):
        parsed = nameparser.HumanName(raw_name)
        if parsed.first and parsed.last:
            formatted.append(
                f'{parsed.last.upper()}, {parsed.first.upper()[0]}'
            )
    primary = formatted[0] if formatted else ''
    return {
        'instructor': primary,
        'instructors': formatted,
    }
def perform_inserts(curr, conn, color_dict, event_type_dict, names, foods):
    """Load events.json and insert each event into both the normalized
    `events` table and the denormalized `events_json` table.

    :param curr: DB cursor (executemany / rowcount).
    :param conn: DB connection (committed once per table per event type).
    :param color_dict: color name -> color_id mapping.
    :param event_type_dict: event type name -> event_type_id mapping.
    :param names: unused here — presumably kept for signature parity; verify.
    :param foods: unused here — presumably kept for signature parity; verify.
    """
    with open('events.json') as eventsfile:
        events_all = json.load(eventsfile)
    counter = 0
    for event_type, events in events_all.items():
        normalized_tpls, json_tpls = list(), list()
        for event in events:
            # Row for the normalized table: FK ids plus scalar event fields.
            tpl_n = (
                event_type_dict[event_type],
                'jhoweyusername',
                'jhoweypassword',
                color_dict[random.choice(list(color_dict.keys()))],
                event['name'],
                event['food'],
                event['confirmed'],
                event['signup_date'],
            )
            normalized_tpls.append(tpl_n)
            # Row for the JSON table: denormalized metadata plus the full
            # event serialized as JSON.
            tpl_j = (
                event_type,
                'jhoweyusername',
                'jhoweypassword',
                event['name'],
                nameparser.HumanName(event['name']).last,
                random.choice(list(color_dict.keys())),
                json.dumps(event),
            )
            json_tpls.append(tpl_j)
        # Parameterized inserts (placeholders), batched per event type.
        qry_n = 'INSERT INTO events (event_type_id, user_name, password, color_id, name, food, confirmed, signup_date) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)'
        curr.executemany(qry_n, normalized_tpls)
        counter += curr.rowcount
        conn.commit()
        qry_j = 'INSERT INTO events_json (event_type, user_name, password, metadata_name, metadata_last_name, metadata_color, event) VALUES (%s,%s,%s,%s,%s,%s,%s)'
        curr.executemany(qry_j, json_tpls)
        conn.commit()
    print('Inserted %d events x2' % counter)
def bill_sponsor(person):
    """Grab complete sponsor information from GPO 'person' object.

    Parses the sponsor's name and enriches the record with party and
    facebook id from the ProPublica Congress API.

    :param person: dict with 'name', 'bioguide_id', 'state', 'title'.
    :returns: dict of sponsor fields.
    :raises AssertionError: when the ProPublica response has no results.
    """
    sponsor = {}
    parsed_name = nameparser.HumanName(person['name'])
    sponsor['first_name'] = parsed_name['first']
    sponsor['last_name'] = parsed_name['last']
    sponsor['id'] = person['bioguide_id']
    sponsor['state'] = person['state']
    sponsor['title'] = person['title']
    sponsor['facebook_id'] = None
    endpoint = 'https://api.propublica.org/congress/v1/members/' \
        + sponsor['id'] + '.json'
    # SECURITY: hard-coded API key checked into source — move to an
    # environment variable or config file.
    headers = {
        'X-API-Key': '76l8Lwp3w45mu6BeOShc17r3H4I264iK2mqMfX1k'
    }
    request = requests.get(endpoint, headers=headers)
    response = json.loads(request.text)
    if 'results' in response and response['results']:
        sponsor['party'] = response['results'][0]['current_party']
        sponsor['facebook_id'] = response['results'][0]['facebook_account']
    else:
        # Fix: was `assert False`, which is silently stripped under
        # `python -O`.  Raise the same exception type explicitly so an
        # empty API response always fails loudly.
        raise AssertionError(
            'Empty ProPublica response for member {}'.format(sponsor['id']))
    return sponsor
def __init__(self, name, email='', directorate='', division='', programs=None):
    """Parse `name` into parts and normalize the organizational metadata.

    :param name: full name string to parse.
    :param email: email address (stored verbatim).
    :param directorate: directorate label; upper-cased, punctuation-stripped.
    :param division: division label; upper-cased, punctuation-stripped.
    :param programs: iterable of program labels, or None for an empty set;
        each label is upper-cased and punctuation-stripped.
    """
    parsed_name = nameparser.HumanName(name)
    self.fname = parsed_name.first.encode('utf-8')
    self.lname = parsed_name.last.encode('utf-8')
    self.mname = parsed_name.middle.strip('.').encode('utf-8')
    self.title = parsed_name.title.strip('.').encode('utf-8')
    self.suffix = parsed_name.suffix.strip('.').encode('utf-8')
    self.nickname = parsed_name.nickname.encode('utf-8')
    self.email = email
    self.directorate = directorate.upper().strip(string.punctuation)
    self.division = division.upper().strip(string.punctuation)
    # Fix: the original condition was inverted (`... if programs is None
    # else set(programs)`), which iterated None and raised TypeError for
    # the default argument.  Normalize provided programs like the other
    # fields; default to an empty set.
    self.programs = set(
        p.upper().strip(string.punctuation) for p in programs
    ) if programs is not None else set()
    self.id = Person.ID
    Person.ID += 1
def create_names_df(data_path='test_data/test_names.txt'):
    """ Create a DataFrame from the provided data path. Each line in the file should be a single name

    :param data_path: Path to data
    :type data_path: str
    :return: DataFrame, containing names, and column 'last'
    :rtype: pd.DataFrame
    """
    names_list = list()

    # Iterate through file, add parsed names to list.
    # Fix: the original iterated a bare open() and leaked the file handle;
    # use a context manager so the file is always closed.
    with open(data_path) as names_file:
        for line in names_file:
            names_list.append(nameparser.HumanName(line).as_dict())

    # Convert to DataFrame
    names_df = pd.DataFrame(names_list)

    # Uppercase to match Census data
    names_df['last'] = names_df['last'].apply(lambda x: x.upper())
    return names_df
def search_inmates(session):
    """:py:mod:`bottle` route handling a GET inmate search.

    Reads the ``query`` GET parameter.  A numeric query (dashes ignored) is
    treated as an inmate id; anything else is parsed as a name and must
    contain both first and last components.

    :returns: JSON-serializable dict with ``inmates`` and ``errors`` keys.
    """
    search = bottle.request.query.get("query")
    if not search:
        raise bottle.HTTPError(400, "Some search input must be provided")
    try:
        # Numeric path: "123-456" and "123456" are the same id.
        inmate_id = int(search.replace("-", ""))
        inmates, errors = db.query_providers_by_id(session, inmate_id)
    except ValueError:
        # Name path: require both first and last name.
        name = nameparser.HumanName(search)
        if not (name.first and name.last):
            message = "If using a name, please specify first and last name"
            raise bottle.HTTPError(400, message)  # pylint: disable=raise-missing-from
        inmates, errors = db.query_providers_by_name(
            session, name.first, name.last)
    return {"inmates": schemas.inmates.dump(inmates), "errors": errors}
def perform_inserts(events_collection):
    """Enrich every event from events.json and bulk-insert into Mongo.

    Each document gains its event type, a fixed _user stub, and the
    ``__``-prefixed fields used for keyword indexing later.
    """
    colors = generate_data.gen_colors()
    with open('events.json') as eventsfile:
        events_all = json.load(eventsfile)
    documents = []
    for event_type, events in events_all.items():
        # Enrich in place, then queue for the single bulk insert.
        for doc in events:
            doc['event_type'] = event_type
            doc['_user'] = {'user_name': 'jhoweyusername',
                            'password': '******'}
            # Add fields for our keyword indexing later
            doc['__color'] = random.choice(colors)
            doc['__name'] = doc['name']
            doc['__last_name'] = nameparser.HumanName(doc['name']).last
            documents.append(doc)
    result = events_collection.insert_many(documents)
    print('Inserted %d events' % len(result.inserted_ids))
def _reorder_author_name(self, author_str, default_to_last_name):
    """
    Automatically detect first and last names in an author name string
    and reorder using the format: last name, first name

    Classification uses the ADS first/last name frequency lists
    (self.first_names / self.last_names, upper-cased) plus the
    self.regex_dash / self.regex_quote normalizers.
    `default_to_last_name` decides where unclassifiable middle tokens go.
    (Python 2 code: `except Exception, err`, `basestring`.)
    """
    author = nameparser.HumanName(author_str)
    # "Jr." misparsed as a first name: swap it back into the suffix slot.
    if author.first == u'Jr.' and author.suffix != '':
        author.first = author.suffix
        author.suffix = u'Jr.'
    if (author.middle):
        # Move middle names to first name if detected as so,
        # or move to last name if detected as so
        # or move to the default
        add_to_first = []
        add_to_last = []
        last_name_found = False
        middle_name_list = author.middle.split()
        try:
            for middle_name in middle_name_list:
                middle_name_length = len(
                    unicode_entities(middle_name).strip('.').strip('-')
                )  # Ignore '.' or '-' at the beginning/end of the string
                middle_name_upper = middle_name.upper()
                # First-name test: short token (likely an initial) not known
                # as a last name, OR known first name (raw / dash-stripped /
                # quote-stripped) that is not also a known last name.
                if (middle_name_length <= 2 and middle_name_upper not in self.last_names and "'" not in middle_name) \
                        or (middle_name_upper in self.first_names and middle_name_upper not in self.last_names) \
                        or (self.regex_dash.sub('', middle_name_upper) in self.first_names and self.regex_dash.sub('', middle_name_upper) not in self.last_names) \
                        or (self.regex_quote.sub('', middle_name_upper) in self.first_names and self.regex_quote.sub('', middle_name_upper) not in self.last_names):
                    # Case: First name found
                    # Middle name is found in the first names ADS
                    # list and not in the last names ADS list
                    if last_name_found:
                        # Move all previously detected first names to
                        # last name since we are in a situation where
                        # we detected:
                        #   F F L F
                        # hence we correct it to:
                        #   L L L F
                        # where F is first name and L is last name
                        add_to_first += add_to_last
                        add_to_last = []
                        last_name_found = False
                    add_to_first.append(middle_name)
                elif last_name_found or middle_name.upper() in self.last_names:
                    # Case: Last name found
                    add_to_last.append(middle_name)
                    last_name_found = True
                else:
                    # Case: Unknown
                    # Middle name not found in the first or last names ADS list
                    if default_to_last_name:
                        add_to_last.append(middle_name)
                        last_name_found = True
                    else:
                        add_to_first.append(middle_name)
        except Exception as e:
            logging.exception("Unexpected error in middle name parsing")
        # Fold the classified middle tokens into first/last and clear middle.
        author.first = [author.first] + add_to_first
        add_to_last.reverse()
        author.last = add_to_last + [author.last]
        author.middle = u''
    # Verify that no first names appear in the detected last name
    if (author.last):
        # NOTE(review): when author.last is a list (set above), this takes
        # the .split() branch, which would fail on a list — presumably the
        # branches are meant the other way around; confirm against tests.
        if isinstance(author.last, basestring):
            last_name_list = [author.last]
        else:
            last_name_list = author.last.split()
        # At this point we already know it has at least 1 last name and
        # we will not question that one (in the last position)
        verified_last_name_list = [last_name_list.pop()]
        last_name_list.reverse()
        try:
            for last_name in last_name_list:
                last_name_upper = last_name.upper()
                if last_name_upper in self.first_names and last_name_upper not in self.last_names:
                    # Known first name leaked into the last-name span.
                    author.first = [author.first, last_name]
                else:
                    verified_last_name_list.append(last_name)
        except Exception, err:
            logging.exception("Unexpected error in last name parsing")
        else:
            # Only commit the verified list when no error occurred.
            verified_last_name_list.reverse()
            author.last = verified_last_name_list
def main(args):
    """Create/update a proposal-review entry in the output YAML collection.

    Builds the review document skeleton keyed by
    "<yy><mm>_<last>_<first>" and syncs it into ../db/<OUTCOLLECTION>.yml.

    :param args: parsed CLI args (name, type, due_date, and optional
        title/requester/reviewer/status).
    :returns: the {key: pdoc} mapping that was synced.
    :raises ValueError: when args.status is not in ALLOWED_STATI.
    """
    file = Path.cwd().joinpath('..', 'db', "{}.yml".format(OUTCOLLECTION))
    name = nameparser.HumanName(args.name)
    month = dt.datetime.today().month
    year = dt.datetime.today().year
    key = "{}{}_{}_{}".format(
        str(year)[-2:], month_to_str_int(month), name.last.casefold(),
        name.first.casefold().strip("."))
    pdoc = {
        'adequacy_of_resources': [
            'The resources available to the PI seem adequate'],
        'agency': args.type,
        'competency_of_team': [],
        'doe_appropriateness_of_approach': [],
        'doe_reasonableness_of_budget': [],
        'doe_relevance_to_program_mission': [],
        'does_how': [],
        'does_what': '',
        'due_date': args.due_date,
        'freewrite': [],
        'goals': [],
        'importance': [],
        'institutions': [],
        'month': 'tbd',
        'names': name.full_name,
        'nsf_broader_impacts': [],
        'nsf_create_original_transformative': [],
        'nsf_plan_good': [],
        'nsf_pot_to_advance_knowledge': [],
        'nsf_pot_to_benefit_society': [],
        'status': 'accepted',
        'summary': '',
        'year': 2020
    }
    if args.title:
        pdoc.update({'title': args.title})
    else:
        pdoc.update({'title': ''})
    if args.requester:
        pdoc.update({'requester': args.requester})
    else:
        pdoc.update({'requester': ''})
    if args.reviewer:
        pdoc.update({'reviewer': args.reviewer})
    else:
        pdoc.update({'reviewer': 'sbillinge'})
    if args.status:
        if args.status not in ALLOWED_STATI:
            raise ValueError(
                "status should be one of {}".format(ALLOWED_STATI))
        else:
            pdoc.update({'status': args.status})
    else:
        # Fix: this branch previously did pdoc.update({'requester': ''}),
        # clobbering any requester set above (copy/paste bug — the sibling
        # db_updater implementation sets the default status here instead).
        pdoc.update({'status': 'accepted'})
    fullpdoc = {key: pdoc}
    sync_coll(file, fullpdoc)
    print("{} proposal has been added/updated in proposal reviews".format(
        args.name))
    return fullpdoc
def update_name_parts(self, fullname):
    """Parse `fullname` and store each name part on self (None when absent)."""
    parsed = nameparser.HumanName(fullname)
    for part in name_parts:
        setattr(self, part, getattr(parsed, part) or None)
def db_updater(self):
    """Insert a new proposal-review skeleton document into rc.coll.

    The document is keyed by "<yy><mm>_<last>_<first>" derived from rc.name
    and today's date; exits if an entry with that key already exists.
    """
    rc = self.rc
    name = nameparser.HumanName(rc.name)
    month = dt.datetime.today().month
    year = dt.datetime.today().year
    key = "{}{}_{}_{}".format(
        str(year)[-2:], month_to_str_int(month), name.last.casefold(),
        name.first.casefold().strip("."))
    coll = self.gtx[rc.coll]
    # Abort on duplicate _id rather than overwrite an existing review.
    pdocl = list(filter(lambda doc: doc["_id"] == key, coll))
    if len(pdocl) > 0:
        sys.exit("This entry appears to already exist in the collection")
    else:
        # Skeleton review document; list-valued fields are filled in later.
        pdoc = {}
        pdoc.update({'adequacy_of_resources': [
            'The resources available to the PI seem adequate'],
            'agency': rc.type,
            'competency_of_team': [],
            'doe_appropriateness_of_approach': [],
            'doe_reasonableness_of_budget': [],
            'doe_relevance_to_program_mission': [],
            'does_how': [],
            'does_what': '',
            'due_date': rc.due_date,
            'freewrite': [],
            'goals': [],
            'importance': [],
            'institutions': [],
            'month': 'tbd',
            'names': name.full_name,
            'nsf_broader_impacts': [],
            'nsf_create_original_transformative': [],
            'nsf_plan_good': [],
            'nsf_pot_to_advance_knowledge': [],
            'nsf_pot_to_benefit_society': [],
            'status': 'accepted',
            'summary': '',
            # NOTE(review): hard-coded 2020 while `year` above is current —
            # confirm whether this should be `year`.
            'year': 2020
        })
        if rc.title:
            pdoc.update({'title': rc.title})
        else:
            pdoc.update({'title': ''})
        if rc.requester:
            pdoc.update({'requester': rc.requester})
        else:
            pdoc.update({'requester': ''})
        if rc.reviewer:
            pdoc.update({'reviewer': rc.reviewer})
        else:
            pdoc.update({'reviewer': 'sbillinge'})
        if rc.status:
            if rc.status not in ALLOWED_STATI:
                raise ValueError(
                    "status should be one of {}".format(ALLOWED_STATI))
            else:
                pdoc.update({'status': rc.status})
        else:
            pdoc.update({'status': 'accepted'})
        pdoc.update({"_id": key})
        rc.client.insert_one(rc.database, rc.coll, pdoc)
        print("{} proposal has been added/updated in proposal reviews".format(
            rc.name))
        return
def xmlformatname(pname):
    """Render a presenter name through the XML template.

    Returns None when either the first or last name cannot be parsed out.
    """
    parsed = nameparser.HumanName(pname)
    if parsed.first and parsed.last:
        return presenter_template.format(FIRSTNAME=html_escape(parsed.first),
                                         LASTNAME=html_escape(parsed.last))
    return None
def db_updater(self):
    """Insert a new manuscript-review skeleton document into rc.coll.

    The _id key is "<yy><mm>_<last>_<first>" (or "<yy><mm>_<first>" for
    single-token names); exits if an entry with that key already exists.
    """
    rc = self.rc
    name = nameparser.HumanName(rc.name)
    month = dt.datetime.today().month
    year = dt.datetime.today().year
    # Single-token names have no last name; key on the first name only.
    if name.last == '':
        key = "{}{}_{}".format(
            str(year)[-2:], month_to_str_int(month),
            name.first.casefold().strip("."))
    else:
        key = "{}{}_{}_{}".format(
            str(year)[-2:], month_to_str_int(month), name.last.casefold(),
            name.first.casefold().strip("."))
    coll = self.gtx[rc.coll]
    # Abort on duplicate _id rather than overwrite an existing review.
    pdocl = list(filter(lambda doc: doc["_id"] == key, coll))
    if len(pdocl) > 0:
        sys.exit("This entry appears to already exist in the collection")
    else:
        # Skeleton review document; list-valued fields are filled in later.
        pdoc = {}
        pdoc.update({'claimed_found_what': [],
                     'claimed_why_important': [],
                     'did_how': [],
                     'did_what': [],
                     'due_date': rc.due_date,
                     'editor_eyes_only': '',
                     'final_assessment': [],
                     'freewrite': '',
                     'journal': rc.journal,
                     'recommendation': '',
                     'title': rc.title,
                     'validity_assessment': [],
                     'year': year
                     })
        if rc.reviewer:
            pdoc.update({'reviewer': rc.reviewer})
        else:
            # Fall back to the configured default user id, if any.
            try:
                rc.reviewer = rc.default_user_id
                pdoc.update({'reviewer': rc.reviewer})
            except AttributeError:
                print(
                    "Please set default_user_id in '~/.config/regolith/user.json', or you need to enter your group id "
                    "in the command line")
                return
        if rc.submitted_date:
            pdoc.update({'submitted_date': rc.submitted_date})
        else:
            pdoc.update({'submitted_date': 'tbd'})
        if rc.name:
            if name.last == '':
                pdoc.update({'first_author_last_name': name.first})
            else:
                pdoc.update({'first_author_last_name': name.last})
        if rc.requester:
            pdoc.update({'requester': rc.requester})
        else:
            pdoc.update({'requester': ''})
        if rc.status:
            if rc.status not in ALLOWED_STATI:
                raise ValueError(
                    "status should be one of {}".format(ALLOWED_STATI))
            else:
                pdoc.update({'status': rc.status})
        else:
            pdoc.update({'status': 'accepted'})
        pdoc.update({"_id": key})
        rc.client.insert_one(rc.database, rc.coll, pdoc)
        print("{} manuscript has been added/updated in manuscript reviews".format(
            rc.name))
        return
def parseCaption(caption):
    # given a caption, return a list of names of people in the caption
    # if no names found, or caption not about people, return None
    #
    # Pipeline (Python 2 code): trim the caption at AT/IN phrases, split it
    # into chunks on the module-level regexes, clean titles, split on
    # "with"/"and", then drop chunks that don't look like capitalized names.
    debug = False
    flag = False
    rejects = []
    if debug:
        print " "
        print caption
    names = []
    chunks = []
    caplen = len(caption)
    if (caplen > 1) & (caplen < 250):  # ignore short/long descriptions
        ## split on 'AT' and 'IN'
        # throw away everything after "so-and-so AT the ball"
        if findAt.search(caption):
            rejects.append(caption[findAt.search(caption).start():])
            caption = caption[:findAt.search(caption).start()].strip()
        if ' in ' in caption:
            tmp = caption.split(' in ')
            rejects.append(tmp[1])
            caption = tmp[0]
        if debug:
            print "After at/in : ", caption
        ### skip if less than 4 words
        if len(caption.split()) < 4:
            if debug:
                print " nWords < 4"
            return None
        # each chunk is a set of words
        chunks = fpunkt.split(caption)
        ### fix Jr. / Sr. / Dr.
        # NOTE(review): the Sr./Jr. loops below rebind the loop variable
        # `chunk` only, so their edits are lost (author acknowledges this
        # in the "only the Dr. one works" comment).
        for ind, chunk in enumerate(chunks):
            if fsr.search(chunk):
                print chunk
                spl = fjr.split(chunk)
                chunk = ''.join(spl).strip()
                print chunk
        for ind, chunk in enumerate(chunks):
            if fjr.search(chunk):
                # print
                print chunk
                spl = fjr.split(chunk)
                chunk = ''.join(spl).strip()
                print chunk
        for ind, chunk in enumerate(chunks):
            if fdr.search(chunk):
                print
                print "Dr. Sub"
                print chunk
                spl = fdr.split(chunk)
                chunks[ind] = ''.join(spl).strip()
                print chunks
        ## ^^ only the Dr. one works, because of Python's scoping.
        ## Could fix the others if needed
        if debug:
            print chunks
        ### split at "Bob WITH Kate"
        for ind, chunk in enumerate(chunks):
            # if 'with' in chunk:
            if fwith.search(chunk):
                pieces = chunk.split(' with ')
                chunks[ind] = pieces[0]
                chunks.insert(ind + 1, pieces[1])
        ### handling 'AND' ###
        # checks first word of split (implies a list, and Bob)
        for ind, chunk in enumerate(chunks):
            if fand.match(chunk):  # this re. defined above
                chunks[ind] = chunks[ind][5:]  # everything after the "and "
                chunk = chunks[ind]
            #print "post-fand chunks : ", chunks
            # separating "Husb and Wife Smith", etc
            if ' and ' in chunk:
                #print "found AND "
                temp = chunk.split(' and ')
                name1 = temp[0].strip()
                name2 = temp[1].strip()
                human1 = parser.HumanName(name1)
                human2 = parser.HumanName(name2)
                if debug:
                    print "ind = ", ind, "; temp = ", temp
                    print "name1 = ", name1, "; name2 = ", name2
                    print "human1 = ", human1
                    print "human2 = ", human2
                # if this was of the form "Husband and Wife Smith"
                if not human1.last:
                    human1 = temp[0].strip() + ' ' + human2.last
                else:
                    human1 = name1
                chunks[ind] = human1
                chunks.insert(ind + 1, temp[-1])
        ### check for capitalized words to see if this is names ###
        cutList = []
        for ind, chunk in enumerate(chunks):
            words = chunk.split()
            nWords = len(words)
            if nWords:
                # check ratio of caps to not
                nCaps = sum(map(str.isupper, str(chunk)))
                # ratio = float(nCaps)/nWords
                # Drop chunks with more than one word lacking a capital.
                if (nWords - nCaps) > 1:
                    cutList.append(chunk)
                    if debug:
                        print "no caps"
        rejects.append(cutList)
        for cut in cutList:
            chunks.remove(cut)
        if debug:
            print
            print "after capitals :"
            print " reject : ", rejects
            print " keep :", chunks
            dan.danpause()
        ### cut chunks with 'The'
        cutList = []
        for chunk in chunks:
            if fthe.search(chunk):
                # print chunks
                # should probably check if it's already there
                if chunk not in cutList:
                    cutList.append(chunk)
        rejects.append(cutList)
        for cut in cutList:
            chunks.remove(cut)
        if debug:
            print ' cutting "the" : ', chunks
        ### upon exit ###
        if len(chunks) > 1:  # need more than one person
            for chunk in chunks:
                chunk = chunk.strip()
                if len(chunk.split()) > 1:
                    # make sure it doesn't say "friend"
                    if not ffrnd.search(chunk):
                        # strip whitespace, condense multispaces
                        names.append(re.sub('\s+', ' ', chunk.strip()))
                    else:
                        print "rej : ", chunk
                        rejects.append(chunk)
            # print names
            return names
        else:
            return None