def __check_if_japanese(self, text, syllables):
    """Return the person names found in *text* whose last AND first name
    parts can both be segmented with the given syllable inventory."""
    matches = []
    for candidate in self.__get_human_names(text):
        parsed = HumanName(candidate)
        # Short-circuit: the first name is only checked when the last name
        # already segments cleanly.
        if self.__word_break(parsed.last.lower(), syllables) and \
                self.__word_break(parsed.first.lower(), syllables):
            matches.append(candidate)
    return matches
def __init__(self, account, name, role, start_date, end_date, hourly_rate, utilization, client, office=None):
    """Build a freelancer record; *name* is lower-cased and split into
    first/middle/last via HumanName. New records are not yet converted
    to full-time."""
    parsed = HumanName(name.lower())
    self.first_name = parsed.first
    self.middle_name = parsed.middle
    self.last_name = parsed.last
    self.account = account
    self.role = role
    self.client = client
    self.office = office
    self.start_date = start_date
    self.end_date = end_date
    self.hourly_rate = hourly_rate
    self.utilization = utilization
    self.converted_fulltime = False
def impute_names_model(name):
    """Split *name* into the model-style name fields.

    Returns a dict with keys ``given_name``, ``middle_names``,
    ``family_name`` and ``suffix``.
    """
    parsed = HumanName(name)
    return dict(
        given_name=parsed.first,
        middle_names=parsed.middle,
        family_name=parsed.last,
        suffix=parsed.suffix,
    )
def impute_names(name):
    """Split *name* into short-form name fields.

    Returns a dict with keys ``given``, ``middle``, ``family`` and
    ``suffix``.
    """
    parsed = HumanName(name)
    return dict(
        given=parsed.first,
        middle=parsed.middle,
        family=parsed.last,
        suffix=parsed.suffix,
    )
def get_human_names(text):
    """Extract "<first> <last>" strings for every PERSON entity spaCy
    finds in *text*.

    Internal spaces are stripped from each name part. NOTE: the spaCy
    model is (re)loaded on every call, matching the original behaviour;
    hoist the load if this runs in a loop.
    """
    nlp = spacy.load('en')
    doc = nlp(text)
    person_list = [str(ent).replace('\n', '')
                   for ent in doc.ents if ent.label_ == 'PERSON']
    result = []
    for name in person_list:
        try:
            # Parse once instead of twice per name.
            parsed = HumanName(name)
            first_last = str(parsed.first).replace(' ', '') + ' ' + \
                str(parsed.last).replace(' ', '')
        except Exception as exc:
            # Was a bare `except: print('error')` — keep best-effort
            # behaviour but say what actually failed.
            print('error parsing name %r: %s' % (name, exc))
        else:
            result.append(first_last)
    return result
def findGossip(self, tweets, keywords, hashtag):
    '''
    From a collection of tweets, apply the logic to find duplicates and
    refine text (commented out for now)
    '''
    self.tweets = tweets
    self.hits = {}
    self.logger.info("Fetching latest gossip from collection of tweets")
    # NOTE(review): the incoming *keywords* argument is discarded here, and
    # the keyword filter below is commented out, so every tweet passes through.
    keywords = []  # ['bid','reports','according','sign']
    # Team name after the leading '#'; currently unused because the
    # player-removal step below is commented out.
    team = hashtag.split('#')[1]
    refined_text = self.tweets  # self.removeTeamPlayers(team)
    keyword_tweets = refined_text  # [tweet for tweet in refined_text for keyword in keywords if keyword in tweet]
    tweets = self.removeDuplicates(keyword_tweets)
    players = []
    for match in tweets:
        hits = self.findTargets(match)
        for name in hits:
            # Normalise each detected name to "Last, First".
            last_first = HumanName(name).last + ', ' + HumanName(
                name).first
            players.append(last_first)
    # findDuplicates appears to return a mapping of player -> occurrences;
    # .iteritems() means this code targets Python 2 — TODO confirm.
    players = self.findDuplicates(players)
    self.hits = {player: len(hit) for player, hit in players.iteritems()}
    self.logger.info('Things gossiped about {}'.format(self.hits))
def preProcess(names):
    """Mutate the *names* dataframe in place: add a parsed ``fullName``
    column and convert the two-digit-year ``dob`` column to datetimes.

    Two-digit years below 18 are pivoted to 20xx, the rest to 19xx
    (assumes no entry is more than ~100 years old).
    """
    # Parse "<first> <last>" into HumanName objects, one per row.
    parsed = [HumanName(row.fn + ' ' + row.ln) for _, row in names.iterrows()]
    names['fullName'] = parsed

    # Expand the trailing two-digit year before handing off to pandas.
    expanded = []
    for raw in names.dob:
        two_digit = int(raw[-2:])
        century = 2000 if two_digit < 18 else 1900
        expanded.append(str(raw[:-2]) + str(century + two_digit))
    names.dob = expanded
    names.dob = pd.to_datetime(names.dob)
def __init__(self, account, name, email, office, role, salary, start_date):
    """Create an active employee with no permissions granted.

    *name* is lower-cased and split into first/middle/last parts, the
    department is inherited from *role* when it has one, and the initial
    role/salary pair is recorded as the first job-history entry.
    """
    self.account = account
    parsed_name = HumanName(name.lower())
    self.first_name = parsed_name.first
    self.middle_name = parsed_name.middle
    self.last_name = parsed_name.last
    self.email = email.lower()
    # New users start with no administrative or reporting permissions.
    self.is_administrator = False
    self.is_hr_administrator = False
    self.employee_assignment_access = False
    self.permissions_global_pipeline = False
    self.permissions_global_financials = False
    self.permissions_global_utilization = False
    self.office = office
    self.role = role
    self.salary = salary
    self.is_active = True
    self.created_at = db_utc_now()
    self.percent_billable = 100
    # Inherit the department from the role when it defines one.
    if role.department is not None:
        self.department = role.department
    else:
        self.department = None
    self.employee_number = 0
    self.end_date = None
    # Fix: start_date was first assigned db_utc_now() and then immediately
    # overwritten with the parameter; the dead default is removed.
    self.start_date = start_date
    jh = JobHistoryEntry(self, role, salary)
    self.job_history.append(jh)
def get_human_names(text):
    """Return the first names of PERSON entities NLTK chunks in *text*.

    Single-token PERSON chunks (bare surnames) are ignored; duplicate
    full names are collapsed.
    """
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    chunked = nltk.ne_chunk(pos, binary=False)
    names = []
    person = []
    name = ""
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leaf in subtree.leaves():
            person.append(leaf[0])
        if len(person) > 1:  # ignore surnames only
            for part in person:
                name += part + ' '
            if name[:-1] not in names:
                names.append(name[:-1])
            name = ''
        person = []
    # Fix: dropped the leftover debug `print(names)` — a library helper
    # should not write to stdout.
    return [HumanName(full).first for full in names]
def person_edit(request):
    """Pyramid view: render (GET) or apply (POST) the employee edit form.

    Requires an admin or HR-admin session user; any failure falls through
    to the broad except at the bottom and redirects home. Python 2 code
    (uses ``long``).
    """
    try:
        account_id = long(request.session['aid'])
        user_id = long(request.session['uid'])
        user = DBSession.query(User).filter_by(id=user_id).first()
        account = DBSession.query(Account).filter_by(id=account_id).first()
        # Only administrators / HR administrators may edit people.
        if user is None or account is None or (user.is_administrator == False and user.is_hr_administrator == False):
            return HTTPFound(request.application_url)
        person_id = long(request.matchdict['person_id'])
        person = DBSession.query(User).filter_by(id=person_id).first()
        if request.method == "GET":
            try:
                source = request.params["source"]
            except:
                source = 'administration'
        if request.method == "POST":
            # --- Pull the scalar form fields ---------------------------------
            name = request.params["name"].lower()
            employee_number = request.POST.get('employee_number')
            source = request.params["source"]
            email = request.params["email"].lower()
            salary = long(request.params["salary"])
            is_raise = request.POST.get('raise')
            change_allocation = request.POST.get('change_allocation')
            office_id = request.POST.get('office_id')
            if office_id == '':
                office = None
            else:
                office_id = long(office_id)
                office = DBSession.query(Office).filter_by(id=office_id).filter_by(account_id=account_id).first()
            role_id = long(request.params["role_id"])
            role = DBSession.query(Role).filter_by(id=role_id).filter_by(account_id=account_id).first()
            percent_billable = long(request.params["percent_billable"])
            # Dates arrive as "MM/DD/YYYY"; empty start date means "now".
            start_date_text = request.POST.get('start_date')
            if start_date_text == '':
                start_date = datetime.datetime.now()
            else:
                start_dateparts = start_date_text.split("/")
                start_date = datetime.date(long(start_dateparts[2]), long(start_dateparts[0]), long(start_dateparts[1]))
            end_date_text = request.POST.get('end_date')
            if end_date_text == '':
                end_date = None
            else:
                end_dateparts = end_date_text.split("/")
                end_date = datetime.date(long(end_dateparts[2]), long(end_dateparts[0]), long(end_dateparts[1]))
            currency_id = request.POST.get('currency_id')
            if currency_id != '':
                currency_id = long(currency_id)
                currency = DBSession.query(Currency).filter_by(id=currency_id).first()
            else:
                currency = None
            # Checkbox-style flags arrive as '0'/'1' strings.
            is_a = long(request.POST.get('is_administrator', '0'))
            if is_a == 1:
                is_administrator = True
            else:
                is_administrator = False
            is_h_a = long(request.POST.get('is_hr_administrator', '0'))
            if is_h_a == 1:
                is_hr_administrator = True
            else:
                is_hr_administrator = False
            is_e_a = long(request.POST.get('employee_assignment_access', '0'))
            if is_e_a == 1:
                employee_assignment_access = True
            else:
                employee_assignment_access = False
            # Only save when the email is unused or belongs to this person.
            u = DBSession.query(User).filter_by(email=email).first()
            if u is None or u.id == person_id:
                # A selection of "all" grants the global flag instead of
                # per-office entries.
                permissions_office_financials = request.params.getall("permissions_office_financials")
                permissions_global_financials = False
                for office_id in permissions_office_financials:
                    if office_id == "all":
                        permissions_global_financials = True
                        break
                permissions_office_utilization = request.params.getall("permissions_office_utilization")
                permissions_global_utilization = False
                for office_id in permissions_office_utilization:
                    if office_id == "all":
                        permissions_global_utilization = True
                        break
                permissions_office_pipeline = request.params.getall("permissions_office_pipeline")
                permissions_global_pipeline = False
                for office_id in permissions_office_pipeline:
                    if office_id == "all":
                        permissions_global_pipeline = True
                        break
                # --- Apply the scalar edits to the person -------------------
                parsed_name = HumanName(name.lower())
                person.first_name = parsed_name.first
                person.middle_name = parsed_name.middle
                person.last_name = parsed_name.last
                person.employee_number = employee_number
                person.email = email.lower()
                person.salary = salary
                person.office = office
                person.role = role
                person.percent_billable = percent_billable
                person.start_date = start_date
                person.end_date = end_date
                person.currency = currency
                person.is_administrator = is_administrator
                person.is_hr_administrator = is_hr_administrator
                person.employee_assignment_access = employee_assignment_access
                person.permissions_global_financials = permissions_global_financials
                person.permissions_global_utilization = permissions_global_utilization
                person.permissions_global_pipeline = permissions_global_pipeline
                # --- Rebuild per-office permission lists (skipped when the
                # corresponding global flag is set) ---------------------------
                person.permissions_office_financials = []
                if person.permissions_global_financials == False:
                    for office_id in permissions_office_financials:
                        office = DBSession.query(Office).filter_by(account_id=account_id).filter_by(
                            id=office_id).first()
                        if office is not None:
                            person.permissions_office_financials.append(office)
                person.permissions_office_utilization = []
                if person.permissions_global_utilization == False:
                    for office_id in permissions_office_utilization:
                        office = DBSession.query(Office).filter_by(account_id=account_id).filter_by(
                            id=office_id).first()
                        if office is not None:
                            person.permissions_office_utilization.append(office)
                person.permissions_office_pipeline = []
                if person.permissions_global_pipeline == False:
                    for office_id in permissions_office_pipeline:
                        office = DBSession.query(Office).filter_by(account_id=account_id).filter_by(
                            id=office_id).first()
                        if office is not None:
                            person.permissions_office_pipeline.append(office)
                # --- Rebuild per-client permission lists ---------------------
                person.permissions_client_financials = []
                permissions_client_financials = request.params.getall("permissions_client_financials")
                for client_id in permissions_client_financials:
                    client = DBSession.query(Client).filter_by(account_id=account_id).filter_by(id=client_id).first()
                    if client is not None:
                        person.permissions_client_financials.append(client)
                person.permissions_client_utilization = []
                permissions_client_utilization = request.params.getall("permissions_client_utilization")
                for client_id in permissions_client_utilization:
                    client = DBSession.query(Client).filter_by(account_id=account_id).filter_by(id=client_id).first()
                    if client is not None:
                        person.permissions_client_utilization.append(client)
                person.permissions_client_pipeline = []
                permissions_client_pipeline = request.params.getall("permissions_client_pipeline")
                for client_id in permissions_client_pipeline:
                    client = DBSession.query(Client).filter_by(account_id=account_id).filter_by(id=client_id).first()
                    if client is not None:
                        person.permissions_client_pipeline.append(client)
                # --- Rebuild per-department permission lists -----------------
                person.permissions_department_financials = []
                permissions_department_financials = request.params.getall("permissions_department_financials")
                for department_id in permissions_department_financials:
                    # NOTE(review): this query filters by the stale
                    # `client_id` left over from the loops above instead of
                    # `department_id` — almost certainly a copy-paste bug.
                    department = DBSession.query(Department).filter_by(account_id=account_id).filter_by(
                        id=client_id).first()
                    if department is not None:
                        person.permissions_department_financials.append(department)
                person.permissions_department_utilization = []
                # NOTE(review): this reads the *client* utilization form field
                # ("permissions_client_utilization") and then iterates the
                # earlier client list, so department utilization is built from
                # client ids — also looks like a copy-paste bug; the fetched
                # `permissions_department_utilization` list is never used.
                permissions_department_utilization = request.params.getall("permissions_client_utilization")
                for department_id in permissions_client_utilization:
                    department = DBSession.query(Department).filter_by(account_id=account_id).filter_by(
                        id=department_id).first()
                    if department is not None:
                        person.permissions_department_utilization.append(department)
                # --- Salary history: new row on a raise/re-allocation (unless
                # one already exists for today), otherwise edit the latest ----
                s = DBSession.query(Salary).filter_by(user_id=person.id).order_by(Salary.start_date.desc()).first()
                if (is_raise is not None or change_allocation is not None) and (datetime.datetime.now().date() != s.start_date.date()):
                    s = Salary(person, salary, role.id, datetime.datetime.now(), percent_billable)
                    DBSession.add(s)
                else:
                    s.salary = salary
                    s.role_id = int(role.id)
                    s.percent_billable = percent_billable
                DBSession.flush()
            # Redirect back to where the edit was initiated from.
            if source == "reviews":
                return HTTPFound(request.application_url + "/people/all/all/all")
            else:
                return HTTPFound(request.application_url + "/administration/employees")
        # GET: gather the option lists for the edit form template.
        departments = DBSession.query(Department).filter_by(account_id=account_id).all()
        offices = DBSession.query(Office).filter_by(account_id=account_id).all()
        clients = DBSession.query(Client).filter_by(account_id=account_id).all()
        roles = DBSession.query(Role).filter_by(account_id=account_id).all()
        currencies = DBSession.query(Currency).filter_by(account_id=account_id).all()
        return dict(logged_in=authenticated_userid(request), header=Header(source),
                    departments=departments, offices=offices, clients=clients, roles=roles,
                    user=user, person=person, currencies=currencies)
    except:
        traceback.print_exc()
        return HTTPFound(request.application_url)
print(url) #print() print(e) From_KnownNames.clear() TEXT.clear() counter = counter + 1 topStoriesDict[key] = current try: ####################################### Beautiful Soup ###################################################### theStorySoFar = urlToString(url) ####################################### NLTK ########################################## names = get_human_names(theStorySoFar) nameListNLTK = [] for name in names: firname = str(HumanName(name).first) laname = str(HumanName(name).last) try: alphabetize(firname) alphabetize(laname) except Exception as e: blankvar = 0 #print() #print("Either the first or last name is missing, so alphabetize failed. No big deal") if name not in current.author: nameListNLTK.append(name) thisArticlesSourcesDict["HTML_BS"]["NLTK"] = nameListNLTK #print("NLTKS are: ", nameListNLTK) current.sources = thisArticlesSourcesDict
def last_and_first_name(name_list1, name_list2):
    """Split two lists of raw name strings into (first, last) parts.

    Side effect: writes every (first, last) pair from both lists to
    ``output.csv``. Returns ``(firsts1, lasts1, firsts2, lasts2)``.
    """
    def _split_name(name):
        # Purpose: return (first, last) for one raw name, handling the
        # parser quirk where an abbreviated first name lands in the suffix.
        parsed = HumanName(name)  # parse once instead of up to 4 times
        if "." in parsed.suffix:
            na = parsed.suffix.split(',')[1]
            return na.split()[0], parsed.first
        return parsed.first, parsed.last

    # Fix: the original duplicated this loop verbatim for both lists.
    name_list1f, name_list1l = [], []
    for name in name_list1:
        first, last = _split_name(name)
        name_list1f.append(first)
        name_list1l.append(last)
    name_list2f, name_list2l = [], []
    for name in name_list2:
        first, last = _split_name(name)
        name_list2f.append(first)
        name_list2l.append(last)

    name_listf = name_list1f + name_list2f
    name_listl = name_list1l + name_list2l
    with open('output.csv', 'w') as csvfile:
        fieldnames = ['first_name', 'last_name']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in zip(name_listf, name_listl):
            writer.writerow({'first_name': row[0], 'last_name': row[1]})
    return (name_list1f, name_list1l, name_list2f, name_list2l)
def process(self, statement):
    # Question process function: tokenizes the statement, extracts topic
    # ("field") and time ("temporal") keywords plus person/class names, then
    # walks an informal decision tree to build a Statement response with a
    # confidence of 1 (answered) or 0 (not understood).
    statement = str(statement).lower()
    fields = [
        'classes', 'class', 'deliverables', 'practicals', 'theoreticals',
        'defense', 'evaluation', 'exame', 'aula'
    ]
    temporals = ['today', 'tomorrow', 'next']
    field = []
    temporal = []
    statem = ''
    statetokens = word_tokenize(str(statement))
    names = self.get_human_names(str(statement))
    classes = self.get_classes_names(str(statement))
    # remove capitalizations
    for i in range(len(statetokens)):
        statetokens[i] = statetokens[i].lower()
    # Portuguese converter: collapse class/aula synonyms into 'Aula'.
    for token in statetokens:
        if (token in fields):
            if (token == 'classes' or token == 'class' or token == 'aula'):
                field.append('Aula')
            else:
                field.append(token)
        if (token in temporals):
            temporal.append(token)
        if (self.is_date(token)):
            # NOTE(review): `dateparse` is computed but never used here.
            dateparse = parse(token)
            temporal.append(token)
    print(field)
    print(temporal)
    # Need at least one topic and one time reference to answer.
    if ((len(field) == 0) or (len(temporal) == 0)):
        response = Statement('Sorry didnt understand')
        response.confidence = 0
        return response
    # Detected person first-names are treated as additional topics.
    for name in names:
        last_first = HumanName(name).first
        field.append(last_first)
    saved = []
    k = 0
    INFORMATIONGIVEN = 1
    #-------------------------------------------------MultipleInputs----------------------------------------------------------------------------
    # Section for understanding if there are more than one intention
    # Based on a if else tree based on the detection of intents
    if ((len(field) > 1) or (len(temporal) > 1)):
        if (('or' in statetokens) or ('and' in statetokens)):
            # Union semantics: answer every field x temporal combination.
            for i in range(0, len(field)):
                for j in range(0, len(temporal)):
                    temporalsend = ''.join(temporal[j])
                    fieldsend = ''.join(field[i])
                    statem = statem + self.make_statement_from_select(
                        self.get_specific_classes(fieldsend, temporalsend),
                        INFORMATIONGIVEN)
            statem = 'You have:' + statem
            response = Statement(statem)
            response.confidence = 1
            return response
        elif (('with' in statetokens)):
            # Intersection semantics: keep only events common to all
            # field x temporal queries (first query seeds `saved`).
            for i in range(0, len(field)):
                for j in range(0, len(temporal)):
                    temporalsend = ''.join(temporal[j])
                    fieldsend = ''.join(field[i])
                    help = self.get_specific_classes(
                        fieldsend, temporalsend)
                    rets = []
                    if k == 0:
                        saved = help
                        k = k + 1
                    else:
                        for term in help:
                            if (term in saved):
                                rets.append(term)
            statem = 'You have:' + self.make_statement_from_select(
                rets, INFORMATIONGIVEN)
            response = Statement(statem)
            response.confidence = 1
            return response
        else:
            # No conjunction: fall back to named-class lookups if any class
            # names were recognised.
            if (len(classes) > 0):
                for j in range(0, len(temporal)):
                    for i in range(0, len(classes)):
                        temporalsend = ''.join(temporal[j])
                        fieldsend = ''.join(classes[i])
                        fetchedevents = self.get_named_classes(
                            fieldsend, temporalsend)
                        statem = statem + self.statement_parsing(
                            fetchedevents)
                response = Statement(statem)
                response.confidence = 1
                return response
            else:
                response = Statement(
                    'Im not understanding your question very much, please reformulate it'
                )
                response.confidence = 0
                return response
    #-------------------------------------------------------------------------------------------------------------------------------------------
    #----------------------------------------------Single Inputs, need better modularization----------------------------------------------------
    else:
        # Informal decision tree creation
        if ('Aula' in field):
            if ('next' in temporal):
                statem = self.make_statement_from_select(
                    self.get_next_classes(), INFORMATIONGIVEN)
                statem = 'Next, you have:' + statem
                response = Statement(statem)
                response.confidence = 1
                return response
            else:
                dateformated = ''.join(temporal)
                datefind = parse(dateformated)
                fetchedevents = self.get_specifictime_classes(datefind)
                statem = self.statement_parsing(fetchedevents)
                response = Statement(statem)
                response.confidence = 1
                return response
        if ('deliverables' in field):
            dateformated = ''.join(temporal)
            fetchedevents = self.get_specific_classes(field, dateformated)
            statem = statem + self.statement_parsing(fetchedevents)
            response = Statement(statem)
            response.confidence = 1
            return response
        # NOTE(review): the tokens collected above come from `fields`, which
        # contains 'practicals'/'theoreticals' — the singular checks below
        # can therefore never match and look like dead branches; verify.
        if ('practical' in field):
            dateformated = ''.join(temporal)
            fetchedevents = self.get_specific_classes(field, dateformated)
            statem = statem + self.statement_parsing(fetchedevents)
            response = Statement(statem)
            response.confidence = 1
            return response
        if ('theoretical' in field):
            dateformated = ''.join(temporal)
            fetchedevents = self.get_specific_classes(field, dateformated)
            statem = statem + self.statement_parsing(fetchedevents)
            response = Statement(statem)
            response.confidence = 1
            return response
        if ('defense' in field):
            dateformated = ''.join(temporal)
            fetchedevents = self.get_specific_classes(field, dateformated)
            statem = statem + self.statement_parsing(fetchedevents)
            response = Statement(statem)
            response.confidence = 1
            return response
        if ('evaluation' in field):
            dateformated = ''.join(temporal)
            fetchedevents = self.get_specific_classes(field, dateformated)
            statem = statem + self.statement_parsing(fetchedevents)
            response = Statement(statem)
            response.confidence = 1
            return response
        if ('exame' in field):
            dateformated = ''.join(temporal)
            fetchedevents = self.get_specific_classes(field, dateformated)
            statem = statem + self.statement_parsing(fetchedevents)
            response = Statement(statem)
            response.confidence = 1
            return response
        response = Statement(
            'Im dont understand very much, please reformulate the question friend'
        )
        response.confidence = 0
        return response
# Fragment of a larger article-processing loop: the enclosing loop is not
# visible in this chunk.
TEXT.append(arrowsplitend[0])
fulltext = ' '.join(TEXT)
# print(counter,",",fulltext)
# print("Something Happened...is Good.")
text = fulltext
names = get_human_names(text)
nameCountiterate = 0
#####
## print("Names being added from", key, " are: ")
## for name in names:
##     print(name)
#####
for name in names:
    # NOTE(review): nameCountiterate is never incremented, so every entry
    # is labelled "source0" — looks like a bug; verify intent.
    sourceNumber = ("source" + str(nameCountiterate))
    firname = str(HumanName(name).first)
    laname = str(HumanName(name).last)
    try:
        alphabetize(firname)
        alphabetize(laname)
    except Exception as e:
        # Best-effort: a missing first or last name makes alphabetize fail,
        # which is acceptable here.
        blankvar = 0
        # print()
        # print("Either the first or last name is missing, so alphabetize failed. No big deal")
        # print(e)
    thisArticlesSourcesDict["NLTK"][name] = {
        "sourceNumber": sourceNumber,
        "firstName": HumanName(name).first,
        "lastName": HumanName(name).last
    }
current.sources = thisArticlesSourcesDict
def get_name(self):
    """Parse ``self.full_name`` and return it as a capitalised HumanName."""
    parsed = HumanName(self.full_name)
    # HumanName.capitalize mutates the object in place.
    parsed.capitalize()
    return parsed
def freelancer_edit(request):
    """Pyramid view: render (GET) or apply (POST) the freelancer edit form.

    Requires a valid session user and account; any failure falls through to
    the broad except at the bottom and redirects home. Python 2 code
    (uses ``long``).
    """
    try:
        freelancer_id = long(request.matchdict['freelancer_id'])
        account_id = long(request.session['aid'])
        user_id = long(request.session['uid'])
        user = DBSession.query(User).filter_by(id=user_id).first()
        account = DBSession.query(Account).filter_by(id=account_id).first()
        if user is None or account is None:
            return HTTPFound(request.application_url)
        freelancer = DBSession.query(Freelancer).filter_by(account_id=account_id).filter_by(id=freelancer_id).first()
        if freelancer is None:
            return HTTPFound(request.application_url)
        if request.method == "POST":
            name = request.params["name"].lower()
            role_id = long(request.params["role_id"])
            client_id = request.params.get('client_id')
            office_id = request.params.get('office_id')
            # A freelancer must be attached to a client or an office (or both).
            client = None
            if client_id is not None and client_id != '' and len(client_id) > 0:
                client = DBSession.query(Client).filter_by(account_id=account_id).filter_by(id=long(client_id)).first()
            office = None
            if office_id is not None and office_id != '' and len(office_id) > 0:
                office = DBSession.query(Office).filter_by(account_id=account_id).filter_by(id=long(office_id)).first()
            if client is None and office is None:
                return HTTPFound(request.application_url)
            utilization = long(request.params["utilization"])
            # Convert the entered rate to USD when the user has a currency.
            hourly_rate_local = long(request.params["hourly_rate"])
            if user.currency is None:
                hourly_rate = hourly_rate_local
            else:
                hourly_rate = hourly_rate_local * user.currency.currency_to_usd
            # Dates arrive as "MM/DD/YYYY".
            start_date_text = request.params["start_date"]
            start_dateparts = start_date_text.split("/")
            start_date = datetime.date(long(start_dateparts[2]), long(start_dateparts[0]), long(start_dateparts[1]))
            end_date_text = request.params["end_date"]
            end_dateparts = end_date_text.split("/")
            end_date = datetime.date(long(end_dateparts[2]), long(end_dateparts[0]), long(end_dateparts[1]))
            role = DBSession.query(Role).filter_by(id=role_id).filter_by(account_id=account_id).first()
            # NOTE(review): `and` binds tighter than `or`, so this reads as
            # `role is None or (office is not None and not can_access_office)`
            # — likely intended, but worth confirming.
            if role is None or office is not None and user.can_access_office(office, "utilization") == False:
                return HTTPFound(request.application_url)
            if client is not None and user.can_access_client(client, "utilization") == False:
                return HTTPFound(request.application_url)
            parsed_name = HumanName(name)
            freelancer.first_name = parsed_name.first
            freelancer.middle_name = parsed_name.middle
            freelancer.last_name = parsed_name.last
            freelancer.office = office
            freelancer.client = client
            freelancer.start_date = start_date
            freelancer.end_date = end_date
            freelancer.utilization = utilization
            freelancer.hourly_rate = hourly_rate
            DBSession.flush()
            # Redirect to the utilization page of the client (preferred) or
            # office the freelancer was attached to.
            if client is not None:
                return HTTPFound(request.application_url + "/client/" + str(client_id) + "/utilization/" + str(
                    datetime.datetime.now().year))
            if office is not None:
                return HTTPFound(request.application_url + "/office/" + str(office_id) + "/utilization/" + str(
                    datetime.datetime.now().year))
        # GET: build the client/office option lists the user may access.
        clients_all = DBSession.query(Client).filter_by(account_id=account_id).all()
        clients = []
        if user.is_administrator or user.permissions_global_utilization:
            clients = clients_all
        else:
            for client in clients_all:
                if user.can_access_client(client, "utilization"):
                    clients.append(client)
        if len(clients) == 0:
            print("************* no c")
            return HTTPFound(request.application_url)
        offices_all = DBSession.query(Office).filter_by(account_id=account_id).all()
        offices = []
        if user.is_administrator or user.permissions_global_utilization:
            offices = offices_all
        else:
            for office in offices_all:
                if user.can_access_office(office, "utilization"):
                    offices.append(office)
        if len(offices) == 0:
            print("************* no o")
            return HTTPFound(request.application_url)
        roles = DBSession.query(Role).filter_by(account_id=long(request.session['aid'])).all()
        return dict(logged_in=authenticated_userid(request), header=Header('financials'),
                    clients=clients, offices=offices, roles=roles, freelancer=freelancer,
                    user=user, account=account)
    except:
        print("*************")
        traceback.print_exc()
        return HTTPFound(request.application_url)
def namethatname(self, results, letter, text, variable, filename):
    # Count how often detected employee names appear in a sentiment-filtered
    # subset of *text* and write the counts to *results*. `variable` selects
    # the subset: 1=NEGATIVE lines, 2=POSITIVE, 3=NEUTRAL/MIXED, 4=all.
    # Names listed in the "Not Names.txt" exclusion file (on drive *letter*)
    # are filtered out.
    results.write("\n----------------------------------------------")
    results.write("\nMost common employee names:\n ")
    name = []

    def wordCount(self, text, word):
        # Purpose: case-insensitive count of lines in *text* containing *word*.
        count = 0
        for t in text:
            if word.upper() in t.upper():
                count = count + 1
        return count

    def get_human_names(self, text):
        # Purpose: collect candidate names both from '*'-marked tokens and
        # from NLTK PERSON chunks.
        tokens = nltk.tokenize.word_tokenize(text)
        count = 0
        pos = nltk.pos_tag(tokens)
        sentt = nltk.ne_chunk(pos, binary=False)
        person_list = []
        person = []
        name = ""
        namer = []
        endofdays = len(tokens) - 1
        # Heuristic: a '*' token marks that the next capitalised token(s)
        # form a name.
        for t in tokens:
            if '*' in t:
                if count + 1 <= endofdays:
                    firstname = tokens[count + 1]
                    if firstname[0].isupper() == True:
                        name = (tokens[count + 1])
                        if name in namer:
                            continue
                        else:
                            namer.append(name)
                        if count + 2 <= endofdays:
                            lastname = tokens[count + 2]
                            if lastname[0].isupper() == True:
                                nam = (name + ' ' + tokens[count + 2])
                                # NOTE(review): `names` here closes over the
                                # outer function's list, which is still empty
                                # at call time — verify this is intended.
                                if nam in names:
                                    continue
                                else:
                                    # NOTE(review): bare `namer.append` is a
                                    # no-op — the call parentheses (and the
                                    # `nam` argument) are missing.
                                    namer.append
            count = count + 1
        for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
            for leaf in subtree.leaves():
                person.append(leaf[0])
            for part in person:
                name += part + ' '
            if name[:-1] not in person_list:
                person_list.append(name[:-1])
            name = ''
            person = []
        for n in namer:
            if n not in person_list:
                # NOTE(review): the results of rstrip/lstrip are discarded —
                # strings are immutable, so these lines do nothing.
                n.rstrip()
                n.lstrip()
                person_list.append(n)
        return (person_list)

    names = []
    texty = ' '.join(text)
    if variable == 1:
        # NEGATIVE lines only.
        tame = []
        for t in text:
            if "NEGATIVE" in t.upper():
                tame.append(t)
        tamer = ' '.join(tame)
        names = get_human_names(self, tamer)
        NotNames = open(str(letter)+":/Comment-Graph Report/User Tools/Not Names.txt").read().splitlines()
        final = []
        for n in names:
            last = HumanName(n).last
            first = HumanName(n).first
            check = first + " " + last
            # NOTE(review): discarded rstrip/lstrip results (no-ops).
            check.rstrip()
            check.lstrip()
            # Keep full name / first only / last only depending on which
            # parts survive the exclusion list.
            if first.upper() not in NotNames and last.upper() not in NotNames:
                check = first + " " + last
                final.append(check)
                continue;
            if first.upper() not in NotNames and last.upper() in NotNames:
                final.append(first)
                continue;
            if first.upper() in NotNames and last.upper() not in NotNames:
                final.append(last)
                continue;
        countdown = []
        for f in final:
            if f not in countdown:
                fan = str(f)
                fun = fan.strip()
                if fun != " " and fun != '':
                    countdown.append(fun)
        for c in countdown:
            num = wordCount(self, tame, c)
            if num > 1:
                results.write(str(c) + ' appears this many times: ' + str(wordCount(self, tame, c)))
    if variable == 2:
        # POSITIVE lines only.
        tame = []
        for t in text:
            if "POSITIVE" in t.upper():
                tame.append(t)
        tamer = ' '.join(tame)
        names = get_human_names(self, tamer)
        NotNames = open(str(letter)+":/Comment-Graph Report/User Tools/Not Names.txt").read().splitlines()
        final = []
        for n in names:
            last = HumanName(n).last
            first = HumanName(n).first
            check = first + " " + last
            if first.upper() not in NotNames and last.upper() not in NotNames:
                check = first + " " + last
                if check not in final:
                    final.append(check)
                continue;
            if first.upper() not in NotNames and last.upper() in NotNames:
                if first not in final:
                    first = first.rstrip()
                    first = first.lstrip()
                    final.append(first)
                continue;
            if first.upper() in NotNames and last.upper() not in NotNames:
                if last not in final:
                    last = last.rstrip()
                    last = last.lstrip()
                    final.append(last)
                continue;
        countdown = []
        for f in final:
            if f not in countdown:
                fan = str(f)
                fun = fan.strip()
                if fun != " " and fun != '':
                    countdown.append(fun)
        for c in countdown:
            num = wordCount(self, tame, c)
            if num > 1:
                results.write('\n' + str(c) + ' appears this many times: ' + str(wordCount(self, tame, c)))
    if variable == 3:
        # NEUTRAL/MIXED lines.
        tame = []
        for t in text:
            # NOTE(review): `"NEUTRAL" or "MIXED" in t.upper()` is always
            # truthy (the non-empty string "NEUTRAL" short-circuits), so this
            # branch keeps EVERY line — almost certainly a bug.
            if "NEUTRAL" or "MIXED" in t.upper():
                tame.append(t)
        tamer = ' '.join(tame)
        names = get_human_names(self, tamer)
        NotNames = open(str(letter)+":/Comment-Graph Report/User Tools/Not Names.txt").read().splitlines()
        final = []
        for n in names:
            last = HumanName(n).last
            first = HumanName(n).first
            check = first + " " + last
            # NOTE(review): discarded rstrip/lstrip results (no-ops).
            check.rstrip()
            check.lstrip()
            if first.upper() not in NotNames and last.upper() not in NotNames:
                check = first + " " + last
                final.append(check)
                continue;
            if first.upper() not in NotNames and last.upper() in NotNames:
                final.append(first)
                continue;
            if first.upper() in NotNames and last.upper() not in NotNames:
                final.append(last)
                continue;
        countdown = []
        for f in final:
            if f not in countdown:
                fan = str(f)
                fun = fan.strip()
                if fun != " " and fun != '':
                    countdown.append(fun)
        for c in countdown:
            num = wordCount(self, tame, c)
            if num > 1:
                results.write('\n'+str(c) + ' appears this many times: ' + str(wordCount(self, tame, c)))
    if variable == 4:
        # All lines, no sentiment filter.
        tame = []
        for t in text:
            tame.append(t)
        tamer = ' '.join(tame)
        names = get_human_names(self, tamer)
        NotNames = open(str(letter)+":/Comment-Graph Report/User Tools/Not Names.txt").read().splitlines()
        final = []
        for n in names:
            last = HumanName(n).last
            first = HumanName(n).first
            check = first + " " + last
            # NOTE(review): discarded rstrip/lstrip results (no-ops).
            check.rstrip()
            check.lstrip()
            if first.upper() not in NotNames and last.upper() not in NotNames:
                check = first + " " + last
                final.append(check)
                continue;
            if first.upper() not in NotNames and last.upper() in NotNames:
                final.append(first)
                continue;
            if first.upper() in NotNames and last.upper() not in NotNames:
                final.append(last)
                continue;
        countdown = []
        for f in final:
            if f not in countdown:
                fan = str(f)
                fun = fan.strip()
                if fun != " " and fun != '':
                    countdown.append(fun)
        for c in countdown:
            num = wordCount(self, tame, c)
            if num > 1:
                results.write('\n'+str(c) + ' appears this many times: ' + str(wordCount(self, tame, c)))
    # NOTE(review): `countdown` is only bound inside the branch that matched;
    # an unexpected `variable` value raises NameError here.
    return countdown
from nameparser.parser import HumanName


def humannames(text):
    """Return the multi-token PERSON names NLTK chunks in *text*,
    de-duplicated and in order of first appearance."""
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentiment = nltk.ne_chunk(pos, binary=False)
    list_of_person = []
    person = []
    name = ""
    for subtree in sentiment.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leaf in subtree.leaves():
            person.append(leaf[0])
        # Single-token chunks (bare surnames) are ignored.
        if len(person) > 1:
            for p in person:
                name += p + ' '
            if name[:-1] not in list_of_person:
                list_of_person.append(name[:-1])
            name = ''
        person = []
    return (list_of_person)


# Script section: print every detected name plus its "Last, First" form.
with open("pressReleases.csv") as f:
    text = f.read() + '\n'
names = humannames(text)
for name in names:
    print(name)
    last_first = HumanName(name).last + ', ' + HumanName(name).first
    print(last_first)
def preProcess(names):
    """Normalise the *names* dataframe and build a pairwise-comparison
    dataframe for record linkage.

    Expects columns ``fn`` (father's name), ``ln`` (last-name field),
    ``gn`` (gender) and ``dob`` (date string ending in a two-digit year)
    — presumed from usage; confirm against the caller. Returns one row
    per unordered pair of records with similarity features.
    """
    suffixes = ["JR", "SR", "I", "II", "III", "IV"]
    # Converting the dates in proper format: expand the trailing two-digit
    # year, pivoting years that would land after 2018 back a century.
    dobs = []
    for entry in names.dob:
        dob = 1900 + int(entry[-2:])
        if (dob > 2018):
            dob -= 100
        dobs.append(str(entry[:-2]) + str(dob))
    names.dob = dobs
    names.dob = pd.to_datetime(names.dob)
    # Create a new dataframe which compares every entry with every other
    # entry (O(n^2) cross of all unordered pairs).
    entries = []
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            record = {}
            record['ln1'] = names.ln[i]
            record['dob1'] = names.dob[i]
            record['gn1'] = names.gn[i]
            record['fn1'] = names.fn[i]
            record['ln2'] = names.ln[j]
            record['dob2'] = names.dob[j]
            record['gn2'] = names.gn[j]
            record['fn2'] = names.fn[j]
            entries.append(record)
    namesCross = pd.DataFrame(entries)
    # Comparing DOBs, Father's Names, First Names, Last Names etc —
    # one feature list per column, filled row by row.
    ln1 = []
    ln2 = []
    gender = []
    father = []
    dob = []
    first = []
    editdist = []
    last = []
    lastFirst = []
    ln1_length = []
    ln2_length = []
    for i in range(len(namesCross)):
        name1 = HumanName(namesCross['ln1'][i].upper())
        name2 = HumanName(namesCross['ln2'][i].upper())
        # Check if gender of both entries is same
        if (namesCross['gn1'][i] == namesCross['gn2'][i]):
            gender.append(1)
        else:
            gender.append(0)
        # Check if father's name for both entries is same
        if (checkFname(namesCross['fn1'][i], namesCross['fn2'][i])):
            father.append(1)
        else:
            father.append(0)
        # Check if DOB for both the entries is same
        if (namesCross['dob1'][i] == namesCross['dob2'][i]):
            dob.append(1)
        else:
            dob.append(0)
        # Check if the first name of both the entries is same
        if (name1.first == name2.first):
            first.append(1)
        else:
            first.append(0)
        # Calculate edit distance between first names of both entries
        editdist.append(editdistance.eval(name1.first, name2.first))
        # If the last name is a suffix categorize it correctly
        if (name1.last.upper() in suffixes):
            name1.suffix = name1.last
            name1.last = ''
        if (name2.last.upper() in suffixes):
            name2.suffix = name2.last
            name2.last = ''
        # Check if last names of both the entries are same (an empty last
        # name on either side also counts as a match).
        if (name1.last == name2.last or name1.last == '' or name2.last == ''):
            last.append(1)
        else:
            last.append(0)
        # Check if the first character of last name of both entries is same
        # while one of the entries is having abbreviated last name
        if ((name1.last != '' and name2.last != '') and (name1.last == name2.last[0] or name1.last[0] == name2.last)):
            lastFirst.append(1)
        else:
            lastFirst.append(0)
        # Calculating the length of names for both the entries
        # (len() of the parsed HumanName — semantics come from nameparser;
        # TODO confirm it measures what is intended here).
        ln1_length.append(len(name1))
        ln2_length.append(len(name2))
        ln1.append(name1)
        ln2.append(name2)
    namesCross['ln1'] = ln1
    namesCross['ln2'] = ln2
    namesCross['sameDob'] = dob
    namesCross['sameGender'] = gender
    namesCross['sameFather'] = father
    namesCross['sameFname'] = first
    namesCross['distFname'] = editdist
    namesCross['sameLname'] = last
    namesCross['sameLnameInitial'] = lastFirst
    namesCross['ln1_len'] = ln1_length
    namesCross['ln2_len'] = ln2_length
    return namesCross
if not checker and i is not [ '@', ':', ';', ',', '.', '?', '/', '-' ]: check_porce = pos[k - 2] check_up_porce = pos[k - 1] # print(check_porce) # print(check_up_porce) # print("----break----") return (person_list) # names = get_human_names(pdf_data) # print("LAST, FIRST") for name in names: last_first = HumanName(name).last + ', ' + HumanName( name).first + ', ' + HumanName(name).middle # print("here",last_first) import re import spacy from nltk.corpus import stopwords # load pre-trained model nlp = spacy.load('en_core_web_sm') # Grad all general stop words STOPWORDS = set(stopwords.words('english')) # Education Degrees EDUCATION = [
def parse_name(name):
    """Parse a raw name string and return the resulting HumanName."""
    return HumanName(name)