def test_multi_type_cleanups():
    """Every multi-type variation must reduce to the canonical base name."""
    expected = "Hello World"
    for testname, variant in multi_cleanup_tests.items():
        cleaned = cleanco(variant).clean_name(
            prefix=True, suffix=True, middle=True, multi=True)
        assert cleaned == expected, "cleanup of %s failed" % testname
def clean_co_names(df, col):
    """Normalise company names from column *col* into a new 'clean_co' column.

    Upper-cases, removes commas/hyphens/dots and parenthesised text,
    canonicalises ' AND ' -> ' & ', and applies cleanco twice (before and
    after dot removal) to strip business-entity suffixes.  Mutates and
    returns *df*.
    """
    df['clean_co'] = df[col]
    df['clean_co'] = df['clean_co'].str.upper()  # uppercase
    print(f'>Set Upper')
    # regex=False: these are literal replacements.  Under the old pandas
    # default (regex=True) the '.' replacement below deleted EVERY character.
    df['clean_co'] = df['clean_co'].str.replace(',', '', regex=False)  # Remove commas
    print(f'>Remove commas')
    df['clean_co'] = df['clean_co'].str.replace(' - ', ' ', regex=False)  # Remove hyphens
    print(f'>Remove hyphens')
    # explicit regex=True so the pattern keeps working on pandas 2+
    df['clean_co'] = df['clean_co'].str.replace(
        r"\(.*\)", "", regex=True)  # Remove text between parenthesis
    print(f'>Remove text between parens')
    df['clean_co'] = df['clean_co'].str.replace(' AND ', ' & ', regex=False)  # replace AND with &
    print(f'>replace AND with &')
    df['clean_co'] = df['clean_co'].str.strip()  # Remove spaces in the begining/end
    print(f'>Remove leading/trailing spaces')
    df['clean_co'] = df['clean_co'].apply(
        lambda x: cleanco(x).clean_name() if isinstance(x, str) else x
    )  # Remove business entities extensions (1)
    print(f'>Cleanco Pass1')
    df['clean_co'] = df['clean_co'].str.replace('.', '', regex=False)  # Remove dots
    print(f'>Remove dots')
    # BUG FIX: pass 2 must run while the values are still str -- encoding
    # first made every value bytes, turning this second pass into a no-op.
    df['clean_co'] = df['clean_co'].apply(
        lambda x: cleanco(x).clean_name() if isinstance(x, str) else x
    )  # Remove business entities extensions (2) - after removing the dots
    print(f'>Cleanco Pass2')
    df['clean_co'] = df['clean_co'].str.encode('utf-8')  # Encode
    print(f'>Encode utf-8')
    return df
def convert_name(name_list):
    """Return upper-cased, punctuation-free, suffix-stripped company names."""
    depunct = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    converted = []
    for raw in name_list:
        # punctuation -> spaces, collapse runs of whitespace, upper-case
        collapsed = " ".join(raw.translate(depunct).split()).upper()
        # two cleanco passes catch suffixes exposed by the first pass
        cleaned = cleanco(cleanco(collapsed).clean_name()).clean_name()
        print(cleaned)
        converted.append(cleaned)
    return converted
def __init__(self, file_name: str, name: str):
    """Store the source file name plus a normalised, suffix-stripped name."""
    self.__file_name = file_name
    self.__name = Utils.replace_redundant_ws(name).lower()
    self.__cleaned_name = cleanco(self.__name).clean_name()
    self.__tokens = set(self.__cleaned_name.split(" "))
    # NOTE(review): `self.tokens` presumably resolves to a property defined
    # elsewhere in this class (the attribute above is name-mangled) -- confirm.
    if not self.tokens:
        raise AssertionError("length of list is zero")
def entity_search(pageContent, response, title=""):
    """ get entities in pagecontent that are recognized as legal entities """
    # First try converting the raw page with the module-level handler `h`
    # (presumably html2text -- confirm against the imports).
    try:
        body = h.handle(pageContent)
    except:
        # Fallback path: decode the raw bytes first, guessing the charset
        # from the response headers and defaulting to utf-8.
        try:
            charset = response.headers.get_content_charset()
            if charset is None:
                raise ValueError
        except:
            charset = 'utf-8'
        try:
            content = pageContent.decode(charset)
            body = h.handle(content)
        except Exception as e:
            print(e, pageContent)
            return []
    # Flatten newlines and markdown artifacts so NER sees one long line.
    body = body.replace("\n", " ").replace("\r", " ").replace("*", " ").replace("#", " ")
    entities = []
    doc = nlp(body)  # `nlp` is a module global (spaCy pipeline, presumably)
    for ent in doc.ents:
        name = cleanco(str(ent))
        # Keep entities carrying a recognised legal suffix or country ...
        if name.type() is not None or name.country() is not None:
            name_to_add = name_cleaner(str(ent))
            if name_to_add is not None:
                entities.append(name_to_add)
        # ... or entities that also appear in the page title.
        elif str(ent).lower() in title:
            name_to_add = name_cleaner(str(ent))
            if name_to_add is not None:
                entities.append(name_cleaner(str(name_to_add)))
    # dict.fromkeys dedupes while preserving first-seen order.
    return list(dict.fromkeys(entities))
def standardize_name(raw_name):
    """ASCII-fold, lower-case, trim, then drop entity suffixes and punctuation."""
    name = unidecode(raw_name).lower()
    name = name.lstrip().strip()
    name = cleanco(name).clean_name()
    # delete (not replace) every punctuation character
    return name.translate({ord(ch): None for ch in string.punctuation})
def name_cleaner(text, legal_clean=False):
    """Normalise a scraped entity name.

    Lower-cases, strips copyright boilerplate and 4-digit years (including
    "1999-2004"-style ranges), trims stray punctuation and parenthesised
    text, optionally removes legal suffixes via cleanco, and returns the
    result in capwords form -- or None when nothing usable remains.
    """
    if not text:  # covers both None and ""
        return None
    text = text.lower()
    for token in copyright_replace:
        text = text.replace(token, "")
    years = re.findall('(\d{4})', text)
    if len(years) > 1:
        # drop range spellings before removing the individual years
        text = text.replace(years[0] + "-" + years[1], " ")
        text = text.replace(years[0] + " - " + years[1], " ")
    for year in years:
        text = text.replace(year, " ")
    if len(str(text).strip()) < 2:
        return None
    # peel trailing dots/spaces one character at a time
    while text[-1] in (".", " "):
        text = text[:-1]
    text = re.sub(' +', ' ', text).lower().strip()
    text = re.sub(r'^[\.?&|*]', '', text)
    text = re.sub(r'[\.?&|*]$', '', text)
    if "(" in text and text.find("(") < text.find(")"):
        text = text.replace(text[text.find("("):text.find(")")] + ")", " ")
    if text[0] == ".":
        text = text[1:]
    if legal_clean:
        text = cleanco(text).clean_name()
    return string.capwords(text.strip())
def imprint_analyzer(domain, link, imprint_queue): """ Loads the imprint URL and returns possible legal names that occur in it """ # Maybe check for adresses (through city/country recognition and highlight the # elements preceding the adress) # Expand list of legal entities in cleanco (through excel list downloaded) elements, imprint_names = [], [] if urlparse(link)[1] == "": end_link = domain + "/" + link elif domain in link: end_link = link else: imprint_queue.put([]) imprint_queue.task_done() return None if "http://" not in end_link and "https://" not in end_link: end_link = "http://" + end_link try: response = urllib.request.urlopen(Request( end_link, headers={'User-Agent': User_ag}), context=context, timeout=10) pageContent = response.read() # pageContent = clean_html(pageContent) except Exception as e: imprint_queue.put([]) return [] tree = html.fromstring(pageContent) titles = [el.text for el in tree.xpath("//*") if el.tag == "title"] tree = tree.xpath("//text()") # get title element - if text is equal to title --> half the similarity ratio for el in tree: if el is None: continue el = el.replace("\n", "").replace("\r", "").replace("\t", "").replace("\\t", "") el = re.sub(' +', ' ', el).lower() if el == "None" or el == "" or el == " ": continue elements.append(el) for el in elements: name = cleanco(el) if name.type() is not None: if len(el) > 50: continue imprint_names.append(el) try: imprint_names.extend(entity_search(pageContent, response)) except: print(response.headers.get_content_charset()) imprint_queue.put((imprint_names, titles)) try: imprint_queue.task_done() except: imprint_queue.put([]) imprint_queue.task_done() return []
def load_stocks():
    """Fetch the IEX symbol list and print cleaned names containing WAYFAIR."""
    response = requests.get('https://api.iextrading.com/1.0/ref-data/symbols')
    stock_map = {}
    for entry in response.json():
        stock_map[cleanco(entry['name']).clean_name()] = entry['symbol']
    for name in stock_map:
        if 'WAYFAIR' in name:
            print(name, stock_map[name])
def cross_validation(title, company_name):
    """
    :param title: The string of the title of the first result that google returns
    :param company_name: The company name that we use as a searching keyword
    :return: The number of distinct words shared by the cleaned title and
             keyword (truthy when they overlap, 0/falsy otherwise).
    """
    # Replace all punctuation with space in both strings.
    depunct = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    keyword_words = set(
        cleanco(company_name.translate(depunct)).clean_name().lower().split(' '))
    title_words = set(
        cleanco(title.translate(depunct)).clean_name().lower().split(' '))
    return len(keyword_words & title_words)
def convert_name(name):
    """Upper-case a company name with legal suffixes and punctuation removed.

    Non-string inputs (e.g. NaN from pandas) are returned unchanged.
    """
    if not isinstance(name, str):  # isinstance, not type(x) == str
        return name
    # two cleanco passes catch suffixes exposed by the first pass
    cleaned = cleanco(cleanco(name).clean_name()).clean_name()
    cleaned = cleaned.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    # collapse runs of whitespace, then upper-case
    final_name = " ".join(cleaned.split()).upper()
    print(final_name)
    return final_name
def _clean_text(self, name, lower=True):
    """Normalise *name* for use as a search query fragment.

    Lower-cases, deletes the characters in PUNCT, collapses whitespace,
    strips legal suffixes with cleanco and joins words with '+'.
    Falsy input is returned unchanged; any error yields ''.

    NOTE(review): the `lower` flag is currently unused -- confirm intent.
    """
    try:
        if name:
            name = name.strip().lower()
            name = name.translate(str.maketrans(' ', ' ', PUNCT))
            name = re.sub(r'\s\s+', ' ', name)  # raw string: \s is a regex escape
            name = cleanco(name).clean_name()
            name = name.replace(' ', '+')
            return name
        return name
    except Exception as ex:
        # Broad swallow kept for parity with callers that expect '' on error;
        # consider logging `ex` instead of discarding it.
        return ''
def word2features(sent, i):
    """Build the CRF feature dict for token *i* of POS-tagged sentence *sent*.

    *sent* is a sequence of (word, postag) pairs.  Features describe the
    token itself plus a one-token window either side; BOS/EOS flags mark the
    sentence edges.  Relies on module globals `stop` (stopword set) and
    `lancaster_stemmer`.
    """
    word = sent[i][0]
    postag = sent[i][1]
    # (removed: an unused `cleanco(word)` call that computed nothing used)
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'stop_word': word in stop,
        'hyphen': '-' in word,
        'size_small': True if len(word) <= 2 else False,
        'stemmer_lanc': lancaster_stemmer.stem(word),
    }
    if i > 0:
        # features of the preceding token
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent) - 1:
        # features of the following token
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features
def footer_crawler(tree):
    """Collect cleaned legal-entity names from a page's <footer> text nodes."""
    text_nodes = tree.xpath('//footer//text()')
    candidates, found_names = [], []
    for raw in text_nodes:
        if raw is None:
            continue
        flat = raw.replace("\n", "").replace("\r", "").replace("\t", "")
        flat = re.sub(' +', ' ', flat).lower()
        if flat in ("None", "", " "):
            continue
        candidates.append(flat)
    for cand in candidates:
        parsed = cleanco(cand)  # somehow doesnt recognize "nielen schuman b.v."
        if parsed.type() is not None or parsed.country() is not None:
            found_names.append(name_cleaner(cand))
    return found_names
def cleanMatches(self, matchFrame):
    """Add 'Clean Company' and 'Email Domain' columns to matchFrame in place.

    NOTE: Python 2 code -- print statements and the two-argument
    str.translate(None, deletechars) form below.
    """
    stopwords = {'the'}
    for index, row in matchFrame.iterrows():
        # strip legal suffixes from the lower-cased company name
        tempString = cleanco(
            matchFrame.iloc[index]['company'].lower()).clean_name()
        # keep only non-stopword tokens
        resultwords = [
            word for word in re.split("\W+", tempString)
            if word.lower() not in stopwords
        ]
        result = ' '.join(resultwords)
        # Python 2 str.translate(None, deletechars) removes punctuation
        matchFrame.at[index, 'Clean Company'] = result.translate(
            None, string.punctuation)
        try:
            matchFrame.at[index, 'Email Domain'] = email_split(
                row['email']).domain
        except Exception as e:
            print 'error ' + str(e)
            print 'no email'
            pass
def name_cleaner(df):
    """Return a normalised, lower-cased Series of company names from df['name'].

    Missing names become ' '; parenthesised text is dropped, ' AND ' becomes
    ' & ', dots are removed and cleanco strips business-entity suffixes.
    """
    from cleanco import cleanco
    names = df['name'].fillna(' ')
    # removal of text between parentheses (explicit regex=True for pandas 2+)
    names = names.str.replace(r"\(.*\)", "", regex=True)
    # 'AND' and '&' are equivalent
    names = names.str.replace(' AND ', ' & ', regex=False)
    # BUG FIX: with the old regex=True default, replacing '.' deleted every
    # character; regex=False makes this the intended literal dot removal.
    names = names.str.replace('.', '', regex=False)
    # cleaning utilities from cleanco package (takes off suffixes from a database)
    names = names.apply(
        lambda x: cleanco(x).clean_name() if isinstance(x, str) else x)
    # make all names lower-case
    return names.str.lower()
def process(self, in_, out):
    """Transform BTS on-time CSV rows from *in_* and write them via *out*.

    Derives date/month keys, cleans the carrier name with cleanco, totals
    the delay components with mk_float, and writes only the fields listed
    in self.fields ("date" is padded to the first of the month).
    """
    # Use a context manager so the input file is actually closed
    # (the original leaked the handle from an inline open()).
    with open(in_, "r") as fh:
        reader = csv.DictReader(fh, delimiter=",")
        for row in reader:
            # compute "date" from the original month BEFORE overwriting it
            row["date"] = row["year"] + "-" + row["month"]
            row["month"] = row["year"] + "-" + row["month"]
            row["airline"] = cleanco(row["carrier_name"]).clean_name()
            row["cancelled"] = row["arr_cancelled"]
            row["delay"] = (mk_float(row['late_aircraft_delay'])
                            + mk_float(row['carrier_delay'])
                            + mk_float(row["arr_delay"]))
            out_r = {}
            for k in self.fields:
                try:
                    out_r[k] = row[k]
                except KeyError:
                    pass
            out_r["date"] = out_r["date"] + "-01"  # pin to first of month
            out.writerow(out_r)
def process(self, in_, out):
    """
    Read the Excel claims workbook at *in_* and write normalised rows via *out*.

    :param in_: path to an Excel workbook (first sheet is used)
    :param out: csv.DictWriter
    :return: None
    """
    book = xlrd.open_workbook(in_)
    sheet = book.sheet_by_index(0)
    for row_index in range(1, sheet.nrows):  # skip the header row
        try:
            # Date lives in column B, falling back to column C when B is
            # empty.  BUG FIX: the original used `is not ""` / `is not "-"`,
            # which compare identity rather than equality.
            val = sheet.cell(row_index, 1).value if sheet.cell(
                row_index, 1).value != "" else sheet.cell(row_index, 2).value
            date = datetime.datetime(
                *xlrd.xldate_as_tuple(val, book.datemode))
            airline_name = sheet.cell(row_index, 5).value
            val = sheet.cell(row_index, 9).value  # claim amount; '-' means none
            r = {
                "date": date.date().strftime("%Y-%m-%d"),
                "month": date.date().strftime("%Y-%m"),
                "airline": cleanco(airline_name).clean_name(),
                "item": sheet.cell(row_index, 8).value,
                "claim_amount": val if val != "-" else 0
            }
            # Only write fully-populated rows.
            if not "" in r.values() and all(r.values()):
                out.writerow(r)
        except Exception:
            # Best-effort import: malformed rows are skipped silently.
            pass
def generate_wordcloud(term_field):
    """Render a word cloud PNG for search-term matches in a certificate field.

    *term_field* packs the search term and the column name as
    "<term>_<field>" (field is the last underscore-separated part).
    Returns (match_count, distinct_word_ratio); the ratio is 0 when the
    term produced too few words for a cloud.
    """
    field = term_field.split('_')[-1]
    term = '_'.join(term_field.split('_')[:-1])
    # full-text match against the cert_search table, then pull `field`
    query = """
        SELECT {}
        FROM web_certificates
        WHERE cert_id in (
            SELECT cert_id
            FROM cert_search
            WHERE text MATCH %s
        )
    """
    with create_connection() as conn:
        df = pd.read_sql(query.format(field), conn, params=[term])
    # strip legal suffixes before counting words
    df['contractor_clean'] = df[field].apply(lambda x: cleanco(x).clean_name())
    relevant_words = [
        word.lower().lstrip().rstrip().replace('.', '')
        for word in df['contractor_clean']
    ]
    relevant_text = " ".join(relevant_words)
    # suppress generic/divisional vocabulary and the search term itself
    stopwords = set(STOPWORDS)
    stopwords.update(general_terms + dvision_terms + term.split(' '))
    if field != 'owner':
        stopwords.update(geographic_locations)
    try:
        wordcloud = WordCloud(
            stopwords=stopwords,
            background_color=None,
            mode='RGBA',
            width=1000,
            height=400,
            color_func=lambda *args, **kwargs: "black").generate(
                relevant_text.upper())
        if len(wordcloud.words_):
            wordcloud.recolor(color_func=grey_color_func, random_state=3)
        wordcloud.to_file(
            f"static/wordcloud_{term.replace(' ', '_')}_{field}.png")
        return len(df), len(wordcloud.words_) / len(df)
    except ValueError:
        pass  # search term did not generate enough words
    return len(df), 0
def cleanTargetAccounts(self, targetAccountsFrame):
    """Add 'Clean Target' and 'Website Host' columns to the frame in place.

    NOTE: Python 2 code -- uses the two-argument
    str.translate(None, deletechars) form below.
    """
    stopwords = {'the'}
    for index, row in targetAccountsFrame.iterrows():
        # strip legal suffixes from the lower-cased account name
        tempString = cleanco(targetAccountsFrame.iloc[index]
                             ['Account Name'].lower()).clean_name()
        # keep only non-stopword tokens
        resultwords = [
            word for word in re.split("\W+", tempString)
            if word.lower() not in stopwords
        ]
        result = ' '.join(resultwords)
        # Python 2 str.translate(None, deletechars) removes punctuation
        targetAccountsFrame.at[index, 'Clean Target'] = result.translate(
            None, string.punctuation)
        try:
            # pull the host label out of the website URL; the path segment
            # index depends on whether the URL starts with "www."
            if 'www.' in row['Website']:
                targetAccountsFrame.at[index, 'Website Host'] = urlparse(
                    row['Website']).path.split('.')[1].lower()
            else:
                targetAccountsFrame.at[index, 'Website Host'] = urlparse(
                    row['Website']).path.split('.')[0].lower()
        except Exception as e:
            targetAccountsFrame.at[index, 'Website Host'] = 'None'
            pass
def clean_company_name(raw):
    """Aggressively normalise a contractor name for fuzzy matching.

    ASCII-folds, prefers operating ("o/a") / care-of ("c/o") names, strips
    legal suffixes, punctuation, plural 's', all spaces, and a list of
    generic industry words.  Returns "" for empty/placeholder input.
    """
    if raw in (" ", "None", None):
        return ""
    name = unidecode.unidecode(raw)
    # prefer the operating / care-of / beneficiary name when present
    try:
        name = re.findall("o/a (.*)", name, flags=re.I)[0]
    except IndexError:
        pass
    try:
        name = re.findall("c/o (.*)", name, flags=re.I)[0]
    except IndexError:
        pass
    try:
        name = re.findall("(.*) for ", name, flags=re.I)[0]
    except IndexError:
        pass
    name = cleanco(name).clean_name()
    name = name.lower()
    for stopword in ["of", "d'", "l'"]:
        name = name.replace(stopword, "")
    name = name.replace("and", "&")
    for punct in ["-", ".", ",", "(", ")"]:
        name = name.replace(punct, " ")
    for punct in ["'"]:
        name = name.replace(punct, "")
    # naive de-pluralisation, skipped when the name looks possessive-ish
    if (not name.startswith("s ")) and (not " s " in name):
        name = " ".join([word.rstrip("s") for word in name.split(" ")])
    name = "".join([word for word in name.split(" ")])  # drop spaces entirely
    # strip generic industry words.  BUG FIX: a missing comma had fused
    # "interior" and "builders" into the never-matching 'interiorbuilders'.
    for word in [
            "constructor", "construction", "contracting", "contractor",
            "mechanical", "plumbing", "heating", "mech", "electrical",
            "electric", "development", "interior", "builders", "building",
            "enterprise", "infrastructure", "management", "excavating",
            "trucking", "company", "restoration", "service", "servicing",
            "hvac", "system", "paving", "industrie", "industry",
            "engineering", "consulting", "consultant", "solution",
            "commercial", "group", "insulation", "insulators", "ontario",
            "canada",
    ]:
        name = name.replace(word, "")
    return name
def test_with_unicode_umlauted_name():
    """Umlauted names keep their non-suffix content after cleaning."""
    for testname, (variation, expected) in unicode_umlaut_tests.items():
        assert cleanco(variation).clean_name() == expected, \
            "preserving cleanup of %s failed" % testname
def test_preserving_cleanups():
    """Cleanups must preserve the expected remainder for each variation."""
    for testname, (variation, expected) in preserving_cleanup_tests.items():
        assert cleanco(variation).clean_name() == expected, \
            "preserving cleanup of %s failed" % testname
def clean_company_name(s):
    """Pre-process *s*, then strip business-entity suffixes from it."""
    return cleanco(process_string(s)).clean_name()
def test_basic_cleanups():
    """Each basic variation must clean down to the canonical name."""
    for testname, variation in basic_cleanup_tests.items():
        assert cleanco(variation).clean_name() == "Hello World", \
            "cleanup of %s failed" % testname
def get_keyword_candidates(NounPhrase, doc_title):
    '''
    Get keyword candidates from list of noun phrase by term frequency and if the phrase is in keyword
    - filter out city, US States and territories, country, region state_names
    - filter out domain specific stop words
    - filter out annual event (Super Bowl, Oscars, Grammys, etc.), politics word (Islam, terrorism, white house), natural disaster (flood, hurricane)

    Parameters
    -----------
    NounPhrase: list
        a list of noun phrases extracted from the article
    title: str
        the title of the article where the noun phrases come from

    Returns
    -------
    reranked: list
        a list of keyword candidates
    '''
    import nltk
    import re
    import collections
    from fuzzywuzzy import fuzz
    from cleanco import cleanco

    # remove company suffix
    kw = []
    for np in NounPhrase:
        x = cleanco(np)
        kw.append(x.clean_name())
    kw0 = [
        re.sub('(Companys|Company|Companies|Firm|Organization|Corporation)',
               '', k) for k in kw
    ]
    kw0 = [k.strip() for k in kw0]  # remove whitespace

    # remove leading, tailing, between-character punctuation
    punctuation = '’!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'
    kw1 = []
    for k in kw0:
        k = ''.join(ch for ch in k if ch not in punctuation)
        if k and k != ' ':  # remove empty string
            kw1.append(k)

    # remove irrelevant keyword candidates
    ###### substring ######
    type1 = [
        'Percent', 'Major', 'Vote', 'Unit', 'Method', 'Option', 'Euro',
        'Angel', 'Offer', 'Market', 'Review', 'Nation', 'Present', 'Direct',
        'Terror', 'Islam', 'FY', 'Holder'
    ]
    kw2 = [
        word for word in kw1
        if not any(w.lower() in word.lower() for w in type1)
    ]

    ###### subword ######
    import calendar
    import us
    from geotext import GeoText

    # time related words
    time_ls = [weekday for weekday in calendar.day_name]
    time_ls.extend([m for m in calendar.day_abbr])  # weekday: Monday, Tuesday, etc.
    time_ls.extend([month for month in calendar.month_name[1:]])  # month: January, February, etc.
    time_ls.extend([m for m in calendar.month_abbr[1:]])  # month name abbreviation: Jan, Feb, Mar, etc.
    # BUG FIXES in this stop list: a missing comma had fused
    # 'Investigation' 'Year' into 'InvestigationYear', and the typos
    # 'Litte'/'Instituition'/'Excahnge'/'Mortagage' never matched their
    # intended words (Little / Institution / Exchange / Mortgage).
    type2 = [
        'ISIS', 'Info', 'Cent', 'Part', 'RMB', 'EPS', 'SAR', 'IFRS', 'Plan',
        'Deal', 'Time', 'Age', 'Rate', 'NYSE', 'GNP', 'REIT', 'Mr', 'Mrs',
        'Ms', 'Co', 'Bn', 'Get', 'Bad', 'Dow', 'River', 'Lead', 'Employer',
        'Difference', 'ROI', 'AMEX', 'IRA', 'DJIA', 'NAV', 'PSP', 'FOREX',
        'EFT', 'ETF', 'FDIC', 'FRB', 'LOI', 'NAV', 'SEC', 'YTM', 'NDA', 'SP',
        'DC', 'etc', 'Zone', 'Such', 'SEK', 'Army', 'CFA', 'Net', 'Lake',
        'Hotel', 'HKD', 'IST', 'Side', 'EBITDA', 'FASB', 'FBMS', 'FDIC',
        'GDP', 'BFY', 'OWCP', 'Gov', 'BLS', 'DOL', 'FDA', 'Site', 'EIS',
        'Page', 'New', 'News', 'Old', 'Ltd', 'Corp', 'Task', 'Park', 'Esq',
        'Tower', 'State', 'Return', 'War', 'Snow', 'Sign', 'Step', 'Sale',
        'NASDAQ', 'Job', 'No', 'CAGR', 'Discount', 'FBI', 'IRS', 'Cash',
        'IRR', 'Tax', 'Taxation', 'Sir', 'Goal', 'Poor', 'Poors', 'ID',
        'CPA', 'Hall', 'Stake', 'Association', 'Provision', 'Way', 'Fact',
        'Idea', 'Second', 'First', 'Half', 'Role', 'Big', 'Act', 'Share',
        'DOJ', 'Sum', 'ASX', 'PhD', 'Line', 'Risk', 'Right', 'Rule', 'Read',
        'See', 'TSX', 'Fed', 'IDF', 'NZX', 'Lot', 'Name', 'Soldier', 'Storm',
        'Loss', 'Gain', 'Person', 'Late', 'Team', 'Debt', 'Cost', 'Same',
        'Last', 'Only', 'Area', 'Earnings', 'Earnings', 'Related',
        'Performance', 'Palace', 'Temple', 'Stages', 'Inc', 'FTSE',
        'Further', 'Rain', 'Investigation', 'Year', 'Month', 'Quarter',
        'Day', 'Morning', 'Evening', 'Afternoon', 'Date', 'Week', 'Hour',
        'Minute', 'Period', 'Certain', 'Member', 'Republic', 'Prospect',
        'Senate', 'Growth', 'Oscars', 'Source', 'Grammys', 'Clinton',
        'Trump', 'Obama', 'Election', 'Federal', 'Congress', 'Brand',
        'Exchange', 'Authority', 'Requirement', 'Additional', 'Purchase',
        'Esquire', 'Institute', 'Place', 'Crime', 'NBA', 'Available', 'EU',
        'Party', 'Government', 'Department', 'Ministry', 'Minister',
        'President', 'Cabinet', 'Court', 'Bureau', 'Country', 'Society',
        'Capitol', 'Assumption', 'Little', 'Gross', 'Corporate', 'Said',
        'Shares', 'UN', 'Office', 'Officer', 'Board', 'Police', 'Law',
        'Attorney', 'Analyst', 'Council', 'Street', 'Union', 'Branch',
        'Request', 'Saving', 'Study', 'Expense', 'Strong', 'Per', 'Appendix',
        'Billion', 'Competitive', 'Now', 'Headquarters', 'University',
        'College', 'Institution', 'School', 'Academy', 'Airport', 'Station',
        'Property', 'Avenue', 'Place', 'Quantity', 'Attachment', 'Next',
        'Title', 'Yield', 'Ill', 'Sept', 'CAD', 'Top', 'Statement',
        'Statements', 'Report', 'Reports', 'Sheet', 'Sheets', 'Session',
        'Term', 'Charter', 'Assessment', 'Application', 'Instruction',
        'Publication', 'Period', 'Chapter', 'Weather', 'Times', 'EUR',
        'Chair', 'Document', 'Information', 'Transaction', 'Content',
        'Press', 'Release', 'Journal', 'Form', 'Description', 'Section',
        'Subsidiary', 'Attached', 'Editions', 'Relevant', 'Comment',
        'Liquidity', 'Fortune', 'Free', 'Agreement', 'Settlement', 'Filing',
        'File', 'Award', 'Awards', 'Patent', 'Copyright', 'Strategy',
        'Price', 'Asset', 'Factor', 'Documentation', 'Impact', 'Initiative',
        'Several', 'Further', 'Choice', 'Sq', 'Ft', 'Dividend', 'Profit',
        'Income', 'Revenue', 'Margin', 'Interest', 'Influence', 'Problem',
        'Securities', 'Currency', 'Great', 'Wrong', 'Claim', 'Proceeding',
        'Strategic', 'Decision', 'Merge', 'Decline', 'Europe', 'Security',
        'Bond', 'Profile', 'Portfolio', 'Ratio', 'Rating', 'Value',
        'Credit', 'Audit', 'Future', 'Instrument', 'Instruments', 'Policy',
        'Other', 'Different', 'Expectation', 'Olympic', 'Decrease',
        'Australia', 'Finance', 'Financial', 'Financing', 'Component',
        'Trade', 'Forecast', 'Prediction', 'Buy', 'Sell', 'Index', 'Staff',
        'Concern', 'Expenditure', 'Justice', 'Edition', 'Inflation',
        'Increase', 'Continue', 'Africa', 'Business', 'Valuation', 'Series',
        'Condition', 'Disclosure', 'Regulation', 'Committee', 'Rating',
        'Stock', 'Exchange', 'Quality', 'Spokesman', 'Competition',
        'Serious', 'Average', 'Balance', 'Table', 'America', 'Acquisition',
        'Outlook', 'Prospectus', 'Stage', 'Executive', 'Budget', 'Investor',
        'Owner', 'Leader', 'Acknowledgement', 'Overall', 'Competitor',
        'Daily', 'Current', 'Medal', 'Buyouts', 'Allowance', 'Tsunami',
        'Announcement', 'Development', 'Account', 'Demand', 'Dollar',
        'Crore', 'Pound', 'Number', 'Round', 'Many', 'Range',
        'Relationship', 'Important', 'Chairman', 'Improvement',
        'Allocation', 'Buyout', 'Flight', 'UK', 'Assembly', 'Meeting',
        'Conference', 'Access', 'Archive', 'Exhibit', 'Opportunity',
        'Chance', 'Responsibility', 'Parameter', 'Later', 'Key', 'Hundred',
        'Estimate', 'Phase', 'Judge', 'Governor', 'Asia', 'Drought', 'Item',
        'Product', 'Issue', 'Type', 'Class', 'Category', 'Amount', 'Result',
        'Notes', 'Event', 'Order', 'Basis', 'Previous', 'Employee',
        'Thousand', 'Summary', 'Chief', 'Position', 'Festival', 'Note',
        'Earthquake', 'Enquiry', 'Question', 'Answer', 'Reference',
        'Action', 'Story', 'Headline', 'World', 'Article', 'Figure',
        'Promotion', 'Certification', 'Level', 'Million', 'Notification',
        'Principal', 'Did', 'Road', 'Flood', 'US', 'High', 'Low', 'Total',
        'Enough', 'Good', 'Recent', 'Annual', 'Above', 'Detail',
        'Aggregate', 'Former', 'Manager', 'Effect', 'Thing', 'Standard',
        'Deposit', 'Notice', 'Mortgage', 'Certificate', 'Agent', 'Hurricane'
    ]
    type2.extend(time_ls)
    kw3 = [
        word for word in kw2
        if not any(w.lower() in word.lower().split() for w in type2)
    ]

    ###### the whole term ######
    # (typo fix: 'Isarel' -> 'Israel')
    type3 = [
        'Hongkong', 'Calif', 'Israel', 'Korea', 'England', 'Britain',
        'Agency', 'USA', 'U.S.A', 'U.S.', 'U.S', 'U.K.', 'UKs', 'UAE',
        'Antarctica', 'Store', 'Silicon Valley', 'West', 'East', 'North',
        'South', 'Northwest', 'Service Provider', 'Trust', 'Management',
        'Partner', 'Program', 'Group', 'Super Bowl', 'Limited Partner',
        'General Partner', 'San', 'Southwest', 'Justice', 'Commerce',
        'Head', 'Due Diligence', 'District', 'World', 'Square Foot',
        'PartnerSite', 'Parent Company', 'Don', 'Northeast', 'Partnership',
        'Platform', 'Organization', 'Corporation', 'Startup', 'Bank',
        'Industry', 'Sector', 'Segment', 'White House', 'Project',
        'Research', 'Technology', 'Science', 'Operation', 'Capital',
        'Capitalization', 'Investment', 'IPO', 'Investment', 'Tech',
        'Engineering', 'Fund', 'Seed', 'Venture Capital', 'Private Equity',
        'Corporate Venture', 'Incubator', 'Accelerator', 'Customer',
        'Commission', 'Secretary', 'Client', 'Customer Service', 'Shop',
        'Restaurant', 'City', 'Facility', 'Joint Venture', 'Website',
        'Internet', 'Region', 'Function', 'General', 'House', 'Appointment',
        'Change', 'Founder', 'Author', 'Analysis', 'Full Text', 'Amendment',
        'Venture', 'Free Online', 'Life', 'Treasury', 'Center'
    ]
    # any US state or territories name
    state_names = [state.name for state in us.states.STATES_AND_TERRITORIES]
    type3.extend(state_names)
    kw4 = [
        word for word in kw3 if word.lower() not in [w.lower() for w in type3]
    ]
    # remove any country/city name
    kw4 = [
        word for word in kw4 if not GeoText(word.title()).countries
        if not GeoText(word.title()).cities
    ]

    ###### Remove word length equals 1 and not noun/np term ######
    kw5 = []
    good_tag = ['NNP', 'NN']
    for word in kw4:
        if len(word.split()) == 1:
            # single-word candidates must POS-tag as (proper) nouns
            for word, tag in nltk.pos_tag(word.split()):
                if tag in good_tag:
                    kw5.append(word)
        else:
            kw5.append(word)

    # 'Global shares', 'global Shares','global shares','GLOBAL SHARES' -> 'Global Shares' if it exists in the list
    kw6 = [w.title() if w.title() in kw5 else w for w in kw5]

    # count frequency
    c = collections.Counter(kw6)

    # 'Virtual Reality Technology','Virtual Reality','Technology' -> 'Virtual Reality Technology', 'Technology'
    kw6 = list(
        sorted(set(kw6), key=len,
               reverse=True))  # arrange in descending order of term length
    for i, word in enumerate(kw6):
        for j in range(i + 1, len(kw6)):
            if len(kw6[i].split()) == len(
                    kw6[j].split()) and fuzz.token_set_ratio(
                        kw6[i], kw6[j]) == 100:
                c[kw6[i]] += c[kw6[j]]
                del c[kw6[j]]
            elif len(kw6[i].split()) > len(kw6[j].split()) and kw6[i].split(
            )[0].lower() == kw6[j].split()[0].lower():
                c[kw6[j]] += c[kw6[i]]
                del c[kw6[i]]

    # 'Technology','Virtual Reality Technology' -> 'Virtual Reality Technology'
    # 'Agency', 'Estate Agency','Real Estate Agency' -> 'Real Estate Agency'
    kw7 = sorted([key for key, value in c.items()],
                 key=len)  # arrange in ascending order of term length
    for i, word in enumerate(kw7):
        for j in range(i + 1, len(kw7)):
            if len(kw7[i].split()) < len(kw7[j].split()) and kw7[i].split(
            )[-1].lower() == kw7[j].split()[-1].lower():
                c[kw7[j]] += c[kw7[i]]
                del c[kw7[i]]

    # Remove noun phrases with length greater than 3 words and shorter than 2 character
    # sort by occurring frequency
    sorted_kw = [
        key for key, value in sorted(
            c.items(), key=lambda x: x[1], reverse=True)
        if len(key) > 2 and len(key.split()) < 4
    ]

    # Rerank by giving more weight to terms occurred in the title
    kw_scores = collections.OrderedDict()
    for kw in sorted_kw:
        pattern = re.compile(r'\b' + re.escape(kw) + r'(\b|[,;.!?]|\s)',
                             re.IGNORECASE)
        if pattern.search(doc_title):
            in_title = 1
        else:
            in_title = 0
        kw_scores[kw] = in_title
    in_title_list = []
    notin_title_list = []
    for term in kw_scores.items():
        if term[1] == 1:
            in_title_list.append(term[0])
        else:
            notin_title_list.append(term[0])
    reranked = in_title_list + notin_title_list
    return reranked
train_df['oscore'] = train_df.clean_beneficiary.apply( lambda x: get_score(x, o_word_dict)) return train_df if __name__ == '__main__': train_path = '/Users/aditya1/Downloads/DIAFTE-master 2/data/CSSol-2/NewData/whole_train_new.csv' test_path = '/Users/aditya1/Downloads/DIAFTE-master 2/data/CSSol-2/NewData/whole_test_new.csv' test_df = pd.read_csv(test_path) train_df = pd.read_csv(train_path) # If don't want to reset index train_df = train_df[train_df.columns[1:]] test_df = test_df[test_df.columns[1:]] train_df['clean_beneficiary'] = train_df.beneficiary.apply( lambda x: clean_beneficiary(str(x))) test_df['clean_beneficiary'] = test_df.beneficiary.apply( lambda x: clean_beneficiary(str(x))) train_df['clean_beneficiary'] = train_df.clean_beneficiary.apply( lambda x: cleanco(str(x)).clean_name()) test_df['clean_beneficiary'] = test_df.clean_beneficiary.apply( lambda x: cleanco(str(x)).clean_name()) train_df = main(train_df) print(train_df) # test_df = main(test_df) # train_df.to_csv('updated_train_new.csv', index=False) # test_df.to_csv('updated_test_new.csv', index=False)
def get_country_from_company_name(s):
    """Return the country cleanco infers from the legal suffix of *s*."""
    return cleanco(s).country()
# Demo: run cleanco over a sample business name and show what it extracts.
from cleanco import cleanco

business_name = "Hello World, llc."
print("Inputted Business Name: %s" % business_name)

parsed = cleanco(business_name)
print("Clean Name: %s" % parsed.clean_name())
print("Business Type: %s" % parsed.type())
print("Country: %s" % parsed.country())
def ngrams(string, n=2):
    """Return the character n-grams of the cleaned, lower-cased name."""
    cleaned = cleanco(str(string).lower()).clean_name()
    grams = zip(*[cleaned[i:] for i in range(n)])
    return [''.join(gram) for gram in grams]
def test_commas():
    """clean_name strips the trailing ', llc' but keeps interior commas."""
    assert cleanco("bp, kek, llc").clean_name() == "bp, kek"
def find_contacts_at_companies(input_companies_fname, input_contacts_fname):
    """Match CRM contacts to target accounts by cleansed company-name tokens.

    Reads two CSVs (target companies; contacts), inserts a cleansed
    (cleanco'd, de-punctuated, casefolded, tokenised, stopword-filtered)
    company field into each row, then pairs contacts with companies whose
    token sets are equal or subset-related.  Returns
    (contacts_at_companies_matrix, companies_with_matches).

    NOTE(review): column indices below assume a fixed export layout --
    confirm against the source CSVs.
    """
    # Initialize lists
    target_company_list = []
    contacts_matrix = []
    contacts_matrix_noblanks = []
    contacts_at_companies_matrix = []
    any_contacts_at_company = False
    contact_match_count = 0
    companies_with_matches = []
    # Create constants that correspond to field positions.
    COMPANYNAME_OF_COMPANIES = 0
    COMPANYNAME_OF_COMPANIES_CLEANSED = 1
    FULLNAME_OF_CONTACT = 0
    EMAIL1_OF_CONTACT = 30
    EMAIL2_OF_CONTACT = 31
    COMPANY_OF_CONTACT = 64
    COMPANY_OF_CONTACT_CLEANSED = 65
    TITLE_OF_CONTACT = 66 + 1
    DEPARTMENT_OF_CONTACT = 67 + 1
    # Read data into lists from files
    with open(input_companies_fname, encoding='utf-8') as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            target_company_list.append(row)
    with open(input_contacts_fname, encoding='utf-8') as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            contacts_matrix.append(row)
    # Pop off the top row of each input containing the text tile row.
    target_company_list.pop(0)
    contacts_matrix.pop(0)
    print("Number of target accounts: ", len(target_company_list))
    print("Number of contacts in database: ", len(contacts_matrix))
    # Insert a cleansed version of the company name in each row in the company
    # data and in the contact data.
    # Insert a cleansed company field in the target company list data.
    for row_companies in target_company_list:
        row_companies.insert(COMPANYNAME_OF_COMPANIES_CLEANSED,
                             row_companies[COMPANYNAME_OF_COMPANIES])
    # Clean company control names like "Inc.," "Incorporated," etc.
    for row_companies in target_company_list:
        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = cleanco(
            row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED]).clean_name()
        # Remove punctuation
        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = regex.sub(
            r"[[:punct:]]+", "",
            row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED])
        # Make lower case and handle certain foreign language capitalization
        # conventions with casefold().
        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = row_companies[
            COMPANYNAME_OF_COMPANIES_CLEANSED].casefold()
        # Tokenise and drop English stopwords; the cleansed field is a list
        # of tokens from this point on.
        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = nltk.word_tokenize(
            row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED])
        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = [
            t for t in row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED]
            if t not in stopwords.words('english')
        ]
    # Get rid of all contacts without a company using list comprehension.
    contacts_matrix_noblanks = [
        row for row in contacts_matrix if (row[COMPANY_OF_CONTACT] != "")
    ]
    # Now cleanse the contact company names.
    for row_contacts in contacts_matrix_noblanks:
        row_contacts.insert(COMPANY_OF_CONTACT_CLEANSED,
                            row_contacts[COMPANY_OF_CONTACT])
    for row_contacts in contacts_matrix_noblanks:
        # Same cleansing pipeline as for the target companies above.
        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = cleanco(
            row_contacts[COMPANY_OF_CONTACT_CLEANSED]).clean_name()
        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = regex.sub(
            r"[[:punct:]]+", "", row_contacts[COMPANY_OF_CONTACT_CLEANSED])
        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = row_contacts[
            COMPANY_OF_CONTACT_CLEANSED].casefold()
        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = nltk.word_tokenize(
            row_contacts[COMPANY_OF_CONTACT_CLEANSED])
        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = [
            t for t in row_contacts[COMPANY_OF_CONTACT_CLEANSED]
            if t not in stopwords.words('english')
        ]
    # Walk through the companies list.  For each company, go through companies
    # of contacts (in the noblanks matrix) and find matches or approximate
    # matches where the token sets are equal or one strictly contains the other.
    for row_companies in target_company_list:
        any_contacts_at_company = False
        c = row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED]
        for row_contacts in contacts_matrix_noblanks:
            coc = row_contacts[COMPANY_OF_CONTACT_CLEANSED]
            if c == coc or (set(coc) < set(c)) or (set(c) < set(coc)):
                contacts_at_companies_matrix.append([
                    row_companies[COMPANYNAME_OF_COMPANIES],
                    row_contacts[COMPANY_OF_CONTACT],
                    row_contacts[FULLNAME_OF_CONTACT],
                    row_contacts[TITLE_OF_CONTACT],
                    row_contacts[DEPARTMENT_OF_CONTACT],
                    row_contacts[EMAIL1_OF_CONTACT],
                    row_contacts[EMAIL2_OF_CONTACT]
                ])
                any_contacts_at_company = True
                contact_match_count += 1
                companies_with_matches.append(
                    row_companies[COMPANYNAME_OF_COMPANIES])
        # Companies without any match still get a one-column row.
        if any_contacts_at_company == False:
            contacts_at_companies_matrix.append(
                [row_companies[COMPANYNAME_OF_COMPANIES]])
    contacts_at_companies_matrix.insert(0, [
        "Company Name", "Contact Company Name", "Contact Full Name",
        "Contact Title", "Contact Department", "Contact Email1",
        "Contact Email2"
    ])
    companies_with_matches = list(sorted(set(companies_with_matches)))
    print("Number of target accounts with contacts: ",
          len(companies_with_matches))
    print("Number of contacts found at target accounts: ",
          contact_match_count)
    return contacts_at_companies_matrix, companies_with_matches