def _expand_name(df, col):
    """Derive initial and phonetic-code columns from a name column.

    Cleans ``df[col]`` in place, then appends four derived columns:
    ``<col>---initial`` (first character of the cleaned name) and the
    ``<col>---soundex`` / ``<col>---nysiis`` / ``<col>---metaphone``
    phonetic encodings of the cleaned values.
    """
    cleaned = preprocessing.clean(df[col])
    df[col] = cleaned
    df[col + "---initial"] = cleaned.str[0]
    # Same encoder call for each algorithm; only the method name varies.
    for algo in ("soundex", "nysiis", "metaphone"):
        df[col + "---" + algo] = preprocessing.phonetic(cleaned, method=algo)
def test_phonetic_does_not_exist(self):
    """Requesting an unsupported algorithm name must raise ValueError."""
    names = pd.Series([
        np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan',
        u'Gretha', u'Micheal', u'Sjors'
    ])
    with pytest.raises(ValueError):
        phonetic(names, 'unknown_algorithm')
def preproc_attributes(df: pd.DataFrame, names: List[str]) -> pd.DataFrame:
    """Add cleaned and phonetically encoded variants of each name column.

    For every column listed in ``names`` this appends five columns to
    ``df``: ``<name>_clean`` plus the soundex, nysiis, metaphone and
    match_rating encodings of the cleaned values.

    :param df: frame holding the raw name columns (mutated in place)
    :param names: column names to preprocess
    :return: the same frame, for call chaining
    """
    encodings = ('soundex', 'nysiis', 'metaphone', 'match_rating')
    for name in names:
        clean_col = name + '_clean'
        df[clean_col] = preprocessing.clean(df[name])
        # Every encoding is derived from the cleaned column, not the raw one.
        for enc in encodings:
            df[name + '_' + enc] = preprocessing.phonetic(df[clean_col], enc)
    return df
def test_encode_metaphone(self):
    """Metaphone codes for sample names match the reference values."""
    names = pd.Series([
        np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan',
        u'Gretha', u'Micheal', u'Sjors'
    ])
    expected = pd.Series(
        [np.nan, u'JN', u'MRYN', u'BL', u'JN0N', u'KR0', u'MXL', u'SJRS'])
    pdt.assert_series_equal(phonetic(names, method='metaphone'), expected)
def test_encode_nysiis(self):
    """NYSIIS codes for sample names match the reference values."""
    names = pd.Series([
        np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan',
        u'Gretha', u'Micheal', u'Sjors'
    ])
    expected = pd.Series([
        np.nan, u'JAN', u'MARYAN', u'BALY', u'JANATAN',
        u'GRAT', u'MACAL', u'SJAR'
    ])
    pdt.assert_series_equal(phonetic(names, 'nysiis'), expected)
def test_encode_soundex(self):
    """Soundex codes for sample names match the reference values."""
    names = pd.Series([
        np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan',
        u'Gretha', u'Micheal', u'Sjors'
    ])
    expected = pd.Series([
        np.nan, u'J500', u'M650', u'B400', u'J535',
        u'G630', u'M240', u'S620'
    ])
    pdt.assert_series_equal(phonetic(names, 'soundex'), expected)
def test_encode_match_rating(self):
    """Match-rating codex values for sample names match the reference."""
    names = pd.Series([
        np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan',
        u'Gretha', u'Micheal', u'Sjors'
    ])
    expected = pd.Series([
        np.nan, u'JHN', u'MRYNN', u'BLLY', u'JNTHN',
        u'GRTH', u'MCHL', u'SJRS'
    ])
    pdt.assert_series_equal(phonetic(names, method='match_rating'), expected)
def test_encode_match_rating(self):
    """Match-rating codex values match the jellyfish >= 0.8.0 output."""
    names = pd.Series([
        np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan',
        u'Gretha', u'Micheal', u'Sjors'
    ])
    # in jellyfish.match_rating_codex version 0.8.0 results have changed
    expected = pd.Series([
        np.nan, u'JHN', u'MRYN', u'BLY', u'JNTHN',
        u'GRTH', u'MCHL', u'SJRS'
    ])
    pdt.assert_series_equal(phonetic(names, method='match_rating'), expected)
def run_phonetic_encoding(df_a, select_encoding):
    """Calculate the phonetic encoding of the selected fields.

    Parameters
    ----------
    df_a : pandas.DataFrame
        Input records; left untouched (a copy is modified).
    select_encoding : dict
        Mapping of field name -> encoding name ('soundex', 'nysiis',
        'metaphone', 'match_rating' or 'double_metaphone').

    Returns
    -------
    tuple
        ``(df_a_processed, cols)`` — the copy with one extra
        ``<encoding>_<field>`` column per selected field, and the
        resulting list of column names.
    """
    logging.info("run phonetic encoding ....")
    df_a_processed = df_a.copy()
    # FIXME: errors when selecting non-string columns like soc_sec_id
    # TODO: include double metaphone in Python Toolkit
    for field, encoding in select_encoding.items():
        new_col = encoding + "_" + field
        if encoding == 'double_metaphone':
            # double metaphone is not available via phonetic(), so apply it
            # row-wise; null-like values are passed through unchanged.
            df_a_processed[new_col] = df_a[field].apply(
                lambda x: doublemetaphone(str(x))[0]
                if np.all(pd.notnull(x)) else x)
        else:
            df_a_processed[new_col] = phonetic(
                clean(df_a[field]), method=encoding)
    cols = df_a_processed.columns.to_list()
    return df_a_processed, cols
def get_features(df):
    """Clean and engineer new features for match comparison.

    Adds cleaned/phonetic name columns, coerces ``Age`` to float,
    normalizes ``Phone`` to a digits-only string, and reduces
    ``social_media`` to the first URL found in the field.

    :param df: input frame (mutated in place)
    :return: the same frame, for call chaining
    """
    # Remove non-alpha characters from names and generate a phonetic code.
    df['Original Name_cleaned'] = clean(df['Original Name'])
    df['o_name_phonetic_code'] = phonetic(df['Original Name_cleaned'],
                                          method="soundex")
    df['Age'] = df['Age'].astype('float')
    # Strip non-numeric characters from phone numbers and keep them as
    # strings for Levenshtein comparison.  regex=True is required: since
    # pandas 2.0 str.replace treats the pattern literally by default, which
    # would leave the phone numbers unmodified.
    df['Phone'] = df['Phone'].str.replace(r'[^0-9]+', '', regex=True)
    df['Phone'] = df['Phone'].astype(str)
    # Keep only characters that are part of a URL; take the first match.
    df['social_media'] = df['social_media'].apply(lambda x: re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        str(x))).sort_values().str[0]
    df['social_media'] = df['social_media'].fillna(np.nan)
    return df
serie = pandas.Series(etichette.keys()) serie serie = pandas.Series(list(etichette.keys())) serie clean(serie) clean(serie, replace_by_none=None, strip_accents='unicode') clean(serie, replace_by_none=None, strip_accents='ascii') clean(serie, replace_by_none=None, strip_accents='unicode') discogs discogs_df from recordlinkage.preprocessing import phonetic get_ipython().run_line_magic('pinfo', 'phonetic') serie[66] giappa = pandas.Series([serie[66]]) giappa phonetic(giappa, 'soundex') phonetic(giappa, 'metaphone') phonetic(giappa, 'nysiis') phonetic(giappa, 'match_rating') get_ipython().run_line_magic('hist', '') cosa = pandas.Series(['àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúū']) clean(cosa) clean(cosa, replace_by_none=None) clean(cosa, replace_by_none=None, strip_accents='ascii') clean(cosa, replace_by_none=None, strip_accents='unicode') etichette #names = [] ''' хартшорн, чарльзcharles hartshorne, 'チャールズ・ハートショーン': 'Q1064777', 'تشارلز هارتشورن': 'Q1064777',
import pandas as pd
import numpy as np
# record linkage and preprocessing
import recordlinkage as rl
from recordlinkage.preprocessing import clean, phonetic

# Load the two datasets to be linked: PatentsView assignee organization
# names and Federal RePORTER grant organization names.
#inventors = pd.read_csv('../Data/patentsview_inventors.csv')
assignees = pd.read_csv('../Data/patentsview_assignee_org_names.csv')
organizations = pd.read_csv('../Data/fedreporter_org_names_grants.csv')

#Do cleaning for assignee
# Clean the organization name, strip all spaces, and compute a NYSIIS
# phonetic key for blocking/comparison.  Restrict to US records.
assignees.head()
assignees['organizations_clean'] = clean(assignees['organization'])
assignees['organization'] = assignees['organizations_clean'].str.replace(' ','')
assignees["organization_phonetic"] = phonetic(assignees["organization"], method="nysiis")
assignees = assignees.loc[assignees.org_country == "US",:]

#Do cleaning for organization
# Same treatment for the grants data.  NOTE(review): the source column
# names carry a leading space (' ORGANIZATION_NAME') — presumably an
# artifact of how the CSV was exported; verify against the raw file.
organizations[' ORGANIZATION_NAME_CLEAN'] = clean(organizations[' ORGANIZATION_NAME'])
organizations['organization'] = organizations[' ORGANIZATION_NAME_CLEAN'].str.replace(' ','')
organizations['organization_phonetic'] = phonetic(organizations["organization"], method="nysiis")
organizations = organizations.loc[organizations[' ORGANIZATION_COUNTRY'] == "UNITED STATES",:]

# Quick sanity checks on record counts and duplicate patent ids.
print(assignees.shape)
print(assignees.patent_id.drop_duplicates().shape)

#Explore stuff
assignees.head()
organizations.head()
'', regex=True) forbes_pre["Company"].str.count(r'\bhldg\b').sum() forbes_pre['Company'] = forbes_pre["Company"].str.replace(r'\bhldg\b', '', regex=True) forbes_pre["Company"].str.count(r'\bpcl\b').sum() forbes_pre['Company'] = forbes_pre["Company"].str.replace(r'\bplc\b', '', regex=True) forbes_pre["Company"].str.count(r'\bthe\b').sum() forbes_pre['Company'] = forbes_pre["Company"].str.replace(r'\bthe\b', '', regex=True) sp500_pre['Security_sort'] = phonetic(sp500_pre['Security'], method="soundex") forbes_pre['Company_sort'] = phonetic(forbes_pre['Company'], method="soundex") len(forbes_pre[forbes_pre['Industry'].isna() | forbes_pre['Sector'].isna()]) # FULL indexer = rl.Index() indexer.full() candidates = indexer.index(forbes_pre, sp500_pre) print(len(candidates)) # SORTED NEIGHBORHOOD indexer = rl.Index() indexer.sortedneighbourhood(left_on="Security_sort",