Esempio n. 1
0
 def _expand_name(df, col):
     """Add cleaned, initial and phonetic variants of ``df[col]`` to ``df``.

     ``df[col]`` is first overwritten with the output of
     ``preprocessing.clean``. Four derived columns are then written, each
     named ``col`` followed by a ``---<tag>`` suffix:

     * ``---initial``: the first character of every cleaned value;
     * ``---soundex``, ``---nysiis``, ``---metaphone``: the result of
       ``preprocessing.phonetic`` with the matching ``method``.

     The frame is mutated directly; the function has no return value.
     """
     df[col] = preprocessing.clean(df[col])
     df[col + "---initial"] = df[col].str[0]
     # A fixed tuple keeps the order in which the new columns are appended.
     for algorithm in ("soundex", "nysiis", "metaphone"):
         df[col + "---" + algorithm] = preprocessing.phonetic(
             df[col], method=algorithm)
Esempio n. 2
0
    def test_phonetic_does_not_exist(self):
        """An unknown method name must make ``phonetic`` raise ValueError."""
        names = [
            np.nan, 'John', 'Mary Ann', 'billy', 'Jonathan', 'Gretha',
            'Micheal', 'Sjors',
        ]
        sample = pd.Series(names)

        with pytest.raises(ValueError):
            phonetic(sample, 'unknown_algorithm')
Esempio n. 3
0
def preproc_attributes(df: pd.DataFrame, names: List[str]) -> pd.DataFrame:
    """Append cleaned and phonetically encoded variants of the given columns.

    For each column name in ``names`` the following columns are written:

    * ``<name>_clean``: output of ``preprocessing.clean`` on the column;
    * ``<name>_soundex``, ``<name>_nysiis``, ``<name>_metaphone`` and
      ``<name>_match_rating``: output of ``preprocessing.phonetic`` applied
      to the cleaned column with the corresponding method.

    Args:
        df: Frame holding the columns listed in ``names``; it is modified
            in place.
        names: Names of the columns to expand.

    Returns:
        The same ``df`` object, with the extra columns added.
    """
    encodings = ('soundex', 'nysiis', 'metaphone', 'match_rating')
    for column in names:
        cleaned_key = column + '_clean'
        df[cleaned_key] = preprocessing.clean(df[column])
        # Every encoding is computed from the cleaned values, not the raw ones.
        for encoding in encodings:
            df[column + '_' + encoding] = preprocessing.phonetic(
                df[cleaned_key], encoding)
    return df
Esempio n. 4
0
    def test_encode_metaphone(self):
        """``phonetic`` with ``method='metaphone'`` returns the known codes."""
        # Each pair is (input value, expected metaphone code); NaN stays NaN.
        name_to_code = [
            (np.nan, np.nan),
            ('John', 'JN'),
            ('Mary Ann', 'MRYN'),
            ('billy', 'BL'),
            ('Jonathan', 'JN0N'),
            ('Gretha', 'KR0'),
            ('Micheal', 'MXL'),
            ('Sjors', 'SJRS'),
        ]
        values = pd.Series([name for name, _ in name_to_code])
        expected = pd.Series([code for _, code in name_to_code])

        pdt.assert_series_equal(phonetic(values, method='metaphone'), expected)
Esempio n. 5
0
    def test_encode_nysiis(self):
        """``phonetic`` with the ``'nysiis'`` method returns the known codes."""
        # Each pair is (input value, expected NYSIIS code); NaN stays NaN.
        name_to_code = [
            (np.nan, np.nan),
            ('John', 'JAN'),
            ('Mary Ann', 'MARYAN'),
            ('billy', 'BALY'),
            ('Jonathan', 'JANATAN'),
            ('Gretha', 'GRAT'),
            ('Micheal', 'MACAL'),
            ('Sjors', 'SJAR'),
        ]
        values = pd.Series([name for name, _ in name_to_code])
        expected = pd.Series([code for _, code in name_to_code])

        pdt.assert_series_equal(phonetic(values, 'nysiis'), expected)
Esempio n. 6
0
    def test_encode_soundex(self):
        """``phonetic`` with the ``'soundex'`` method returns the known codes."""
        # Each pair is (input value, expected Soundex code); NaN stays NaN.
        name_to_code = [
            (np.nan, np.nan),
            ('John', 'J500'),
            ('Mary Ann', 'M650'),
            ('billy', 'B400'),
            ('Jonathan', 'J535'),
            ('Gretha', 'G630'),
            ('Micheal', 'M240'),
            ('Sjors', 'S620'),
        ]
        values = pd.Series([name for name, _ in name_to_code])
        expected = pd.Series([code for _, code in name_to_code])

        pdt.assert_series_equal(phonetic(values, 'soundex'), expected)
    def test_encode_match_rating(self):
        """``phonetic`` with ``method='match_rating'`` returns the known codes."""
        # Each pair is (input value, expected match-rating code); NaN stays NaN.
        name_to_code = [
            (np.nan, np.nan),
            ('John', 'JHN'),
            ('Mary Ann', 'MRYNN'),
            ('billy', 'BLLY'),
            ('Jonathan', 'JNTHN'),
            ('Gretha', 'GRTH'),
            ('Micheal', 'MCHL'),
            ('Sjors', 'SJRS'),
        ]
        values = pd.Series([name for name, _ in name_to_code])
        expected = pd.Series([code for _, code in name_to_code])

        pdt.assert_series_equal(phonetic(values, method='match_rating'),
                                expected)
Esempio n. 8
0
    def test_encode_match_rating(self):
        """``phonetic`` with ``method='match_rating'`` returns the known codes."""
        # in jellyfish.match_rating_codex version 0.8.0 results have changed
        # Each pair is (input value, expected match-rating code); NaN stays NaN.
        name_to_code = [
            (np.nan, np.nan),
            ('John', 'JHN'),
            ('Mary Ann', 'MRYN'),
            ('billy', 'BLY'),
            ('Jonathan', 'JNTHN'),
            ('Gretha', 'GRTH'),
            ('Micheal', 'MCHL'),
            ('Sjors', 'SJRS'),
        ]
        values = pd.Series([name for name, _ in name_to_code])
        expected = pd.Series([code for _, code in name_to_code])

        pdt.assert_series_equal(phonetic(values, method='match_rating'),
                                expected)
Esempio n. 9
0
def run_phonetic_encoding(df_a, select_encoding):
    """Calculate the phonetic encoding of the selected fields.

    Parameters
    ----------
    df_a : object supporting ``.copy()`` and column access
        Source records (presumably a pandas DataFrame); it is not modified.
    select_encoding : mapping
        Maps a field (column) name to the name of the encoding to apply.
        ``'double_metaphone'`` is handled with ``doublemetaphone``; any other
        value is forwarded as ``method`` to ``phonetic``.

    Returns
    -------
    tuple
        ``(encoded, columns)`` where ``encoded`` is a copy of ``df_a`` with
        one extra ``<encoding>_<field>`` column per mapping entry, and
        ``columns`` is the list of all column names of ``encoded``.
    """

    def _first_double_metaphone(value):
        # Null values are passed through unchanged; anything else is
        # stringified and the first element of the doublemetaphone result kept.
        if np.all(pd.notnull(value)):
            return doublemetaphone(str(value))[0]
        return value

    logging.info("run phonetic encoding ....")
    encoded = df_a.copy()

    #FIXME Errors when selecting non string columns like soc_sec_id
    #TODO Include double metaphone in Python Toolkit
    for field, encoding in select_encoding.items():
        if encoding == 'double_metaphone':
            codes = df_a[field].apply(_first_double_metaphone)
        else:
            codes = phonetic(clean(df_a[field]), method=encoding)
        encoded[encoding + "_" + field] = codes

    return encoded, encoded.columns.to_list()
Esempio n. 10
0
def get_features(df):
    """Clean and engineer new features for match comparison.

    The frame is modified in place and also returned. The following columns
    are read and written:

    * ``Original Name`` -> ``Original Name_cleaned`` (output of ``clean``)
      and ``o_name_phonetic_code`` (``phonetic`` with ``method="soundex"``);
    * ``Age`` is cast to float;
    * ``Phone`` has every run of non-digit characters removed and is then
      stored as ``str``;
    * ``social_media`` is replaced by the first URL found in each value, or
      NaN when no URL is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the cleaned/engineered columns.
    """

    # remove non-alpha characters from names and generate phonetic codes:
    df['Original Name_cleaned'] = clean(df['Original Name'])
    df['o_name_phonetic_code'] = phonetic(df['Original Name_cleaned'],
                                          method="soundex")

    df['Age'] = df['Age'].astype('float')

    #df['add2_lat'] = df['add2_lat'].astype('float')
    #df['add2_long'] = df['add2_long'].astype('float')

    # remove non-numeric characters from phone numbers and save as strings
    # for levenshtein comparison. regex=True must be explicit: since
    # pandas 2.0 the default is a literal match, which would leave the
    # numbers untouched.
    df['Phone'] = df['Phone'].str.replace(r'[^0-9]+', '', regex=True)
    df['Phone'] = df['Phone'].astype(str)

    # keep only the first URL found in each value (NaN when there is none).
    # The pattern is a raw string so that `\(` / `\)` are not parsed as
    # (invalid) string escapes; the regex itself is unchanged.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # No sorting is needed before the assignment: pandas aligns the result
    # on the index when it is written back to the column.
    df['social_media'] = df['social_media'].apply(
        lambda x: url_pattern.findall(str(x))).str[0]
    df['social_media'] = df['social_media'].fillna(np.nan)

    return df
# Interactive (IPython) session exploring how clean() and phonetic() behave
# on the keys of `etichette` (presumably a dict of labels -- TODO confirm).

# First attempt: build a Series straight from the keys view.
serie = pandas.Series(etichette.keys())
serie
# Second attempt: materialise the keys into a list before building the Series.
serie = pandas.Series(list(etichette.keys()))
serie
# Compare clean() with its defaults against explicit accent-stripping modes,
# with replace_by_none disabled.
clean(serie)
clean(serie, replace_by_none=None, strip_accents='unicode')
clean(serie, replace_by_none=None, strip_accents='ascii')
clean(serie, replace_by_none=None, strip_accents='unicode')
# Inspect two names assumed to exist earlier in the session.
discogs
discogs_df
from recordlinkage.preprocessing import phonetic
# Show IPython's help for phonetic (the `pinfo` magic).
get_ipython().run_line_magic('pinfo', 'phonetic')
# Pick the element at label 66 and wrap it in a one-element Series.
serie[66]
giappa = pandas.Series([serie[66]])
giappa
# Try every phonetic method on that single value.
phonetic(giappa, 'soundex')
phonetic(giappa, 'metaphone')
phonetic(giappa, 'nysiis')
phonetic(giappa, 'match_rating')
# Print the session history (the `hist` magic).
get_ipython().run_line_magic('hist', '')
# Same clean() comparison on a single string made of accented vowels.
cosa = pandas.Series(['àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúū'])
clean(cosa)
clean(cosa, replace_by_none=None)
clean(cosa, replace_by_none=None, strip_accents='ascii')
clean(cosa, replace_by_none=None, strip_accents='unicode')
etichette
#names = []
'''
хартшорн, чарльзcharles hartshorne,
 'チャールズ・ハートショーン': 'Q1064777',
 'تشارلز هارتشورن': 'Q1064777',
Esempio n. 12
0
import pandas as pd
import numpy as np
# record linkage and preprocessing
import recordlinkage as rl
from recordlinkage.preprocessing import clean, phonetic

# Load the two organisation-name tables that are to be linked.
#inventors = pd.read_csv('../Data/patentsview_inventors.csv')
assignees = pd.read_csv('../Data/patentsview_assignee_org_names.csv')
organizations = pd.read_csv('../Data/fedreporter_org_names_grants.csv')


# Clean the assignee names: keep the output of clean() in
# 'organizations_clean', overwrite 'organization' with a space-free version
# of it, derive a NYSIIS code from that, then keep only rows whose
# org_country is "US".
assignees.head()
assignees['organizations_clean'] = clean(assignees['organization'])
assignees['organization'] = assignees['organizations_clean'].str.replace(' ','')
assignees["organization_phonetic"] = phonetic(assignees["organization"], method="nysiis")
assignees  = assignees.loc[assignees.org_country == "US",:]

# Same steps for the organizations table, ending with the same column names
# ('organization', 'organization_phonetic').
# NOTE: the source column names here start with a space
# (' ORGANIZATION_NAME', ' ORGANIZATION_COUNTRY').
organizations[' ORGANIZATION_NAME_CLEAN'] = clean(organizations[' ORGANIZATION_NAME'])
organizations['organization'] = organizations[' ORGANIZATION_NAME_CLEAN'].str.replace(' ','')
organizations['organization_phonetic'] = phonetic(organizations["organization"], method="nysiis")
organizations  = organizations.loc[organizations[' ORGANIZATION_COUNTRY'] == "UNITED STATES",:]


# Compare the number of assignee rows with the number of distinct patent ids.
print(assignees.shape)
print(assignees.patent_id.drop_duplicates().shape)

# Peek at the first rows of both prepared tables.
assignees.head()
organizations.head()
Esempio n. 13
0
                                                              '',
                                                              regex=True)
    forbes_pre["Company"].str.count(r'\bhldg\b').sum()
    forbes_pre['Company'] = forbes_pre["Company"].str.replace(r'\bhldg\b',
                                                              '',
                                                              regex=True)
    forbes_pre["Company"].str.count(r'\bpcl\b').sum()
    forbes_pre['Company'] = forbes_pre["Company"].str.replace(r'\bplc\b',
                                                              '',
                                                              regex=True)
    forbes_pre["Company"].str.count(r'\bthe\b').sum()
    forbes_pre['Company'] = forbes_pre["Company"].str.replace(r'\bthe\b',
                                                              '',
                                                              regex=True)

    sp500_pre['Security_sort'] = phonetic(sp500_pre['Security'],
                                          method="soundex")
    forbes_pre['Company_sort'] = phonetic(forbes_pre['Company'],
                                          method="soundex")

    len(forbes_pre[forbes_pre['Industry'].isna()
                   | forbes_pre['Sector'].isna()])

    # FULL
    indexer = rl.Index()
    indexer.full()
    candidates = indexer.index(forbes_pre, sp500_pre)
    print(len(candidates))

    # SORTED NEIGHBORHOOD
    indexer = rl.Index()
    indexer.sortedneighbourhood(left_on="Security_sort",