Example #1
import recordlinkage
import pandas as pd


def linkDB(df1, df2, type, classifier):

    # 1 - INDEXING

    indexer = recordlinkage.Index()

    if type == "sortedneighbourhood":
        indexer.sortedneighbourhood(left_on="0_restaurant",
                                    right_on="1_restaurant")
    elif type == "full":
        indexer.full()
    elif type == "block":
        indexer.block(left_on="0_addressGoogle", right_on="1_addressGoogle")

    candidate_links = indexer.index(df1, df2)

    test_pairs = candidate_links[0:100]

    #https://recordlinkage.readthedocs.io/en/latest/annotation.html
    """
	df1.columns = df1.columns.str.replace(r'0_', '')
	df2.columns = df2.columns.str.replace(r'1_', '')
	
	recordlinkage.write_annotation_file(
		"check_matches.json", candidate_links[0:100], df1, df2, dataset_a_name="firstDF", dataset_b_name="secondDF")
	
	df1 = df1.add_prefix('0_')
	df2 = df2.add_prefix('1_')
	"""

    annotations = recordlinkage.read_annotation_file('result.json')

    # 2 - COMPARISON
    comp = recordlinkage.Compare()
    comp.string('0_restaurant',
                '1_restaurant',
                threshold=0.95,
                method='jarowinkler',
                label='ristorante')
    comp.string('0_neighborhood',
                '1_neighborhood',
                method='jarowinkler',
                threshold=0.85,
                label='quartiere')
    comp.exact('0_addressGoogle', '1_addressGoogle', label='indirizzoGoogle')

    features = comp.compute(candidate_links, df1, df2)
    test_features = comp.compute(test_pairs, df1, df2)

    # 3 - CLASSIFICATION
    # https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html#unsupervised

    matches = []
    drop1 = []
    drop2 = []

    if classifier == "ecm":
        ecm = recordlinkage.ECMClassifier(init='jaro',
                                          binarize=None,
                                          max_iter=100,
                                          atol=0.0001,
                                          use_col_names=True)
        e_matches = ecm.fit_predict(features)  # fit the unsupervised ECM and predict matches
        for i, j in e_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)
    elif classifier == "kmeans":
        kmeans = recordlinkage.KMeansClassifier()
        k_matches = kmeans.fit_predict(features)  # fit KMeans and predict matches in one step
        for i, j in k_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)

    columns = tuple(df1.columns) + tuple(df2.columns)
    matches_result = pd.DataFrame(matches)
    matches_result.columns = columns

    df1t = df1.drop(drop1, axis=0)
    df2t = df2.drop(drop2, axis=0)
    result = pd.concat([df1t, df2t, matches_result], ignore_index=True)

    # 4 - EVALUATION

    if classifier == "ecm":
        test_matches = ecm.predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links,
                                            test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links,
                                     test_matches,
                                     total=100)
    elif classifier == "kmeans":
        test_matches = kmeans.fit_predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links,
                                            test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links,
                                     test_matches,
                                     total=100)

    print(cm, acc)

    return result
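
A minimal driver sketch (hypothetical, not part of the original listing): it assumes the firstDFgenerator and secondDFgenerator functions defined later in this file, and that "result.json" was produced beforehand with the commented-out annotation step.

# Hypothetical usage of linkDB, assuming the generator functions below and an existing result.json.
df1 = firstDFgenerator().add_prefix('0_')   # columns become 0_restaurant, 0_neighborhood, ...
df2 = secondDFgenerator().add_prefix('1_')  # columns become 1_restaurant, 1_addressGoogle, ...
integrated = linkDB(df1, df2, "block", "ecm")
integrated.to_csv("integrated_restaurants.csv", sep=";", index=False)
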
def firstDFgenerator():
    fb = pd.read_csv("./data/restaurants/gbr_splitted_google/FoodBuzz.csv",
                     sep=";")
    fb_cols = [
        'restaurant', 'neighborhood', 'address', 'addressGoogle', 'country',
        'country_code', 'type', 'cost'
    ]
    fb = fb[fb_cols]

    nym = pd.read_csv("./data/restaurants/gbr_splitted_google/NYMag.csv",
                      sep=";")
    nym.rename(columns={"address": "neighborhood"}, inplace=True)

    nyt = pd.read_csv("./data/restaurants/gbr_splitted_google/NYTimes.csv",
                      sep=";")

    ot = pd.read_csv("./data/restaurants/gbr_splitted_google/OpenTable.csv",
                     sep=";")

    sc = pd.read_csv("./data/restaurants/gbr_splitted_google/SavoryCities.csv",
                     sep=";")
    sc_cols = ['restaurant', 'neighborhood', 'address', 'addressGoogle']
    sc = sc[sc_cols]

    df = pd.concat([fb, nym, nyt, ot, sc])

    df.drop_duplicates(subset=["restaurant", "neighborhood"],
                       keep="first",
                       inplace=True)

    df.reset_index(drop=True, inplace=True)

    # DEDUPLICATION

    indexer = recordlinkage.Index()
    indexer.sortedneighbourhood(left_on=['restaurant'])
    candidate_links = indexer.index(df)

    compare_cl = recordlinkage.Compare()
    compare_cl.string('restaurant',
                      'restaurant',
                      method='jarowinkler',
                      label='ristorante')
    compare_cl.string('neighborhood',
                      'neighborhood',
                      method='jarowinkler',
                      label='quartiere')

    features = compare_cl.compute(candidate_links, df)

    potential_matches = features[features.sum(axis=1) > 1.75].reset_index()
    potential_matches[
        'Score'] = potential_matches.loc[:, ['ristorante', 'quartiere']].sum(
            axis=1)

    # MATCHES
    account_merge = potential_matches.merge(df,
                                            left_on="level_0",
                                            right_index=True)
    final_merge = account_merge.merge(df, left_on="level_1", right_index=True)
    final_merge.dropna(axis='columns', inplace=True,
                       how='all')  # drop only columns that are entirely NaN
    final_merge.drop(columns=['ristorante', 'quartiere', 'Score'],
                     inplace=True)

    final_list = []
    all_indexes = []
    for i, row in final_merge.iterrows():
        all_indexes.append(row.level_0)
        all_indexes.append(row.level_1)

        restaurant = row.restaurant_x
        neighborhood = row.neighborhood_x  # to verify

        if row.addressGoogle_x == row.addressGoogle_y:
            addressGoogle = str(row.addressGoogle_x)
        elif (row.addressGoogle_x != row.addressGoogle_x):  # addressGoogle_x is NaN
            addressGoogle = str(row.addressGoogle_y)
        else:
            addressGoogle = str(row.addressGoogle_x)

        address = row.address_y
        country = row.country_y
        type_r = row.type_y
        cost = row.cost_y
        country_code = row.country_code_y
        new_row = (restaurant, neighborhood, address, addressGoogle, country,
                   country_code, type_r, cost)
        final_list.append(new_row)

    clean_merge = pd.DataFrame(final_list)
    clean_merge.columns = [
        "restaurant", "neighborhood", "address", "addressGoogle", "country",
        "country_code", "type_r", "cost"
    ]

    df.drop(index=all_indexes, inplace=True)

    df = pd.concat([df, clean_merge], ignore_index=True)

    return df
            print("#True Positive: " + str(true_pos) + " #FP " +
                  str(false_pos))
            print("#True Negative: " + str(true_neg) + " #FN " +
                  str(false_neg))

        torch.save(model.state_dict(), "checkpoint.pt")
        epoch += 1


if __name__ == "__main__":
    args = parse_arguments()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    df1 = pd.read_csv(args.source1, sep=args.separator)
    df2 = pd.read_csv(args.source2, sep=args.separator)
    map = pd.read_csv(args.mapping, sep=args.separator)

    indexer = recordlinkage.Index()
    indexer.sortedneighbourhood(args.blocking_attr, window=args.blocking_size)
    candidate_links = shuffle(indexer.index(df1, df2))
    train_set = candidate_links[:int(3 * len(candidate_links) / 4)]
    test_set = candidate_links[int(3 * len(candidate_links) / 4):]

    model = NLP(args.word_embed, args.word_embed_size, args.n_attrs,
                device).to(device)
    print(model)

    if args.load_model != '':
        model.load_state_dict(torch.load(args.load_model))

    train(model, df1, df2, map, train_set, test_set)
Example #4
import pandas as pd
import fuzzymatcher

#df2 = pd.read_excel('Textbook projectSummerFall2021-2.xlsx',sheet_name='FallBookstoreList')
df2.head()
#%%
#method 1
matched_results = fuzzymatcher.fuzzy_left_join(df1, df2, 'Title', 'Long Title')

#%%
matched_results[['best_match_score', 'Title', 'Long Title', 'Internal ID'
                 ]].to_excel('fuzzymatcherresults_min_spring.xlsx')
matched_results.to_excel('fuzzymatcherresults_full_spring.xlsx')
# %%
#method 2
import recordlinkage as rl
from recordlinkage.index import Full
# %%
indexer = rl.Index()
indexer.add(Full())

pairs = indexer.index(df1, df2)
print(len(pairs))

comparer = rl.Compare()
comparer.string('Title', 'Long Title', threshold=0.55, label='Title')

potential_matches = comparer.compute(pairs, df1, df2)
matches = potential_matches[potential_matches.sum(axis=1) > 0].reset_index()
#print(matches)
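
To inspect which rows were paired by method 2, the matched index pairs can be joined back to the source tables; a hedged sketch, assuming df1 and df2 keep a default unnamed integer index so that reset_index() produced level_0 / level_1 columns.

# Hypothetical follow-up: attach the original titles to each matched pair.
# Note: 'Title' already exists in matches as the comparison score, so pandas
# adds _x/_y suffixes on the first merge.
inspect = (matches
           .merge(df1[['Title']], left_on='level_0', right_index=True)
           .merge(df2[['Long Title']], left_on='level_1', right_index=True))
inspect.to_excel('recordlinkage_results_spring.xlsx', index=False)
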
def secondDFgenerator():
    to = pd.read_csv("./data/restaurants/gbr_splitted_google/TimeOut.csv",
                     sep=";")
    temp_to = to['in'].map(str) + ' ' + to['address1'].map(
        str) + '-' + to['address2'].map(str) + to['address3'].map(str)
    temp_to = temp_to.str.replace("nan", "")
    to.drop(columns=['in', 'address1', 'address2', 'address3'],
            inplace=True,
            axis=1)
    to['address'] = temp_to
    to['country'] = 'New York'
    to_cols = [
        'restaurant', 'address', 'addressGoogle', 'country', 'neighborhood',
        'type', 'phone'
    ]
    to = to[to_cols]

    ad = pd.read_csv("./data/restaurants/gbr_splitted_google/ActiveDiner.csv",
                     sep=";")
    ad_cols = ['restaurant', 'address', 'addressGoogle', 'country']
    ad = ad[ad_cols]

    dg = pd.read_csv("./data/restaurants/gbr_splitted_google/DiningGuide.csv",
                     sep=";")

    mp = pd.read_csv("./data/restaurants/gbr_splitted_google/MenuPages.csv",
                     sep=";")
    temp_mp = mp['address1'].map(str) + '-' + mp['address2'].map(str)
    temp_mp = temp_mp.str.replace("nan", "")
    mp.drop(columns=['address1', 'address2'], inplace=True, axis=1)
    mp['address'] = temp_mp
    mp.rename(columns={"addressGoogle1": "addressGoogle"}, inplace=True)
    mp_cols = ['restaurant', 'address', 'addressGoogle']
    mp = mp[mp_cols]

    ts = pd.read_csv("./data/restaurants/gbr_splitted_google/TasteSpace.csv",
                     sep=";")
    ts_cols = ['restaurant', 'address', 'addressGoogle', 'country']
    ts = ts[ts_cols]

    vv = pd.read_csv("./data/restaurants/gbr_splitted_google/VillageVoice.csv",
                     sep=";")
    vv['country'] = 'New York'
    vv_cols = [
        'restaurant', 'address', 'addressGoogle', 'country', 'neighborhood'
    ]
    vv = vv[vv_cols]

    df = pd.concat([to, ad, dg, mp, ts, vv])

    df.drop_duplicates(subset=["restaurant", "addressGoogle"],
                       keep="first",
                       inplace=True)

    df.reset_index(drop=True, inplace=True)

    # DEDUPLICATION

    indexer = recordlinkage.Index()
    indexer.sortedneighbourhood(left_on=['restaurant'])
    candidate_links = indexer.index(df)

    compare_cl = recordlinkage.Compare()
    compare_cl.string('restaurant',
                      'restaurant',
                      method='jarowinkler',
                      label='ristorante')
    compare_cl.exact('addressGoogle', 'addressGoogle', label='indirizzo')

    features = compare_cl.compute(candidate_links, df)

    potential_matches = features[features.sum(axis=1) > 1.75].reset_index()
    potential_matches[
        'Score'] = potential_matches.loc[:, ['ristorante', 'indirizzo']].sum(
            axis=1)

    # MATCHES
    account_merge = potential_matches.merge(df,
                                            left_on="level_0",
                                            right_index=True)  #, how='outer'
    final_merge = account_merge.merge(df, left_on="level_1", right_index=True)
    final_merge.dropna(axis='columns', inplace=True,
                       how='all')  # drop only columns that are entirely NaN
    final_merge.to_csv(
        './data/restaurants_integrated/output_recordlinkage/final_merge.csv',
        header=True,
        sep=";",
        decimal=',',
        float_format='%.3f',
        index=False)
    final_merge.drop(columns=['ristorante', 'indirizzo', 'Score'],
                     inplace=True)  # these scores are known to be 1, 1 and 2

    final_list = []
    all_indexes = []

    for i, row in final_merge.iterrows():
        all_indexes.append(row.level_0)
        all_indexes.append(row.level_1)

        restaurant = row.restaurant_x
        address = row.address_x
        country = "New York"

        if row.type_x == row.type_y:
            type_r = str(row.type_x)
        elif (row.type_x != row.type_x):
            type_r = str(row.type_y)
        else:
            type_r = str(row.type_x)

        if row.phone_x == row.phone_y:
            phone = str(row.phone_x)
        elif (row.phone_x != row.phone_x):
            phone = str(row.phone_y)
        else:
            phone = str(row.phone_x)

        if row.neighborhood_x == row.neighborhood_y:
            neighborhood = str(row.neighborhood_x)
        elif (row.neighborhood_x != row.neighborhood_x):
            neighborhood = str(row.neighborhood_y)
        else:
            neighborhood = str(row.neighborhood_x)

        if row.addressGoogle_x == row.addressGoogle_y:
            addressGoogle = str(row.addressGoogle_x)
        elif (row.addressGoogle_x != row.addressGoogle_x):  # addressGoogle_x is NaN
            addressGoogle = str(row.addressGoogle_y)
        else:
            addressGoogle = str(row.addressGoogle_x)

        new_row = (restaurant, address, addressGoogle, country, neighborhood,
                   type_r, phone)
        final_list.append(new_row)

    clean_merge = pd.DataFrame(final_list)
    clean_merge.columns = [
        "restaurant", "address", "addressGoogle", "country", "neighborhood",
        "type_r", "phone"
    ]

    df.drop(index=all_indexes, inplace=True)

    df = pd.concat([df, clean_merge], ignore_index=True)

    return df
Example #6
import recordlinkage
from recordlinkage.datasets import load_febrl4


def main():
    dfA, dfB = load_febrl4()
    indexer = recordlinkage.Index()
    indexer.full()
    pairs = indexer.index(dfA, dfB)
    return
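
Example #6 stops right after indexing. A possible continuation on the same FEBRL data (a sketch only; the chosen fields and thresholds are illustrative, and it reuses the imports added above) compares a few attributes and keeps pairs that agree on most of them:

def main_with_comparison():
    dfA, dfB = load_febrl4()

    indexer = recordlinkage.Index()
    indexer.block('given_name')  # blocking is far cheaper than a full index
    pairs = indexer.index(dfA, dfB)

    comp = recordlinkage.Compare()
    comp.string('surname', 'surname', method='jarowinkler',
                threshold=0.85, label='surname')
    comp.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
    comp.exact('suburb', 'suburb', label='suburb')
    features = comp.compute(pairs, dfA, dfB)

    # keep pairs that agree on at least two of the three compared fields
    matches = features[features.sum(axis=1) >= 2]
    print(len(pairs), "candidate pairs,", len(matches), "likely matches")
    return matches
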
Example #7
import recordlinkage


def record_linkage(df_protect, df_init, block_left, block_right, gen_col):
    indexer = recordlinkage.Index()
    if block_left is None:
        indexer.full()
    else:
        indexer.block(left_on=block_left, right_on=block_right)
    candidates = indexer.index(df_protect, df_init)
    # print(len(candidates))
    compare = recordlinkage.Compare()
    if all(df_protect[gen_col].dtypes == object):
        compare.string('height', 'height', threshold=0.9, label='height')
        compare.string('weight', 'weight', threshold=0.9, label='weight')
        # compare.string('born', 'born', threshold=0.9, label='born')
        compare.string('Year', 'Year', threshold=0.9, label='Year')
        compare.string('Age', 'Age', threshold=0.9, label='Age')
        compare.string('NBA_Years',
                       'NBA_Years',
                       threshold=0.9,
                       label='NBA_Years')
        if block_left == 'birth_city':
            compare.string('born', 'born', threshold=0.9, label='born')

    if not all(df_protect[gen_col].dtypes == object):
        compare.numeric('height', 'height', label='height')
        compare.numeric('weight', 'weight', label='weight')
        compare.numeric('Year', 'Year', label='Year')
        compare.numeric('Age', 'Age', label='Age')
        compare.numeric('NBA_Years', 'NBA_Years', label='NBA_Years')
        if block_left == 'born':
            compare.string('birth_city',
                           'birth_city',
                           threshold=0.9,
                           label='birth_city')

    if all(df_protect[['year_start', 'year_end']].dtypes == float):
        compare.numeric('year_start', 'year_start', label='year_start')
        compare.numeric('year_end', 'year_end', label='year_end')

    if not all(df_protect[['year_start', 'year_end']].dtypes == float):
        compare.string('year_start',
                       'year_start',
                       threshold=0.9,
                       label='year_start')
        compare.string('year_end', 'year_end', threshold=0.9, label='year_end')

    compare.string('college', 'college', threshold=0.9, label='college')
    compare.string('birth_state',
                   'birth_state',
                   threshold=0.9,
                   label='birth_state')
    compare.numeric('PER', 'PER', label='PER')
    compare.numeric('G', 'G', label='G')
    compare.numeric('GS', 'GS', label='GS')
    compare.numeric('MP', 'MP', label='MP')
    compare.numeric('ORB%', 'ORB%', label='ORB%')
    compare.numeric('DRB%', 'DRB%', label='DRB%')
    compare.numeric('TRB%', 'TRB%', label='TRB%')
    compare.numeric('ORB', 'ORB', label='ORB')
    compare.numeric('DRB', 'DRB', label='DRB')
    compare.numeric('TRB', 'TRB', label='TRB')
    compare.numeric('PTS', 'PTS', label='PTS')

    return compare.compute(candidates, df_protect, df_init)
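
A hedged sketch of how the returned comparison vectors might be consumed (df_protect, df_init and the gen_col choice are assumptions; the re-identification criterion below is only illustrative):

# Hypothetical scoring: how close does each protected record get to an original one?
features = record_linkage(df_protect, df_init, block_left=None,
                          block_right=None, gen_col=['height', 'weight'])
scores = features.sum(axis=1)
best_per_record = scores.groupby(level=0).max()  # best candidate per protected row
reidentified = (best_per_record >= 0.9 * features.shape[1]).mean()
print("rough fraction of records re-identified:", reidentified)
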
Example #8
    def init(self):
        e = xml.etree.ElementTree.parse(config.CORA_XML_PATH).getroot()
        logger.info("Sample Record from CORA dataset: ")
        logger.info(xml.etree.ElementTree.tostring(e.find('NEWREFERENCE')))

        self.data = {}
        dataA = []
        dataB = []
        testA = []
        testB = []
        valA = []
        valB = []
        db_choice = 0
        for record in e.findall('NEWREFERENCE'):
            dni = re.search('[a-z]+[0-9]+[a-z]*', record.text)
            dni = re.search('[a-z]+', record.text) if not dni else dni
            dni = dni.group() if dni else record.text

            if dni in self.data:
                self.data[dni].append(record)
            else:
                self.data[dni] = [record]

            #db_choice = random.randint(0, 3)
            if db_choice == 0 or db_choice == 4 or db_choice == 8:
                dataA.append(record)
            elif db_choice == 1 or db_choice == 5 or db_choice == 9:
                dataB.append(record)
            elif db_choice == 2 or db_choice == 6:
                testA.append(record)
            elif db_choice == 3 or db_choice == 7:
                testB.append(record)
            elif db_choice == 10:
                valA.append(record)
            elif db_choice == 11:
                valB.append(record)
            db_choice = (db_choice + 1) % 12

        logger.info("Size of Training Dataset A %d and B  %d", len(dataA), len(dataB))
        logger.info("Size of Testing Dataset A %d and B  %d", len(testA), len(testB))
        logger.info("Size of Validation Dataset A %d and B  %d", len(valA), len(valB))

        df_a = {'dni': [], 'author': [], 'publisher': [], 'date': [],
                'title': [], 'journal': [], 'volume': [], 'pages': [],
                'address': [], 'id': []}
        df_b = copy.deepcopy(df_a)
        tdf_a = copy.deepcopy(df_a)
        tdf_b = copy.deepcopy(df_a)
        vdf_a = copy.deepcopy(df_a)
        vdf_b = copy.deepcopy(df_a)
        for (df, dataset) in [(df_a, dataA), (df_b, dataB), (tdf_a, testA), (tdf_b, testB), (vdf_a, valA), (vdf_b, valB)]:
            for record in dataset:
                dni = re.search('[a-z]+[0-9]+[a-z]*', record.text)
                dni = re.search('[a-z]+', record.text) if not dni else dni
                df['dni'].append(dni.group() if dni else record.text)
                df['author'].append(record.find('author').text if record.find('author') is not None else '')
                df['title'].append(record.find('title').text if record.find('title') is not None else '')
                df['publisher'].append(record.find('publisher').text if record.find('publisher') is not None else '')
                df['date'].append(record.find('date').text if record.find('date') is not None else '')
                df['journal'].append(record.find('journal').text if record.find('journal') is not None else '')
                df['volume'].append(record.find('volume').text if record.find('volume') is not None else '')
                df['pages'].append(record.find('pages').text if record.find('pages') is not None else '')
                df['address'].append(record.find('address').text if record.find('address') is not None else '')
                df['id'].append(record.get('id'))

        self.trainDataA = pd.DataFrame(data=df_a)
        self.trainDataB = pd.DataFrame(data=df_b)
        self.testDataA = pd.DataFrame(data=tdf_a)
        self.testDataB = pd.DataFrame(data=tdf_b)
        self.valDataA = pd.DataFrame(data=vdf_a)
        self.valDataB = pd.DataFrame(data=vdf_b)

        #Extract all possible pairs for training
        indexer = recordlinkage.Index()
        indexer.full()
        self.candidate_links = indexer.index(self.trainDataA, self.trainDataB)
        logger.info("No. of Candidate Pairs %d", (len(self.candidate_links)))

        #Extract true links (takes time...)
        true_links = []
        for indexA, indexB in self.candidate_links:
            if df_a['dni'][indexA] == df_b['dni'][indexB] and df_a['dni'][indexA]:
                true_links.append((indexA, indexB))
        logger.info("Number of true links: %d", len(true_links))
        self.true_links = pd.MultiIndex.from_tuples(true_links)

        #Extract all possible pairs for test
        indexer = recordlinkage.Index()
        indexer.full()
        self.test_links = indexer.index(self.testDataA, self.testDataB)
        logger.info("Number Candidate Pairs for testing: %d", (len(self.test_links)))

        #Extract true links for testing (takes time...)
        true_test_links = []
        for indexA, indexB in self.test_links:
            if tdf_a['dni'][indexA] == tdf_b['dni'][indexB]:
                true_test_links.append((indexA, indexB))
        logger.info("Number of true links for testing: %d", len(true_test_links))
        self.true_test_links = pd.MultiIndex.from_tuples(true_test_links)

        #Extract all possible pairs for Validation
        indexer = recordlinkage.Index()
        indexer.full()
        self.val_links = indexer.index(self.valDataA, self.valDataB)
        logger.info("Number Candidate Pairs for validation: %d", (len(self.val_links)))

        #Extract true links for validation (takes time...)
        true_val_links = []
        for indexA, indexB in self.val_links:
            if vdf_a['dni'][indexA] == vdf_b['dni'][indexB]:
                true_val_links.append((indexA, indexB))
        logger.info("Number of true links for validation: %d", len(true_val_links))
        self.true_val_links = pd.MultiIndex.from_tuples(true_val_links)
Example #9
import numpy as np
import recordlinkage as rl


def incremental_match(df_n, df_o):
    indexer = rl.Index()
    indexer.block(left_on=[
        'first_name_clean', 'last_name_clean', 'zip_clean', 'state_clean'
    ])
    indexer.block(left_on=['first_name_clean', 'last_name_clean'])
    indexer.block(left_on=['zip_clean', 'first_name_clean'])
    indexer.block(left_on=['zip_clean', 'last_name_clean'])
    indexer.block(left_on=['zip_clean', 'state_clean'])
    indexer.block(left_on=['last_name_clean', 'state_clean'])
    indexer.block(left_on=['zip_clean', 'phone_clean'])
    candidate_links = indexer.index(df_n, df_o)

    compare_cl = rl.Compare(n_jobs=8)
    compare_cl.string('first_name_clean',
                      'first_name_clean',
                      method='jarowinkler',
                      threshold=0.92,
                      label='first_name_cl')
    compare_cl.string('last_name_clean',
                      'last_name_clean',
                      method='jarowinkler',
                      threshold=0.85,
                      label='last_name_cl')
    compare_cl.string('email',
                      'email',
                      method='jarowinkler',
                      threshold=0.90,
                      label='email_cl')
    compare_cl.string('address_clean',
                      'address_clean',
                      method='jarowinkler',
                      threshold=0.70,
                      label='address_cl')
    compare_cl.string('city_clean',
                      'city_clean',
                      method='jarowinkler',
                      threshold=0.85,
                      label='city_cl')
    compare_cl.string('state_clean',
                      'state_clean',
                      method='jarowinkler',
                      threshold=0.85,
                      label='state_cl')
    compare_cl.exact('zip_clean', 'zip_clean', label='zip_cl')
    compare_cl.string('phone_clean',
                      'phone_clean',
                      method='jarowinkler',
                      threshold=0.95,
                      label='phone_cl')

    # create the deduped feature set - this takes a while...
    cl_features = compare_cl.compute(candidate_links, df_n, df_o)
    cl_summary = cl_features.sum(axis=1).value_counts().sort_index(
        ascending=False)

    df_match = cl_features[cl_features.sum(axis=1) > 2].reset_index(
    )  #this limits the dataset to rows that had more than 2 matched columns
    df_match['score'] = df_match.loc[:, 'email_cl':'phone_cl'].sum(
        axis=1)  #this sums the columns and creates a "score" column

    #creates an indicator based on the matching rule below (1=yes, 0=no)
    df_match['duplicate'] = np.where(
        (df_match['email_cl'] == 1) |
        (((df_match['first_name_cl'] == 1) |
          (df_match['last_name_cl'] == 1)) &
         ((df_match['score'] > 3) |
          (df_match['phone_cl'] + df_match['zip_cl'] == 2))), 1, 0)
    ne = df_match[df_match['duplicate'] == 1]  #keep only the matching rows
    ne = ne.sort_values(by=['score'],
                        ascending=False)  #sort the results by the score column
    return ne
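
Usage is straightforward; a hypothetical call, assuming df_new holds the incoming records and df_old the existing master list, both already cleaned with the *_clean columns referenced above:

# Hypothetical invocation of the incremental matcher.
likely_existing = incremental_match(df_new, df_old)
print(len(likely_existing), "incoming records appear to match an existing customer")
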
Example #10
import time

import numpy as np
import pandas as pd
import recordlinkage as rl


def match_data_chunk(df, s):
    count_pass = 0
    start = time.process_time()
    all_matches = []
    df0 = df[df['email'].isnull()
             | ~df[df['email'].notnull()].duplicated(subset='email', keep=False)]
    mask = df.email.duplicated(keep=False)
    d = df[mask]
    dupe_emails = d[['customer_id', 'email']].dropna(subset=['email'])
    df1 = df0.sample(frac=s, replace=False, random_state=42)
    df1.set_index('customer_id', inplace=True)
    dupe_indexer = rl.Index()

    #set the blocking system
    dupe_indexer.block(left_on=[
        'first_name_clean', 'last_name_clean', 'zip_clean', 'state_clean'
    ])
    dupe_indexer.block(left_on=['first_name_clean', 'last_name_clean'])
    dupe_indexer.block(left_on=['zip_clean', 'first_name_clean'])
    dupe_indexer.block(left_on=['zip_clean', 'last_name_clean'])
    #dupe_indexer.block(left_on=['zip_clean', 'state_clean'])
    dupe_indexer.block(left_on=['last_name_clean', 'state_clean'])
    #dupe_indexer.block(left_on=['zip_clean', 'phone_clean'])
    dupe_candidate_links = dupe_indexer.index(df1)
    print("total candidate links:", len(dupe_candidate_links))

    # split the pair index into chunks for iteration
    chunks = rl.index_split(dupe_candidate_links, 20)
    for chunk in chunks:
        compare_dupes = rl.Compare(n_jobs=8)
        compare_dupes.string('first_name_clean',
                             'first_name_clean',
                             method='jarowinkler',
                             threshold=0.92,
                             label='first_name_cl')
        compare_dupes.string('last_name_clean',
                             'last_name_clean',
                             method='jarowinkler',
                             threshold=0.85,
                             label='last_name_cl')
        compare_dupes.string('email',
                             'email',
                             method='jarowinkler',
                             threshold=0.90,
                             label='email_cl')
        compare_dupes.string('address_clean',
                             'address_clean',
                             method='jarowinkler',
                             threshold=0.7,
                             label='address_cl')
        compare_dupes.string('city_clean',
                             'city_clean',
                             method='jarowinkler',
                             threshold=0.85,
                             label='city_cl')
        compare_dupes.string('state_clean',
                             'state_clean',
                             method='jarowinkler',
                             threshold=0.85,
                             label='state_cl')
        compare_dupes.exact('zip_clean', 'zip_clean', label='zip_cl')
        compare_dupes.string('phone_clean',
                             'phone_clean',
                             method='jarowinkler',
                             threshold=0.95,
                             label='phone_cl')

        # create the deduped feature set - this takes a while...
        dupe_features = compare_dupes.compute(chunk, df1)

        # select those features that match

        # Business rule: if any 3 of email, address, city, state, zip, or phone match, then code it as a "duplicate"

        pdm = dupe_features[dupe_features.sum(axis=1) > 2].reset_index(
        )  #this limits the dataset to rows that had more than 2 matched columns
        pdm['score'] = pdm.loc[:, 'email_cl':'phone_cl'].sum(
            axis=1)  #this sums the columns and creates a "score" column
        pdm['duplicate'] = np.where(
            (pdm['email_cl'] == 1) |
            (((pdm['first_name_cl'] == 1) | (pdm['last_name_cl'] == 1)) &
             ((pdm['score'] > 3) | (pdm['phone_cl'] + pdm['zip_cl'] == 2))),
            1, 0)  #indicator based on the matching rule (1=yes, 0=no)

        ne = pdm[pdm['duplicate'] == 1]  #keep only the matching rows
        ne = ne.sort_values(by=['score'],
                            ascending=False)  #sort the results by the score column

        matches = ne[[
            'customer_id_1', 'customer_id_2', 'email_cl', 'phone_cl',
            'address_cl', 'city_cl', 'state_cl', 'zip_cl', 'score', 'duplicate'
        ]].sort_values(by=['score'], ascending=False)
        all_matches.append(matches)
        count_pass = count_pass + 1
        elapsed = time.process_time() - start
        print(count_pass, format(elapsed, '.3f'), ": seconds")
    all_matches = pd.concat(all_matches, ignore_index=True)

    return df1, all_matches, dupe_emails
Example #11
import dedupe
import lightgbm as lgb
import numpy as np
import pandas as pd
import recordlinkage as rl


def eval_lightgbm_dedupe(dataset_type: str, dataset_rf, dataset_dedupe):
    # NUM_CORES and partition_threshold are assumed to be defined at module level.
    # Create deduper model
    with open(
            '../../trained_models/combined_2/trained_{}_settings.json'.format(
                dataset_type), 'rb') as fin:
        deduper = dedupe.StaticDedupe(fin, num_cores=NUM_CORES)

    cols = ['name', 'brand', 'size', 'product_type']

    to_dedupe = dataset_dedupe[cols]
    to_dedupe_dict = to_dedupe.to_dict(orient='index')

    # Cluster (prediction stage)
    clustered_dupes = deduper.partition(to_dedupe_dict,
                                        partition_threshold[dataset_type])
    print('# duplicate sets', len(clustered_dupes))

    # Create the record linkage model
    # Indexer
    indexer = rl.Index()
    indexer.add(rl.index.Block('product_type'))
    indexer.add(rl.index.Block('brand'))
    indexer.add(rl.index.Block('size'))
    candidate_links = indexer.index(dataset_rf)

    # Comparing
    compare_cl = rl.Compare(n_jobs=-1)
    compare_cl.exact('brand', 'brand')
    compare_cl.exact('product_type', 'product_type')
    compare_cl.exact('size', 'size')
    compare_cl.string('name', 'name', method='qgram')
    compare_cl.string('name', 'name', method='damerau_levenshtein')
    compare_cl.string('name', 'name', method='levenshtein')
    compare_cl.string('name', 'name', method='jarowinkler')
    compare_cl.string('name', 'name', method='smith_waterman')
    compare_cl.string('name', 'name', method='lcs')
    # compare_cl.string('price', 'price')

    # Features
    features = compare_cl.compute(candidate_links, dataset_rf)

    # Add dedupe feature column (filled from the cluster confidences below)
    features['xy_same_entity'] = 0.0

    # Save the result
    for el in clustered_dupes:
        for i in range(len(el[0])):
            for j in range(i + 1, len(el[0])):
                k = (el[0][i], el[0][j])
                r_k = (el[0][j], el[0][i])
                p = el[1][i] * el[1][j]

                if k in features.index:
                    features.loc[k, 'xy_same_entity'] = p

                if r_k in features.index:
                    features.loc[r_k, 'xy_same_entity'] = p

    # Now load the lightgbm
    bst = lgb.Booster(
        model_file='../../trained_models/combined_2/x4_lgb_classifier.txt')

    # Predict
    confs = bst.predict(features)
    features['label'] = confs

    # Export the matched left and right instance ids
    res = []
    for i in range(len(confs)):
        record = features.iloc[i]
        label = record.label
        if label > 0.5:
            res.append(record.name)

    res_df = pd.DataFrame(res)
    res_df.columns = ['left_instance_id', 'right_instance_id']

    return res_df
Example #12
import recordlinkage


def detect_duplicates(df_in):
    '''
    Detect and drop duplicate rows caused by fuzzy data (e.g. typos) in a df_patient dataframe.

    Parameters
    ----------
    df_in : DataFrame
        df_patient dataframe to clean from duplicates.
        We assume that the input dataframe has the following columns:
        given_name, surname, street_number, address_1, suburb, postcode, state, date_of_birth, age, phone_number, address_2

    Returns
    -------
    df : DataFrame
        The cleaned df_patient dataframe with the duplicate rows removed.
    '''

    # Copy the input dataframe
    df = df_in.copy()

    # Indexation step
    # The index is built from a few columns to generate candidate pairs of possible duplicates
    # We choose the columns:
    #   - address_1: because it potentially has the most characters
    #   - phone_number: it contains only digits
    #   - surname
    dupe_indexer = recordlinkage.Index()
    dupe_indexer.sortedneighbourhood(left_on='address_1', window=3)
    dupe_indexer.sortedneighbourhood(left_on='phone_number', window=3)
    dupe_indexer.sortedneighbourhood(left_on='surname', window=3)

    dupe_candidate_links = dupe_indexer.index(df)

    # Comparison step
    compare_dupes = recordlinkage.Compare()

    # We added below all the columns that we want to compare for duplicates (all of them except patient_id)
    compare_dupes.string('given_name',
                         'given_name',
                         method='jarowinkler',
                         threshold=0.85,
                         label='given_name')
    compare_dupes.string('surname',
                         'surname',
                         method='jarowinkler',
                         threshold=0.85,
                         label='surname')
    compare_dupes.exact('street_number',
                        'street_number',
                        label='street_number')
    compare_dupes.string('address_1',
                         'address_1',
                         threshold=0.85,
                         label='address_1')
    compare_dupes.string('suburb', 'suburb', threshold=0.85, label='suburb')
    compare_dupes.exact('postcode', 'postcode', label='postcode')
    compare_dupes.exact('state', 'state', label='state')
    compare_dupes.exact('date_of_birth',
                        'date_of_birth',
                        label='date_of_birth')
    compare_dupes.exact('age', 'age', label='age')
    compare_dupes.string('phone_number',
                         'phone_number',
                         threshold=0.85,
                         label='phone_number')
    compare_dupes.exact('address_2', 'address_2', label='address_2')

    dupes_features = compare_dupes.compute(dupe_candidate_links, df)
    # dupes_features is a dataframe whose two-level row index represents a pair of rows (row1, row2),
    # with one column per compared attribute. Each value is a score of 0. or 1.
    # 0 for a column means the row1 and row2 values for that column are not duplicates
    # 1 means duplicates

    # We set the threshold for potential duplicate rows at 5.0: if 5 or more column values are duplicates, the whole row is a duplicate
    # Items with score > 4 are duplicates
    # potential_dupes contains all the row pairs above the threshold plus an additional Score column
    potential_dupes = dupes_features[dupes_features.sum(
        axis=1) > 4].reset_index()
    potential_dupes[
        'Score'] = potential_dupes.loc[:, 'given_name':'address_2'].sum(axis=1)

    # Since we ran reset_index(), potential_dupes now has additional columns level_0 and level_1.
    # The level_1 column contains the indexes of the duplicate rows to remove,
    # which we drop with the instruction below.
    df = df.drop(axis=0,
                 index=potential_dupes['level_1']).reset_index(drop=True)

    # compute percentage, then print percentage and number of rows removed
    print("Percentage of duplicate rows: ",
          100 * (len(df_in) - len(df)) / len(df_in))
    print("Number of rows removed: ", len(df_in) - len(df))

    return df
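
A quick hypothetical smoke test: the tiny dataframe below is made up purely to exercise detect_duplicates (it follows the column list from the docstring); real inputs would come from the df_patient source.

# Hypothetical test data: rows 1 and 2 are near-duplicates, row 3 is distinct.
import pandas as pd

patients = pd.DataFrame({
    'patient_id':    [1, 2, 3],
    'given_name':    ['maria', 'mario', 'john'],
    'surname':       ['rossi', 'rossi', 'smith'],
    'street_number': [12, 12, 7],
    'address_1':     ['via roma', 'via roma', 'main street'],
    'address_2':     ['', '', ''],
    'suburb':        ['milano', 'milano', 'boston'],
    'postcode':      ['20100', '20100', '02108'],
    'state':         ['mi', 'mi', 'ma'],
    'date_of_birth': ['19800101', '19800101', '19751231'],
    'age':           [44, 44, 49],
    'phone_number':  ['02 1234 5678', '02 1234 5678', '617 555 0100'],
})

deduped = detect_duplicates(patients)
print(deduped)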