Example 1
def compute_all(A, B, idA, idB, keyA, keyB, merged, iters):
    # Time five join variants over `iters` repetitions.
    ke = []    # edit distance join on the key columns
    jkws = []  # Jaccard join on the key columns, whitespace tokens
    jk2g = []  # Jaccard join on the key columns, 2-gram tokens
    jws = []   # Jaccard join on the merged column, whitespace tokens
    j2g = []   # Jaccard join on the merged column, 2-gram tokens
    for _ in range(iters):
        start = time.time()
        ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-1)
        ke.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-1)
        jkws.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-1)
        jk2g.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, merged, merged, WS, 0.1, n_jobs=-1)
        jws.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, merged, merged, TWO_GRAM, 0.1,
                         n_jobs=-1)
        j2g.append(time.time() - start)
    return ke, jkws, jk2g, jws, j2g
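
Examples 1, 8, and 10 reference module-level names (time, np, pd, ssj, WS, TWO_GRAM, defaultdict) that fall outside the excerpts. A minimal preamble that would make them resolve, assuming WS and TWO_GRAM are the usual py_stringmatching tokenizers configured to return token sets:

import time
from collections import defaultdict

import numpy as np
import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as ssj

# Assumed tokenizer definitions; the Jaccard joins expect set-returning tokenizers.
WS = sm.WhitespaceTokenizer(return_set=True)
TWO_GRAM = sm.QgramTokenizer(qval=2, return_set=True)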
Example 2
 def apply_tables(self, ltable, rtable, l_key_attr, r_key_attr,
                  l_match_attr, r_match_attr, n_jobs=1):
     if self.sim_measure_type == 'JACCARD':
         return ssj.jaccard_join(ltable, rtable, l_key_attr, r_key_attr,
                                 l_match_attr, r_match_attr, self.tokenizer,
                                 self.threshold, comp_op=self.comp_op,
                                 n_jobs=n_jobs)
     elif self.sim_measure_type == 'COSINE':
         return ssj.cosine_join(ltable, rtable, l_key_attr, r_key_attr,
                                l_match_attr, r_match_attr, self.tokenizer,
                                self.threshold, comp_op=self.comp_op,
                                n_jobs=n_jobs)
     elif self.sim_measure_type == 'DICE':
         return ssj.dice_join(ltable, rtable, l_key_attr, r_key_attr,
                              l_match_attr, r_match_attr, self.tokenizer,
                              self.threshold, comp_op=self.comp_op,
                              n_jobs=n_jobs)
     elif self.sim_measure_type == 'EDIT_DISTANCE':
         return ssj.edit_distance_join(ltable, rtable,
                                       l_key_attr, r_key_attr,
                                       l_match_attr, r_match_attr,
                                       self.threshold, comp_op=self.comp_op,
                                       n_jobs=n_jobs)
     elif self.sim_measure_type == 'OVERLAP':
         return ssj.overlap_join(ltable, rtable, l_key_attr, r_key_attr,
                                 l_match_attr, r_match_attr, self.tokenizer,
                                 self.threshold, comp_op=self.comp_op,
                                 n_jobs=n_jobs)
     elif self.sim_measure_type == 'OVERLAP_COEFFICIENT':
         return ssj.overlap_coefficient_join(ltable, rtable,
                                             l_key_attr, r_key_attr,
                                             l_match_attr, r_match_attr,
                                             self.tokenizer, self.threshold,
                                             comp_op=self.comp_op,
                                             n_jobs=n_jobs)
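
The method dispatches purely on self.sim_measure_type, self.tokenizer, self.threshold, and self.comp_op, so any object carrying those four attributes can drive it. A hypothetical minimal wrapper and call (the class name and constructor are assumptions; only the attribute names come from the method body, and apply_tables is trimmed to its JACCARD branch for brevity):

import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as ssj

class SimJoinBlocker:
    def __init__(self, sim_measure_type, tokenizer, threshold, comp_op='>='):
        self.sim_measure_type = sim_measure_type
        self.tokenizer = tokenizer
        self.threshold = threshold
        self.comp_op = comp_op

    def apply_tables(self, ltable, rtable, l_key_attr, r_key_attr,
                     l_match_attr, r_match_attr, n_jobs=1):
        # Trimmed to the JACCARD branch; the full dispatch is the example above.
        if self.sim_measure_type == 'JACCARD':
            return ssj.jaccard_join(ltable, rtable, l_key_attr, r_key_attr,
                                    l_match_attr, r_match_attr,
                                    self.tokenizer, self.threshold,
                                    comp_op=self.comp_op, n_jobs=n_jobs)
        raise ValueError(self.sim_measure_type)

A = pd.DataFrame({'id': [1, 2], 'name': ['apple inc', 'appl inc']})
blocker = SimJoinBlocker('JACCARD', sm.QgramTokenizer(qval=3, return_set=True), 0.3)
pairs = blocker.apply_tables(A, A, 'id', 'id', 'name', 'name')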
Example 3
    def filter_person(self, df, lim):
        A = df.ENHANCEDPERSONS.dropna().apply(lambda x: list(
            set([y.split(',')[0].lower() for y in x.split(';')])))
        A = A.apply(pd.Series).stack()
        A.index = A.index.map(lambda i: "{}_{}".format(i[0], i[1]))
        A = A.reset_index()
        A.columns = ['id', 'name']

        B = self.fetch_person()
        # regex=True keeps the pattern a regex on pandas >= 2.0, where the default flipped
        B.name = B.name.str.replace(r'_+', ' ', regex=True).str.lower()

        qg3_tok = sm.QgramTokenizer(qval=3)
        C = ssj.jaccard_join(A,
                             B,
                             'id',
                             'id',
                             'name',
                             'name',
                             qg3_tok,
                             lim,
                             l_out_attrs=['name'],
                             r_out_attrs=['name'],
                             show_progress=False)

        return set(C.l_id.apply(lambda x: int(x.split("_")[0])))
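
The stack/reset_index dance above turns one row per document into one row per mentioned person, with ids like "0_0", "0_1". A toy illustration, assuming a GDELT-style ENHANCEDPERSONS field of semicolon-separated "name,offset" entries:

import pandas as pd

df = pd.DataFrame({'ENHANCEDPERSONS': ['Ada Lovelace,12;Alan Turing,40',
                                       'Alan Turing,7']})
A = df.ENHANCEDPERSONS.dropna().apply(
    lambda x: list({y.split(',')[0].lower() for y in x.split(';')}))
A = A.apply(pd.Series).stack()  # MultiIndex: (document row, person slot)
A.index = A.index.map(lambda i: "{}_{}".format(i[0], i[1]))
A = A.reset_index()
A.columns = ['id', 'name']
# A now holds three rows: ada lovelace and alan turing from row 0,
# alan turing from row 1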
Example 4
 def setUp(self):
     self.df = read_data(path_big_ten)
     self.trigramtok = sm.QgramTokenizer(qval=3)
     self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                           'name', 'name', self.trigramtok,
                                           0.3)
     self.jaccsim = sm.Jaccard()
     self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                      self.trigramtok, self.jaccsim)
Example 5
 def setUp(self):
     self.df = read_data(path_big_ten)
     self.trigramtok = sm.QgramTokenizer(qval=3)
     self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                           'name', 'name', self.trigramtok,
                                           0.3)
     self.jaccsim = sm.Jaccard()
     self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                      self.trigramtok, self.jaccsim)
     self.sim_matrix = get_sim_matrix(self.df, self.sim_scores)
     self.aggcl = AgglomerativeClustering(n_clusters=5,
                                          metric='precomputed',  # 'affinity' before scikit-learn 1.2
                                          linkage='complete')
     self.labels = self.aggcl.fit_predict(self.sim_matrix)
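
Both fixtures lean on read_data, get_sim_scores, and get_sim_matrix helpers that live outside the excerpt. Plausible stand-ins, assuming scores are per-pair similarities keyed by the l_id/r_id columns that jaccard_join emits, and that the matrix feeds the precomputed-metric clustering above (which expects distances, hence 1 - similarity):

import numpy as np

def get_sim_scores(df, blocked_pairs, tok, sim):
    # Hypothetical: score each blocked candidate pair with the given
    # tokenizer and py_stringmatching similarity measure.
    names = df.set_index('id')['name']
    return [(row['l_id'], row['r_id'],
             sim.get_sim_score(tok.tokenize(names[row['l_id']]),
                               tok.tokenize(names[row['r_id']])))
            for _, row in blocked_pairs.iterrows()]

def get_sim_matrix(df, sim_scores):
    # Hypothetical: dense distance matrix, assuming ids are 0..n-1 positions.
    n = len(df)
    matrix = np.ones((n, n))
    np.fill_diagonal(matrix, 0.0)
    for l, r, score in sim_scores:
        matrix[l, r] = matrix[r, l] = 1.0 - score
    return matrix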
Example 6
def get_similar_strings(table_path):
    """
    Get list of strings to be normalized by value normalizer.
    The current algorithm is as follows:
        1. Deduplicate the given list of strings.
        2. Apply string similar join.
        3. Retrive the top N similar strings as returned by string similar join.
    Arguments:
        table_path: The absolute path of list of strings.
    Returns:
        similar_strings: Similar strings as returned by the aforementioned algorithm.

    Note: This logic can be changed to improve the value normalizer part of the
          overall application.
    """
    A = pd.read_csv(table_path)
    B = pd.read_csv(table_path)
    qg3_tok = sm.QgramTokenizer(qval=3)
    output_pairs = ssj.jaccard_join(A,
                                    B,
                                    'id',
                                    'id',
                                    'foo',
                                    'foo',
                                    qg3_tok,
                                    0.6,
                                    l_out_attrs=['foo'],
                                    r_out_attrs=['foo'])
    similar_strings = []
    for _, row in output_pairs.iterrows():
        if row['_sim_score'] > 0.6 and row['_sim_score'] < 1.0:
            if row['l_foo'] not in similar_strings:
                similar_strings.append(row['l_foo'])
            if row['r_foo'] not in similar_strings:
                similar_strings.append(row['r_foo'])
            if len(similar_strings) >= 21:
                break
    similar_strings.sort()
    return similar_strings
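
get_similar_strings reads the same CSV twice and self-joins it, so the file is assumed to carry an id key column and a foo value column (names taken from the join call; the contents below are invented):

import pandas as pd

pd.DataFrame({'id': [0, 1, 2],
              'foo': ['microsoft corp', 'microsoft corp.', 'ibm']}
             ).to_csv('/tmp/strings.csv', index=False)
print(get_similar_strings('/tmp/strings.csv'))
# the two 'microsoft corp' variants should come back; exact self-matches
# (score 1.0) and unrelated rows like 'ibm' are filtered out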
Example 7
    def jaccard_similarity(self, dataset, threshold):

        df = add_key_reindex(dataset)
        # concatenate all columns and convert as one string
        # for each row with '*' as separator

        A = dataset.applymap(str)

        A = A.apply(lambda x: '*'.join(x.values.tolist()), axis=1)

        A = A.astype(str)

        A = A.str.replace(" ", "")

        df['row'] = A

        ssj.profile_table_for_join(df)

        ws = sm.WhitespaceTokenizer(return_set=True)

        # auto join
        output_pairs = ssj.jaccard_join(df, df, 'New_ID',
                                        'New_ID', 'row', 'row', ws,
                                        threshold, l_out_attrs=['row'],
                                        r_out_attrs=['row'], n_jobs=-1)

        # keep the smaller id of each duplicate pair so one representative
        # row survives (l != r would drop both sides of every matched pair)
        dup = output_pairs[output_pairs['l_New_ID']
                           < output_pairs['r_New_ID']]

        dataset = df[~df['New_ID'].isin(dup['r_New_ID'])].copy()

        dataset = dataset.drop(["New_ID", "row"], axis=1)

        print("Number of duplicate rows removed:", len(set(dup['r_New_ID'])))

        return dataset
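
jaccard_similarity depends on an add_key_reindex helper defined elsewhere; judging from the column names, it attaches a fresh New_ID key to a copy of the frame. A plausible stand-in (an assumption, not the original helper):

import pandas as pd

def add_key_reindex(dataset):
    # Copy the frame and attach a unique integer key column.
    df = dataset.reset_index(drop=True).copy()
    df['New_ID'] = df.index
    return df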
Example 8
def compute_joins(A, B, idA, idB, keyA, keyB, merged, compute=5):
    if compute in (5, 3):
        results = {'key_edit': ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-5),
                   'key_jac_ws': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-5),
                   'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-5),
                   'mer_jac_ws': ssj.jaccard_join(A, B, idA, idB, merged, merged, WS, 0.1, n_jobs=-5),
                   'mer_jac_2g': ssj.jaccard_join(A, B, idA, idB, merged, merged, TWO_GRAM, 0.1, n_jobs=-5)}
    elif compute == 2:
        results = {'key_jac_ws': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-5),
                   'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-5)}
    elif compute == 1:
        results = {'key_edit': ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-5)}
    elif compute == 0:
        results = {'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-5)}
    else:
        raise ValueError("unsupported compute level: {}".format(compute))
    return results
Example 9
 def apply_tables(self,
                  ltable,
                  rtable,
                  l_key_attr,
                  r_key_attr,
                  l_match_attr,
                  r_match_attr,
                  n_jobs=1):
     if self.sim_measure_type == 'JACCARD':
         return ssj.jaccard_join(ltable,
                                 rtable,
                                 l_key_attr,
                                 r_key_attr,
                                 l_match_attr,
                                 r_match_attr,
                                 self.tokenizer,
                                 self.threshold,
                                 comp_op=self.comp_op,
                                 n_jobs=n_jobs)
     elif self.sim_measure_type == 'COSINE':
         return ssj.cosine_join(ltable,
                                rtable,
                                l_key_attr,
                                r_key_attr,
                                l_match_attr,
                                r_match_attr,
                                self.tokenizer,
                                self.threshold,
                                comp_op=self.comp_op,
                                n_jobs=n_jobs)
     elif self.sim_measure_type == 'DICE':
         return ssj.dice_join(ltable,
                              rtable,
                              l_key_attr,
                              r_key_attr,
                              l_match_attr,
                              r_match_attr,
                              self.tokenizer,
                              self.threshold,
                              comp_op=self.comp_op,
                              n_jobs=n_jobs)
     elif self.sim_measure_type == 'EDIT_DISTANCE':
         return ssj.edit_distance_join(ltable,
                                       rtable,
                                       l_key_attr,
                                       r_key_attr,
                                       l_match_attr,
                                       r_match_attr,
                                       self.threshold,
                                       comp_op=self.comp_op,
                                       n_jobs=n_jobs)
     elif self.sim_measure_type == 'OVERLAP':
         return ssj.overlap_join(ltable,
                                 rtable,
                                 l_key_attr,
                                 r_key_attr,
                                 l_match_attr,
                                 r_match_attr,
                                 self.tokenizer,
                                 self.threshold,
                                 comp_op=self.comp_op,
                                 n_jobs=n_jobs)
     elif self.sim_measure_type == 'OVERLAP_COEFFICIENT':
         return ssj.overlap_coefficient_join(ltable,
                                             rtable,
                                             l_key_attr,
                                             r_key_attr,
                                             l_match_attr,
                                             r_match_attr,
                                             self.tokenizer,
                                             self.threshold,
                                             comp_op=self.comp_op,
                                             n_jobs=n_jobs)
Example 10
def time_all(iters=25):
    # Benchmark the five join variants from compute_all on every dataset and
    # collect the timings into one long-format frame.
    timings = defaultdict(list)
    join_types = [
        'key_edit', 'key_jac_ws', 'key_jac_2g', 'mer_jac_ws', 'mer_jac_2g'
    ]
    data_base = '/lfs/1/sahaana/enrichment/data'

    # Fuzzy Join
    A = data_base + "/main_fuzzy/test_tableA_processed.pkl"
    B = data_base + "/main_fuzzy/test_tableB_processed.pkl"
    supervision = data_base + "/main_fuzzy/supervision_test.pkl"
    idA = 'FUZZY_ID'
    idB = 'tconst'
    keyA = 'primaryTitle'
    keyB = 'primaryTitle'
    merged = 'merged_all'

    A = pd.read_pickle(A).reset_index()
    B = pd.read_pickle(B).reset_index()
    supervision = pd.read_pickle(supervision)
    supervision = supervision.set_index(idA)

    results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
    print('FJ')
    for j in range(len(join_types)):
        timings['data'].append('main_fuzzy')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1], timings['results'][-1],
              timings['avg'][-1])
    print()

    # IMDB_wiki
    A = data_base + "/imdb_wiki/dev_tableA_processed.pkl"
    B = data_base + "/imdb_wiki/dev_tableB_processed.pkl"
    supervision = data_base + "/imdb_wiki/supervision_test.pkl"
    idA = 'IMDB_ID'
    idB = 'QID'
    keyA = 'primaryTitle'
    keyB = 'title'
    merged = 'merged_all'

    A = pd.read_pickle(A).reset_index()
    B = pd.read_pickle(B).reset_index()
    supervision = pd.read_pickle(supervision)
    supervision = supervision.set_index(idA)

    results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
    print('imdb_wiki')
    for j in range(len(join_types)):
        timings['data'].append('imdb_wiki')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1], timings['results'][-1],
              timings['avg'][-1])
    print()

    # SQUAD
    A = data_base + "/SQuAD/dev_tableA_processed.pkl"
    B = data_base + "/SQuAD/dev_tableB_sent_processed.pkl"
    supervision = data_base + "/SQuAD/dev_sent_labels.pkl"
    idA = 'QID'
    idB = 'SID'
    keyA = 'question'
    keyB = 'sentence'
    merged = 'merged_all'

    A = pd.read_pickle(A).reset_index()
    B = pd.read_pickle(B).reset_index()
    supervision = pd.read_pickle(supervision)
    supervision = supervision.set_index(idA)

    results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
    print('SQUAD')
    for j in range(len(join_types)):
        timings['data'].append('squad')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1], timings['results'][-1],
              timings['avg'][-1])
    print()

    # Deepmatchers
    dm_data = {
        0: "joined_abt_buy_exp_data",
        1: "joined_amazon_google_exp_data",
        2: "joined_beer_exp_data",
        4: "joined_dblp_acm_exp_data",
        5: "joined_dblp_scholar_exp_data",
        6: "joined_dirty_dblp_acm_exp_data",
        7: "joined_dirty_dblp_scholar_exp_data",
        8: "joined_dirty_itunes_amazon_exp_data",
        9: "joined_dirty_walmart_amazon_exp_data",
        10: "joined_fodors_zagat_exp_data",
        11: "joined_itunes_amazon_exp_data",
        12: "joined_walmart_amazon_exp_data",
        30: "joined_company_exp_data"
    }

    dm_keys = {
        0: "name",
        1: "title",
        2: "Beer_Name",
        4: "title",
        5: "title",
        6: "title",
        7: "title",
        8: "Song_Name",
        9: "title",
        10: "name",
        11: "Song_Name",
        12: "title",
        30: "content"
    }
    for i in dm_data:
        print()
        print(dm_data[i])
        A = data_base + f"/dm_blocked/{dm_data[i]}/tableA_processed.pkl"
        B = data_base + f"/dm_blocked/{dm_data[i]}/tableB_processed.pkl"
        supervision = data_base + f"/dm_blocked/{dm_data[i]}/supervision_test.pkl"
        idA = 'id'
        idB = 'id'
        keyA = dm_keys[i]
        keyB = dm_keys[i]
        merged = 'merged_all'

        A = pd.read_pickle(A).reset_index()
        B = pd.read_pickle(B).reset_index()
        supervision = pd.read_pickle(supervision)
        supervision = supervision.set_index('ltable_id')
        results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
        print(dm_data[i])
        for j in range(len(join_types)):
            timings['data'].append(dm_data[i])
            timings['type'].append(join_types[j])
            timings['results'].append(results[j])
            timings['avg'].append(np.mean(results[j]))
            print(timings['data'][-1], timings['type'][-1],
                  timings['results'][-1], timings['avg'][-1])
        print()

    # MS Marco
    data_base = '/lfs/1/sahaana/enrichment/data/MSMARCO/'
    A = pd.read_pickle(data_base + 'dev_tableA_processed.pkl')
    B = pd.read_pickle(data_base + 'tableB_processed.pkl')
    supervision = pd.read_pickle(data_base +
                                 'supervision_test.pkl').set_index('QID')
    tk = pd.read_csv(data_base + 'top1000.dev',
                     sep='\t',
                     names=['QID', 'PID', 'Query', 'Passage'])
    grouped_tk = tk.groupby('QID').agg(list)
    grouped_tk['Query'] = grouped_tk['Query'].apply(lambda x: x[0])
    grouped_tk = grouped_tk.reset_index()
    ke = []
    jkws = []
    jk2g = []
    jws = []
    j2g = []

    for _ in range(iters):
        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.edit_distance_join(df_l,
                                            df_r,
                                            'QID',
                                            'PID',
                                            'Query',
                                            'Passage',
                                            100,
                                            l_out_attrs=['Query'],
                                            r_out_attrs=['Passage'],
                                            n_jobs=-3)
            tt += time.time() - start
        ke.append(tt)

        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.jaccard_join(df_l,
                                      df_r,
                                      'QID',
                                      'PID',
                                      'Query',
                                      'Passage',
                                      WS,
                                      0.05,
                                      l_out_attrs=['Query'],
                                      r_out_attrs=['Passage'],
                                      n_jobs=-3)
            tt += time.time() - start
        jkws.append(tt)
        jws.append(jkws[-1])  # no merged column here; reuse the timing for the merged slot

        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.jaccard_join(df_l,
                                      df_r,
                                      'QID',
                                      'PID',
                                      'Query',
                                      'Passage',
                                      TWO_GRAM,
                                      0.05,
                                      l_out_attrs=['Query'],
                                      r_out_attrs=['Passage'],
                                      n_jobs=-3)
            tt += time.time() - start
        jk2g.append(tt)
        j2g.append(jk2g[-1])  # likewise reuse the 2-gram timing

    results = (ke, jkws, jk2g, jws, j2g)
    print('marco')
    for j in range(len(join_types)):
        timings['data'].append('marco')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1], timings['results'][-1],
              timings['avg'][-1])
    print()

    timings = pd.DataFrame(timings)
    timings.to_pickle(
        f'/lfs/1/sahaana/enrichment/ember/postprocessing/simjoin-iters_{iters}.pkl'
    )