Example #1
import pandas as pd
from six import iteritems
from py_stringsimjoin.utils.converter import dataframe_column_to_str


def merge_candsets(candset_list,
                   candset_l_key_attr,
                   candset_r_key_attr,
                   num_trees,
                   vote_col='votes'):
    print(len(candset_list), candset_l_key_attr, candset_r_key_attr)
    vote_cnt = {}
    for candset in candset_list:
        # Find indices of l_key_attr and r_key_attr in candset.
        candset_columns = list(candset.columns.values)
        candset_l_key_attr_index = candset_columns.index(candset_l_key_attr)
        candset_r_key_attr_index = candset_columns.index(candset_r_key_attr)
        # Convert the key columns to strings so pair ids are uniform.
        dataframe_column_to_str(candset, candset_l_key_attr, inplace=True)
        dataframe_column_to_str(candset, candset_r_key_attr, inplace=True)
        # Count one vote per candidate set for every (l, r) pair it contains.
        for candset_row in candset.itertuples(index=False):
            pair_id = str(candset_row[candset_l_key_attr_index]) + ',' + str(
                candset_row[candset_r_key_attr_index])
            curr_votes = vote_cnt.get(pair_id, 0)
            vote_cnt[pair_id] = curr_votes + 1
    # Keep only pairs voted for by at least half of the trees.
    output_rows = []
    for pair_id, votes in iteritems(vote_cnt):
        if votes >= (num_trees / 2.0):
            fields = pair_id.split(',')
            output_rows.append([fields[0], fields[1], votes])
    return pd.DataFrame(output_rows, columns=['l_id', 'r_id', vote_col])
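A minimal sketch of how this voting merge might be exercised; the toy candidate sets below are illustrative, not from the original test data:

# Hypothetical toy candidate sets produced by two trees.
c1 = pd.DataFrame({'l_id': [1, 2], 'r_id': [10, 20]})
c2 = pd.DataFrame({'l_id': [1, 3], 'r_id': [10, 30]})

# With num_trees=2, a pair needs votes >= 2 / 2.0 = 1 to be kept,
# so every pair survives; pair (1, 10) carries 2 votes.
merged = merge_candsets([c1, c2], 'l_id', 'r_id', num_trees=2)
print(merged.sort_values('votes', ascending=False))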
    def test_candset_with_join_attr_of_type_int(self):
        A = pd.DataFrame([{'l_id': 1, 'l_attr':1990},
                          {'l_id': 2, 'l_attr':2000},
                          {'l_id': 3, 'l_attr':0},
                          {'l_id': 4, 'l_attr':-1},
                          {'l_id': 5, 'l_attr':1986}])
        B = pd.DataFrame([{'r_id': 1, 'r_attr':2001},
                          {'r_id': 2, 'r_attr':1992},
                          {'r_id': 3, 'r_attr':1886},
                          {'r_id': 4, 'r_attr':2007},
                          {'r_id': 5, 'r_attr':2012}])

        dataframe_column_to_str(A, 'l_attr', inplace=True)              
        dataframe_column_to_str(B, 'r_attr', inplace=True)  

        A['tmp_join_key'] = 1
        B['tmp_join_key'] = 1
        C = pd.merge(A[['l_id', 'tmp_join_key']],
                     B[['r_id', 'tmp_join_key']],
                     on='tmp_join_key').drop('tmp_join_key', axis=1)

        qg2_tok = QgramTokenizer(2, return_set=True)
        expected_pairs = set(['1,2', '1,3', '2,1', '2,4', '2,5',
                              '4,1', '5,2', '5,3'])
        self.test_filter_candset(qg2_tok, 1, '>=', False,
                                 (C, 'l_id', 'r_id',
                                  A, B, 'l_id', 'r_id',
                                  'l_attr', 'r_attr'),
                                 expected_pairs)
    def test_int_col_with_inplace(self):
        assert_equal(self.dataframe['int_col'].dtype, int)
        flag = dataframe_column_to_str(self.dataframe, 'int_col',
                                       inplace=True, return_col=False)
        assert_equal(flag, True)
        assert_equal(self.dataframe['int_col'].dtype, object)
        assert_equal(sum(pd.isnull(self.dataframe['int_col'])), 0)

    def test_int_col(self):
        assert_equal(self.dataframe['int_col'].dtype, int)
        out_df = dataframe_column_to_str(self.dataframe, 'int_col',
                                         inplace=False, return_col=False)
        assert_equal(type(out_df), pd.DataFrame)
        assert_equal(out_df['int_col'].dtype, object)
        assert_equal(self.dataframe['int_col'].dtype, int)
        assert_equal(sum(pd.isnull(out_df['int_col'])), 0)

    def test_int_col_with_return_col(self):
        assert_equal(self.dataframe['int_col'].dtype, int)
        out_series = dataframe_column_to_str(self.dataframe, 'int_col',
                                             inplace=False, return_col=True)
        assert_equal(type(out_series), pd.Series)
        assert_equal(out_series.dtype, object)
        assert_equal(self.dataframe['int_col'].dtype, int)
        assert_equal(sum(pd.isnull(out_series)), 0)
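Read together, the three tests above pin down the calling conventions of dataframe_column_to_str. A standalone sketch, assuming the usual py_stringsimjoin import path:

import pandas as pd
from py_stringsimjoin.utils.converter import dataframe_column_to_str

df = pd.DataFrame({'int_col': [1990, 2000, 1986]})

# inplace=True mutates df and returns a boolean success flag.
flag = dataframe_column_to_str(df, 'int_col', inplace=True, return_col=False)

# inplace=False, return_col=False returns a new dataframe with the
# converted column; the input dataframe is left untouched.
out_df = dataframe_column_to_str(df, 'int_col', inplace=False, return_col=False)

# inplace=False, return_col=True returns only the converted column
# as a pandas Series.
out_series = dataframe_column_to_str(df, 'int_col', inplace=False, return_col=True)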
Example #7
    def test_jac_qg2_with_filter_attr_of_type_int(self):
        A = pd.DataFrame([{'l_id': 1, 'l_attr': 1990},
                          {'l_id': 2, 'l_attr': 2000},
                          {'l_id': 3, 'l_attr': 0},
                          {'l_id': 4, 'l_attr': -1},
                          {'l_id': 5, 'l_attr': 1986}])
        B = pd.DataFrame([{'r_id': 1, 'r_attr': 2001},
                          {'r_id': 2, 'r_attr': 1992},
                          {'r_id': 3, 'r_attr': 1886},
                          {'r_id': 4, 'r_attr': 2007},
                          {'r_id': 5, 'r_attr': 2012}])

        dataframe_column_to_str(A, 'l_attr', inplace=True)
        dataframe_column_to_str(B, 'r_attr', inplace=True)

        qg2_tok = QgramTokenizer(2, return_set=True)
        expected_pairs = set(['1,1', '1,2', '1,3', '1,4', '1,5',
                              '2,1', '2,2', '2,3', '2,4', '2,5',
                              '5,1', '5,2', '5,3', '5,4', '5,5'])
        self.test_filter_tables(qg2_tok, 'JACCARD', 0.8, False, False,
                                (A, B, 'l_id', 'r_id', 'l_attr', 'r_attr'),
                                expected_pairs)
    def setUp(self):
        ltable_path = os.sep.join(['data', 'table_A.csv'])
        rtable_path = os.sep.join(['data', 'table_B.csv'])
        # load input tables for the tests.
        self.ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                               ltable_path))
        self.rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                               rtable_path))

        self.l_key_attr = 'A.ID'
        self.r_key_attr = 'B.ID'
        self.l_join_attr = 'A.name'
        self.r_join_attr = 'B.name'

        # convert zipcode column to string
        dataframe_column_to_str(self.ltable, 'A.zipcode', inplace=True)              
        dataframe_column_to_str(self.rtable, 'B.zipcode', inplace=True) 

        # copy of tables without removing any rows with missing value.
        # needed to test allow_missing option.
        self.orig_ltable = self.ltable.copy()
        self.orig_rtable = self.rtable.copy()

        # remove rows with null value in join attribute 
        self.ltable = self.ltable[pd.notnull(
                          self.ltable[self.l_join_attr])]
        self.rtable = self.rtable[pd.notnull(
                          self.rtable[self.r_join_attr])]

        # generate cartesian product to be used as candset
        self.ltable['tmp_join_key'] = 1
        self.rtable['tmp_join_key'] = 1
        self.cartprod = pd.merge(self.ltable[[self.l_key_attr,
                                              self.l_join_attr,
                                              'A.zipcode',
                                              'tmp_join_key']],
                                 self.rtable[[self.r_key_attr,
                                              self.r_join_attr,
                                              'B.zipcode',
                                              'tmp_join_key']],
                                 on='tmp_join_key').drop('tmp_join_key', axis=1)
        self.ltable = self.ltable.drop('tmp_join_key', axis=1)
        self.rtable = self.rtable.drop('tmp_join_key', axis=1)
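The tmp_join_key idiom above emulates a cross join. On pandas 1.2 or newer, the same candset can be built directly with a cross merge (a sketch, not what the original suite uses):

        self.cartprod = pd.merge(
            self.ltable[[self.l_key_attr, self.l_join_attr, 'A.zipcode']],
            self.rtable[[self.r_key_attr, self.r_join_attr, 'B.zipcode']],
            how='cross')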
    def test_str_col_with_inplace(self):
        assert_equal(self.dataframe['str_col'].dtype, object)
        nan_cnt_before = sum(pd.isnull(self.dataframe['str_col']))
        flag = dataframe_column_to_str(self.dataframe, 'str_col',
                                       inplace=True, return_col=False)
        assert_equal(flag, True)
        assert_equal(self.dataframe['str_col'].dtype, object)
        nan_cnt_after = sum(pd.isnull(self.dataframe['str_col']))
        assert_equal(nan_cnt_before, nan_cnt_after)
    def test_jac_qg2_with_filter_attr_of_type_int(self):
        A = pd.DataFrame([{'l_id': 1, 'l_attr':1990},
                          {'l_id': 2, 'l_attr':2000},
                          {'l_id': 3, 'l_attr':0},
                          {'l_id': 4, 'l_attr':-1},
                          {'l_id': 5, 'l_attr':1986}])
        B = pd.DataFrame([{'r_id': 1, 'r_attr':2001},
                          {'r_id': 2, 'r_attr':1992},
                          {'r_id': 3, 'r_attr':1886},
                          {'r_id': 4, 'r_attr':2007},
                          {'r_id': 5, 'r_attr':2012}])

        dataframe_column_to_str(A, 'l_attr', inplace=True)                      
        dataframe_column_to_str(B, 'r_attr', inplace=True)

        qg2_tok = QgramTokenizer(2, return_set=True)
        self.test_filter_tables(qg2_tok, 'JACCARD', 0.3, False, False,
                                (A, B,
                                'l_id', 'r_id', 'l_attr', 'r_attr'))
    def test_str_col_with_return_col(self):
        assert_equal(self.dataframe['str_col'].dtype, object)
        nan_cnt_before = sum(pd.isnull(self.dataframe['str_col']))
        out_series = dataframe_column_to_str(self.dataframe, 'str_col',
                                             inplace=False, return_col=True)
        assert_equal(type(out_series), pd.Series)
        assert_equal(out_series.dtype, object)
        assert_equal(self.dataframe['str_col'].dtype, object)
        nan_cnt_after = sum(pd.isnull(out_series))
        assert_equal(nan_cnt_before, nan_cnt_after)
    def test_float_col_with_int_val(self):
        assert_equal(self.dataframe['float_col_with_int_val'].dtype, float)
        out_df = dataframe_column_to_str(self.dataframe,
                                         'float_col_with_int_val',
                                         inplace=False, return_col=False)
        assert_equal(type(out_df), pd.DataFrame)
        assert_equal(out_df['float_col_with_int_val'].dtype, object)
        assert_equal(self.dataframe['float_col_with_int_val'].dtype, float)
        assert_equal(sum(pd.isnull(self.dataframe['float_col_with_int_val'])),
                     sum(pd.isnull(out_df['float_col_with_int_val'])))
        # Integer-valued floats should be rendered without the trailing '.0'.
        for idx, row in self.dataframe.iterrows():
            if pd.isnull(row['float_col_with_int_val']):
                continue
            assert_equal(str(int(row['float_col_with_int_val'])),
                         out_df.loc[idx, 'float_col_with_int_val'])
    def test_invalid_col_name(self):
        dataframe_column_to_str(self.dataframe, 'invalid_col')

    def test_invalid_flag_combination(self):
        dataframe_column_to_str(self.dataframe, 'str_col',
                                inplace=True, return_col=True)

    def test_invalid_return_col_flag(self):
        dataframe_column_to_str(self.dataframe, 'str_col',
                                inplace=True, return_col=None)

    def test_invalid_inplace_flag(self):
        dataframe_column_to_str(self.dataframe, 'str_col', inplace=None)
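The invalid-input tests above call dataframe_column_to_str with bad arguments and contain no assertions; in a nose-based suite such tests are typically decorated so the expected error is what makes the test pass. A sketch under that assumption (the error type is assumed, not taken from this listing):

    from nose.tools import raises

    @raises(AssertionError)
    def test_invalid_inplace_flag(self):
        # A non-boolean inplace flag should fail input validation
        # (assumption: the library validates flags with assertions).
        dataframe_column_to_str(self.dataframe, 'str_col', inplace=None)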
def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 4 and args[4]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows(): 
                if (pd.isnull(l_row[l_join_attr]) or
                    pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    if len(args) > 3 and (not args[3]):
        ltable_not_missing = ltable_not_missing[ltable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[l_join_attr]))), 1) > 0]
        rtable_not_missing = rtable_not_missing[rtable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[r_join_attr]))), 1) > 0]

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(ltable_not_missing[[l_key_attr,
                                            l_join_attr,
                                            'tmp_join_key']],
                        rtable_not_missing[[r_key_attr,
                                            r_join_attr,
                                            'tmp_join_key']],
                        on='tmp_join_key').drop('tmp_join_key', axis=1)
    ltable_not_missing = ltable_not_missing.drop('tmp_join_key', axis=1)
    rtable_not_missing = rtable_not_missing.drop('tmp_join_key', axis=1)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(lambda row: round(sim_func(
                args[0].tokenize(str(row[l_join_attr])),
                args[0].tokenize(str(row[r_join_attr]))), 4),
            axis=1)
   
    comp_fn = COMP_OP_MAP[DEFAULT_COMP_OP]
    # Check for comp_op in args.
    if len(args) > 2:
        comp_fn = COMP_OP_MAP[args[2]]

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), args[1]):
            expected_pairs.add(','.join((str(row[l_key_attr]),
                                         str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = args[0].get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable,
                             l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr,
                             *args)

    assert_equal(args[0].get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 7:
        l_out_prefix = args[7]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 8:
        r_out_prefix = args[8]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 5:
        if args[5]:
            l_out_attrs = remove_redundant_attrs(args[5], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            r_out_attrs = remove_redundant_attrs(args[6], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args. 
    if len(args) > 9:
        if args[9]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))
   
    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
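This harness leans on lookup tables such as JOIN_FN_MAP and COMP_OP_MAP that live in the test utilities. A plausible sketch of their shape, assuming py_stringsimjoin's public join functions (the exact contents are not shown in this listing):

import operator
from py_stringsimjoin import (cosine_join, dice_join, jaccard_join,
                              overlap_join)

JOIN_FN_MAP = {'COSINE': cosine_join, 'DICE': dice_join,
               'JACCARD': jaccard_join, 'OVERLAP': overlap_join}
COMP_OP_MAP = {'>=': operator.ge, '>': operator.gt, '<=': operator.le,
               '<': operator.lt, '=': operator.eq, '!=': operator.ne}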
def test_valid_join(scenario,
                    tok,
                    threshold,
                    comp_op=DEFAULT_COMP_OP,
                    args=(),
                    convert_to_str=False):
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr])
                        or pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join(
                        (str(l_row[l_key_attr]), str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', axis=1)
    ltable_not_missing = ltable_not_missing.drop('tmp_join_key', axis=1)
    rtable_not_missing = rtable_not_missing.drop('tmp_join_key', axis=1)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))

        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join(
                    (str(row[l_key_attr]), str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = edit_distance_join(ltable,
                                        rtable,
                                        l_key_attr,
                                        r_key_attr,
                                        l_join_attr,
                                        r_join_attr,
                                        threshold,
                                        comp_op,
                                        *args,
                                        tokenizer=tok)

    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args.
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
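For reference, the join under test can also be invoked directly; a minimal sketch assuming py_stringsimjoin's public edit_distance_join and tables shaped like those loaded in the harness above:

from py_stringsimjoin import edit_distance_join

# Keep name pairs within edit distance 2; ltable/rtable as loaded above.
pairs = edit_distance_join(ltable, rtable,
                           'A.ID', 'B.ID', 'A.name', 'B.name',
                           threshold=2)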
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP, args=(),
                    convert_to_str=False, data_limit=100000,
                    temp_dir=os.getcwd(),
                    output_file_path=default_output_file_path):
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:                                                          
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)              
        dataframe_column_to_str(rtable, r_join_attr, inplace=True) 

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                    pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(ltable_not_missing[[l_key_attr,
                                            l_join_attr,
                                            'tmp_join_key']],
                        rtable_not_missing[[r_key_attr,
                                            r_join_attr,
                                            'tmp_join_key']],
                        on='tmp_join_key').drop('tmp_join_key', axis=1)
    ltable_not_missing = ltable_not_missing.drop('tmp_join_key', axis=1)
    rtable_not_missing = rtable_not_missing.drop('tmp_join_key', axis=1)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
                str(row[l_join_attr]), str(row[r_join_attr])),
            axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))

        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()
    
    # Remove any previously existing output file.
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    # Use the disk-based join to process the input data. It returns a
    # boolean success flag instead of a dataframe.
    is_success = disk_edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, data_limit,
                                         comp_op, *args,
                                         tokenizer=tok, temp_dir=temp_dir,
                                         output_file_path=output_file_path)
    # Use the in-memory edit distance join to get a dataframe to compare
    # against.
    no_disk_candset = edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, comp_op,
                                         *args, tokenizer=tok)
    # Delete _id to make the schema consistent for comparison.
    if '_id' in no_disk_candset:
        del no_disk_candset['_id']

    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = []
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args. 
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # Verify that the output file was created.
    assert_equal(True, os.path.exists(output_file_path))

    # verify whether the output table has the necessary attributes.
    actual_candset = pd.read_csv(output_file_path)

    # Comparing column header values
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)
    assert_list_equal(list(no_disk_candset.columns.values),
                      list(actual_candset.columns.values))

    actual_pairs = set()
    no_disk_pairs = set() 

    # Creating sets for comparing the data tuples
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    for idx, row in no_disk_candset.iterrows():
        no_disk_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                    str(row[r_out_prefix + r_key_attr]))))
   
    # Verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(expected_pairs), len(no_disk_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    common_pairs_no_disk = no_disk_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
    assert_equal(len(common_pairs_no_disk), len(expected_pairs))
    def test_invalid_dataframe(self):
        dataframe_column_to_str([], 'test_col')