def merge_candsets(candset_list, candset_l_key_attr, candset_r_key_attr,
                   num_trees, vote_col='votes'):
    """Merge candidate sets by majority vote.

    Counts, for each (left key, right key) pair, how many candidate sets in
    candset_list contain it, and keeps pairs that appear in at least half of
    the num_trees candidate sets.

    Args:
        candset_list: list of pandas DataFrames, one candset per tree.
        candset_l_key_attr: name of the left-key column in each candset.
        candset_r_key_attr: name of the right-key column in each candset.
        num_trees: total number of trees that produced the candsets.
        vote_col: name of the vote-count column in the output.

    Returns:
        pandas DataFrame with columns ['l_id', 'r_id', vote_col].
    """
    # Debug trace kept from the original implementation (converted from the
    # Python 2 print statement, which is a SyntaxError on Python 3).
    print(len(candset_list), candset_l_key_attr, candset_r_key_attr)

    vote_cnt = {}
    for candset in candset_list:
        # Find positional indices of the key attributes so that rows from
        # itertuples(index=False) can be indexed directly.
        candset_columns = list(candset.columns.values)
        candset_l_key_attr_index = candset_columns.index(candset_l_key_attr)
        candset_r_key_attr_index = candset_columns.index(candset_r_key_attr)

        dataframe_column_to_str(candset, candset_l_key_attr, inplace=True)
        dataframe_column_to_str(candset, candset_r_key_attr, inplace=True)

        for candset_row in candset.itertuples(index=False):
            # Use a tuple key rather than a comma-joined string so that key
            # values containing commas cannot corrupt the pair identity.
            pair_id = (str(candset_row[candset_l_key_attr_index]),
                       str(candset_row[candset_r_key_attr_index]))
            vote_cnt[pair_id] = vote_cnt.get(pair_id, 0) + 1

    # Keep the pairs voted for by at least half of the trees.
    output_rows = [[l_key, r_key, votes]
                   for (l_key, r_key), votes in vote_cnt.items()
                   if votes >= (num_trees / 2.0)]
    return pd.DataFrame(output_rows, columns=['l_id', 'r_id', vote_col])
def test_candset_with_join_attr_of_type_int(self):
    """Filter a candset whose join attributes are integers.

    The integer attributes are converted to strings before filtering,
    since the tokenizers operate on strings.
    """
    A = pd.DataFrame([{'l_id': 1, 'l_attr': 1990},
                      {'l_id': 2, 'l_attr': 2000},
                      {'l_id': 3, 'l_attr': 0},
                      {'l_id': 4, 'l_attr': -1},
                      {'l_id': 5, 'l_attr': 1986}])
    B = pd.DataFrame([{'r_id': 1, 'r_attr': 2001},
                      {'r_id': 2, 'r_attr': 1992},
                      {'r_id': 3, 'r_attr': 1886},
                      {'r_id': 4, 'r_attr': 2007},
                      {'r_id': 5, 'r_attr': 2012}])

    dataframe_column_to_str(A, 'l_attr', inplace=True)
    dataframe_column_to_str(B, 'r_attr', inplace=True)

    # Build the cartesian product of A and B to use as the candset.
    # Use drop(columns=...): the positional axis argument was removed
    # in pandas 2.0.
    A['tmp_join_key'] = 1
    B['tmp_join_key'] = 1
    C = pd.merge(A[['l_id', 'tmp_join_key']],
                 B[['r_id', 'tmp_join_key']],
                 on='tmp_join_key').drop(columns='tmp_join_key')

    qg2_tok = QgramTokenizer(2, return_set=True)
    expected_pairs = set(['1,2', '1,3', '2,1', '2,4', '2,5',
                          '4,1', '5,2', '5,3'])
    self.test_filter_candset(qg2_tok, 1, '>=', False,
                             (C, 'l_id', 'r_id', A, B, 'l_id', 'r_id',
                              'l_attr', 'r_attr'),
                             expected_pairs)
def test_candset_with_join_attr_of_type_int(self):
    """Filter a candset whose join attributes are integers.

    The integer attributes are converted to strings before filtering,
    since the tokenizers operate on strings.
    """
    A = pd.DataFrame([{'l_id': 1, 'l_attr': 1990},
                      {'l_id': 2, 'l_attr': 2000},
                      {'l_id': 3, 'l_attr': 0},
                      {'l_id': 4, 'l_attr': -1},
                      {'l_id': 5, 'l_attr': 1986}])
    B = pd.DataFrame([{'r_id': 1, 'r_attr': 2001},
                      {'r_id': 2, 'r_attr': 1992},
                      {'r_id': 3, 'r_attr': 1886},
                      {'r_id': 4, 'r_attr': 2007},
                      {'r_id': 5, 'r_attr': 2012}])

    dataframe_column_to_str(A, 'l_attr', inplace=True)
    dataframe_column_to_str(B, 'r_attr', inplace=True)

    # Build the cartesian product of A and B to use as the candset.
    # Use drop(columns=...): the positional axis argument was removed
    # in pandas 2.0.
    A['tmp_join_key'] = 1
    B['tmp_join_key'] = 1
    C = pd.merge(A[['l_id', 'tmp_join_key']],
                 B[['r_id', 'tmp_join_key']],
                 on='tmp_join_key').drop(columns='tmp_join_key')

    qg2_tok = QgramTokenizer(2, return_set=True)
    expected_pairs = set(['1,2', '1,3', '2,1', '2,4', '2,5',
                          '4,1', '5,2', '5,3'])
    self.test_filter_candset(qg2_tok, 1, '>=', False,
                             (C, 'l_id', 'r_id', A, B, 'l_id', 'r_id',
                              'l_attr', 'r_attr'),
                             expected_pairs)
def test_int_col_with_inplace(self):
    """In-place conversion of an int column must succeed and leave an
    object-dtype column with no NaN values."""
    assert_equal(self.dataframe['int_col'].dtype, int)
    flag = dataframe_column_to_str(self.dataframe, 'int_col',
                                   inplace=True, return_col=False)
    assert_equal(flag, True)
    converted = self.dataframe['int_col']
    assert_equal(converted.dtype, object)
    assert_equal(sum(pd.isnull(converted)), 0)
def test_int_col(self):
    """Non-inplace conversion returns a new DataFrame and must not touch
    the source int column."""
    assert_equal(self.dataframe['int_col'].dtype, int)
    result = dataframe_column_to_str(self.dataframe, 'int_col',
                                     inplace=False, return_col=False)
    assert_equal(type(result), pd.DataFrame)
    assert_equal(result['int_col'].dtype, object)
    # The source column keeps its original dtype.
    assert_equal(self.dataframe['int_col'].dtype, int)
    assert_equal(sum(pd.isnull(result['int_col'])), 0)
def test_int_col_with_return_col(self):
    """Requesting the converted column back as a Series must not touch
    the source int column."""
    assert_equal(self.dataframe['int_col'].dtype, int)
    result_col = dataframe_column_to_str(self.dataframe, 'int_col',
                                         inplace=False, return_col=True)
    assert_equal(type(result_col), pd.Series)
    assert_equal(result_col.dtype, object)
    # The source column keeps its original dtype.
    assert_equal(self.dataframe['int_col'].dtype, int)
    assert_equal(sum(pd.isnull(result_col)), 0)
def test_jac_qg2_with_filter_attr_of_type_int(self):
    """Jaccard filter (qgram-2 tokenizer, threshold 0.8) over integer
    attributes, which are stringified before filtering."""
    left = pd.DataFrame([{'l_id': 1, 'l_attr': 1990},
                         {'l_id': 2, 'l_attr': 2000},
                         {'l_id': 3, 'l_attr': 0},
                         {'l_id': 4, 'l_attr': -1},
                         {'l_id': 5, 'l_attr': 1986}])
    right = pd.DataFrame([{'r_id': 1, 'r_attr': 2001},
                          {'r_id': 2, 'r_attr': 1992},
                          {'r_id': 3, 'r_attr': 1886},
                          {'r_id': 4, 'r_attr': 2007},
                          {'r_id': 5, 'r_attr': 2012}])

    dataframe_column_to_str(left, 'l_attr', inplace=True)
    dataframe_column_to_str(right, 'r_attr', inplace=True)

    tokenizer = QgramTokenizer(2, return_set=True)
    expected_pairs = {'1,1', '1,2', '1,3', '1,4', '1,5',
                      '2,1', '2,2', '2,3', '2,4', '2,5',
                      '5,1', '5,2', '5,3', '5,4', '5,5'}
    self.test_filter_tables(tokenizer, 'JACCARD', 0.8, False, False,
                            (left, right, 'l_id', 'r_id',
                             'l_attr', 'r_attr'),
                            expected_pairs)
def setUp(self):
    """Load the input tables and build the cartesian-product candset
    shared by the tests in this class."""
    ltable_path = os.sep.join(['data', 'table_A.csv'])
    rtable_path = os.sep.join(['data', 'table_B.csv'])

    # load input tables for the tests.
    self.ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                           ltable_path))
    self.rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                           rtable_path))

    self.l_key_attr = 'A.ID'
    self.r_key_attr = 'B.ID'
    self.l_join_attr = 'A.name'
    self.r_join_attr = 'B.name'

    # convert zipcode column to string
    dataframe_column_to_str(self.ltable, 'A.zipcode', inplace=True)
    dataframe_column_to_str(self.rtable, 'B.zipcode', inplace=True)

    # copy of tables without removing any rows with missing value.
    # needed to test allow_missing option.
    self.orig_ltable = self.ltable.copy()
    self.orig_rtable = self.rtable.copy()

    # remove rows with null value in join attribute
    self.ltable = self.ltable[pd.notnull(self.ltable[self.l_join_attr])]
    self.rtable = self.rtable[pd.notnull(self.rtable[self.r_join_attr])]

    # generate cartesian product to be used as candset.
    # drop(columns=...) replaces the positional axis argument, which was
    # removed in pandas 2.0.
    self.ltable['tmp_join_key'] = 1
    self.rtable['tmp_join_key'] = 1
    self.cartprod = pd.merge(
        self.ltable[[self.l_key_attr, self.l_join_attr,
                     'A.zipcode', 'tmp_join_key']],
        self.rtable[[self.r_key_attr, self.r_join_attr,
                     'B.zipcode', 'tmp_join_key']],
        on='tmp_join_key').drop(columns='tmp_join_key')

    # Drop the helper column in place: the original code discarded the
    # (non-inplace) result of drop(), leaving tmp_join_key in the tables.
    self.ltable.drop(columns='tmp_join_key', inplace=True)
    self.rtable.drop(columns='tmp_join_key', inplace=True)
def test_str_col_with_inplace(self):
    """In-place conversion of an already-string column must succeed
    without changing its dtype or its NaN count."""
    assert_equal(self.dataframe['str_col'].dtype, object)
    nulls_before = sum(pd.isnull(self.dataframe['str_col']))
    flag = dataframe_column_to_str(self.dataframe, 'str_col',
                                   inplace=True, return_col=False)
    assert_equal(flag, True)
    converted = self.dataframe['str_col']
    assert_equal(converted.dtype, object)
    assert_equal(nulls_before, sum(pd.isnull(converted)))
def test_int_col_with_inplace(self):
    """In-place conversion of an int column must succeed and leave an
    object-dtype column with no NaN values."""
    assert_equal(self.dataframe['int_col'].dtype, int)
    ok = dataframe_column_to_str(self.dataframe, 'int_col',
                                 inplace=True, return_col=False)
    assert_equal(ok, True)
    col = self.dataframe['int_col']
    assert_equal(col.dtype, object)
    assert_equal(sum(pd.isnull(col)), 0)
def test_jac_qg2_with_filter_attr_of_type_int(self):
    """Jaccard filter (qgram-2 tokenizer, threshold 0.3) over integer
    attributes, which are stringified before filtering."""
    left = pd.DataFrame([{'l_id': 1, 'l_attr': 1990},
                         {'l_id': 2, 'l_attr': 2000},
                         {'l_id': 3, 'l_attr': 0},
                         {'l_id': 4, 'l_attr': -1},
                         {'l_id': 5, 'l_attr': 1986}])
    right = pd.DataFrame([{'r_id': 1, 'r_attr': 2001},
                          {'r_id': 2, 'r_attr': 1992},
                          {'r_id': 3, 'r_attr': 1886},
                          {'r_id': 4, 'r_attr': 2007},
                          {'r_id': 5, 'r_attr': 2012}])

    dataframe_column_to_str(left, 'l_attr', inplace=True)
    dataframe_column_to_str(right, 'r_attr', inplace=True)

    tokenizer = QgramTokenizer(2, return_set=True)
    self.test_filter_tables(tokenizer, 'JACCARD', 0.3, False, False,
                            (left, right, 'l_id', 'r_id',
                             'l_attr', 'r_attr'))
def test_jac_qg2_with_filter_attr_of_type_int(self):
    """Jaccard filter (qgram-2 tokenizer, threshold 0.3) over integer
    attributes, which are stringified before filtering."""
    table_a = pd.DataFrame([{'l_id': 1, 'l_attr': 1990},
                            {'l_id': 2, 'l_attr': 2000},
                            {'l_id': 3, 'l_attr': 0},
                            {'l_id': 4, 'l_attr': -1},
                            {'l_id': 5, 'l_attr': 1986}])
    table_b = pd.DataFrame([{'r_id': 1, 'r_attr': 2001},
                            {'r_id': 2, 'r_attr': 1992},
                            {'r_id': 3, 'r_attr': 1886},
                            {'r_id': 4, 'r_attr': 2007},
                            {'r_id': 5, 'r_attr': 2012}])

    dataframe_column_to_str(table_a, 'l_attr', inplace=True)
    dataframe_column_to_str(table_b, 'r_attr', inplace=True)

    tok = QgramTokenizer(2, return_set=True)
    self.test_filter_tables(tok, 'JACCARD', 0.3, False, False,
                            (table_a, table_b, 'l_id', 'r_id',
                             'l_attr', 'r_attr'))
def test_int_col_with_return_col(self):
    """Requesting the converted column back as a Series must not touch
    the source int column."""
    assert_equal(self.dataframe['int_col'].dtype, int)
    series = dataframe_column_to_str(self.dataframe, 'int_col',
                                     inplace=False, return_col=True)
    assert_equal(type(series), pd.Series)
    assert_equal(series.dtype, object)
    # The source column keeps its original dtype.
    assert_equal(self.dataframe['int_col'].dtype, int)
    assert_equal(sum(pd.isnull(series)), 0)
def test_int_col(self):
    """Non-inplace conversion returns a new DataFrame and must not touch
    the source int column."""
    assert_equal(self.dataframe['int_col'].dtype, int)
    new_df = dataframe_column_to_str(self.dataframe, 'int_col',
                                     inplace=False, return_col=False)
    assert_equal(type(new_df), pd.DataFrame)
    assert_equal(new_df['int_col'].dtype, object)
    # The source column keeps its original dtype.
    assert_equal(self.dataframe['int_col'].dtype, int)
    assert_equal(sum(pd.isnull(new_df['int_col'])), 0)
def test_str_col_with_return_col(self):
    """Requesting the converted string column back as a Series must leave
    the source column untouched and preserve the NaN count."""
    assert_equal(self.dataframe['str_col'].dtype, object)
    nulls_before = sum(pd.isnull(self.dataframe['str_col']))
    series = dataframe_column_to_str(self.dataframe, 'str_col',
                                     inplace=False, return_col=True)
    assert_equal(type(series), pd.Series)
    assert_equal(series.dtype, object)
    assert_equal(self.dataframe['str_col'].dtype, object)
    assert_equal(nulls_before, sum(pd.isnull(series)))
def test_str_col_with_inplace(self):
    """In-place conversion of an already-string column must succeed
    without changing its dtype or its NaN count."""
    assert_equal(self.dataframe['str_col'].dtype, object)
    null_count = sum(pd.isnull(self.dataframe['str_col']))
    ok = dataframe_column_to_str(self.dataframe, 'str_col',
                                 inplace=True, return_col=False)
    assert_equal(ok, True)
    assert_equal(self.dataframe['str_col'].dtype, object)
    assert_equal(null_count, sum(pd.isnull(self.dataframe['str_col'])))
def merge_candsets(candset_list, candset_l_key_attr, candset_r_key_attr,
                   num_trees, vote_col='votes'):
    """Merge candidate sets by majority vote.

    Counts, for each (left key, right key) pair, how many candidate sets in
    candset_list contain it, and keeps pairs that appear in at least half of
    the num_trees candidate sets.

    Args:
        candset_list: list of pandas DataFrames, one candset per tree.
        candset_l_key_attr: name of the left-key column in each candset.
        candset_r_key_attr: name of the right-key column in each candset.
        num_trees: total number of trees that produced the candsets.
        vote_col: name of the vote-count column in the output.

    Returns:
        pandas DataFrame with columns ['l_id', 'r_id', vote_col].
    """
    # Debug trace kept from the original implementation (converted from the
    # Python 2 print statement, which is a SyntaxError on Python 3).
    print(len(candset_list), candset_l_key_attr, candset_r_key_attr)

    vote_cnt = {}
    for candset in candset_list:
        # Find positional indices of the key attributes so that rows from
        # itertuples(index=False) can be indexed directly.
        candset_columns = list(candset.columns.values)
        candset_l_key_attr_index = candset_columns.index(candset_l_key_attr)
        candset_r_key_attr_index = candset_columns.index(candset_r_key_attr)

        dataframe_column_to_str(candset, candset_l_key_attr, inplace=True)
        dataframe_column_to_str(candset, candset_r_key_attr, inplace=True)

        for candset_row in candset.itertuples(index=False):
            # Use a tuple key rather than a comma-joined string so that key
            # values containing commas cannot corrupt the pair identity.
            pair_id = (str(candset_row[candset_l_key_attr_index]),
                       str(candset_row[candset_r_key_attr_index]))
            vote_cnt[pair_id] = vote_cnt.get(pair_id, 0) + 1

    # Keep the pairs voted for by at least half of the trees.
    output_rows = [[l_key, r_key, votes]
                   for (l_key, r_key), votes in vote_cnt.items()
                   if votes >= (num_trees / 2.0)]
    return pd.DataFrame(output_rows, columns=['l_id', 'r_id', vote_col])
def test_str_col_with_return_col(self):
    """Requesting the converted string column back as a Series must leave
    the source column untouched and preserve the NaN count."""
    assert_equal(self.dataframe['str_col'].dtype, object)
    null_count = sum(pd.isnull(self.dataframe['str_col']))
    result_col = dataframe_column_to_str(self.dataframe, 'str_col',
                                         inplace=False, return_col=True)
    assert_equal(type(result_col), pd.Series)
    assert_equal(result_col.dtype, object)
    assert_equal(self.dataframe['str_col'].dtype, object)
    assert_equal(null_count, sum(pd.isnull(result_col)))
def setUp(self):
    """Load the input tables and build the cartesian-product candset
    shared by the tests in this class."""
    ltable_path = os.sep.join(['data', 'table_A.csv'])
    rtable_path = os.sep.join(['data', 'table_B.csv'])

    # load input tables for the tests.
    self.ltable = pd.read_csv(
        os.path.join(os.path.dirname(__file__), ltable_path))
    self.rtable = pd.read_csv(
        os.path.join(os.path.dirname(__file__), rtable_path))

    self.l_key_attr = 'A.ID'
    self.r_key_attr = 'B.ID'
    self.l_join_attr = 'A.name'
    self.r_join_attr = 'B.name'

    # convert zipcode column to string
    dataframe_column_to_str(self.ltable, 'A.zipcode', inplace=True)
    dataframe_column_to_str(self.rtable, 'B.zipcode', inplace=True)

    # copy of tables without removing any rows with missing value.
    # needed to test allow_missing option.
    self.orig_ltable = self.ltable.copy()
    self.orig_rtable = self.rtable.copy()

    # remove rows with null value in join attribute
    self.ltable = self.ltable[pd.notnull(self.ltable[self.l_join_attr])]
    self.rtable = self.rtable[pd.notnull(self.rtable[self.r_join_attr])]

    # generate cartesian product to be used as candset.
    # drop(columns=...) replaces the positional axis argument, which was
    # removed in pandas 2.0.
    self.ltable['tmp_join_key'] = 1
    self.rtable['tmp_join_key'] = 1
    self.cartprod = pd.merge(
        self.ltable[[self.l_key_attr, self.l_join_attr,
                     'A.zipcode', 'tmp_join_key']],
        self.rtable[[self.r_key_attr, self.r_join_attr,
                     'B.zipcode', 'tmp_join_key']],
        on='tmp_join_key').drop(columns='tmp_join_key')

    # Drop the helper column in place: the original code discarded the
    # (non-inplace) result of drop(), leaving tmp_join_key in the tables.
    self.ltable.drop(columns='tmp_join_key', inplace=True)
    self.rtable.drop(columns='tmp_join_key', inplace=True)
def test_jac_qg2_with_filter_attr_of_type_int(self):
    """Jaccard filter (qgram-2 tokenizer, threshold 0.8) over integer
    attributes, which are stringified before filtering."""
    table_a = pd.DataFrame([{'l_id': 1, 'l_attr': 1990},
                            {'l_id': 2, 'l_attr': 2000},
                            {'l_id': 3, 'l_attr': 0},
                            {'l_id': 4, 'l_attr': -1},
                            {'l_id': 5, 'l_attr': 1986}])
    table_b = pd.DataFrame([{'r_id': 1, 'r_attr': 2001},
                            {'r_id': 2, 'r_attr': 1992},
                            {'r_id': 3, 'r_attr': 1886},
                            {'r_id': 4, 'r_attr': 2007},
                            {'r_id': 5, 'r_attr': 2012}])

    dataframe_column_to_str(table_a, 'l_attr', inplace=True)
    dataframe_column_to_str(table_b, 'r_attr', inplace=True)

    tok = QgramTokenizer(2, return_set=True)
    expected_pairs = {'1,1', '1,2', '1,3', '1,4', '1,5',
                      '2,1', '2,2', '2,3', '2,4', '2,5',
                      '5,1', '5,2', '5,3', '5,4', '5,5'}
    self.test_filter_tables(tok, 'JACCARD', 0.8, False, False,
                            (table_a, table_b, 'l_id', 'r_id',
                             'l_attr', 'r_attr'),
                            expected_pairs)
def test_float_col_with_int_val(self):
    """Converting a float column holding integral values should produce
    their integer string forms (e.g. 3.0 -> '3'), preserving NaNs."""
    assert_equal(self.dataframe['float_col_with_int_val'].dtype, float)
    out_df = dataframe_column_to_str(
        self.dataframe, 'float_col_with_int_val',
        inplace=False, return_col=False)
    assert_equal(type(out_df), pd.DataFrame)
    assert_equal(out_df['float_col_with_int_val'].dtype, object)
    # The input column must be left unmodified.
    assert_equal(self.dataframe['float_col_with_int_val'].dtype, float)
    assert_equal(sum(pd.isnull(self.dataframe['float_col_with_int_val'])),
                 sum(pd.isnull(out_df['float_col_with_int_val'])))
    for idx, row in self.dataframe.iterrows():
        if pd.isnull(row['float_col_with_int_val']):
            continue
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        assert_equal(str(int(row['float_col_with_int_val'])),
                     out_df.loc[idx, 'float_col_with_int_val'])
def test_float_col_with_int_val(self):
    """Converting a float column holding integral values should produce
    their integer string forms (e.g. 3.0 -> '3'), preserving NaNs."""
    assert_equal(self.dataframe['float_col_with_int_val'].dtype, float)
    out_df = dataframe_column_to_str(self.dataframe,
                                     'float_col_with_int_val',
                                     inplace=False, return_col=False)
    assert_equal(type(out_df), pd.DataFrame)
    assert_equal(out_df['float_col_with_int_val'].dtype, object)
    # The input column must be left unmodified.
    assert_equal(self.dataframe['float_col_with_int_val'].dtype, float)
    assert_equal(sum(pd.isnull(self.dataframe['float_col_with_int_val'])),
                 sum(pd.isnull(out_df['float_col_with_int_val'])))
    for idx, row in self.dataframe.iterrows():
        if pd.isnull(row['float_col_with_int_val']):
            continue
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        assert_equal(str(int(row['float_col_with_int_val'])),
                     out_df.loc[idx, 'float_col_with_int_val'])
def test_invalid_col_name(self):
    """Converting a column name absent from the dataframe should fail.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str(self.dataframe, 'invalid_col')
def test_invalid_flag_combination(self):
    """inplace=True together with return_col=True is contradictory
    (cannot both modify in place and return a new column) and should fail.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str(self.dataframe, 'str_col',
                            inplace=True, return_col=True)
def test_invalid_return_col_flag(self):
    """A non-boolean return_col flag (None) should be rejected.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str(self.dataframe, 'str_col',
                            inplace=True, return_col=None)
def test_invalid_inplace_flag(self):
    """A non-boolean inplace flag (None) should be rejected.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str(self.dataframe, 'str_col', inplace=None)
def test_invalid_col_name(self):
    """Converting a column name absent from the dataframe should fail.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str(self.dataframe, 'invalid_col')
def test_invalid_inplace_flag(self):
    """A non-boolean inplace flag (None) should be rejected.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str(self.dataframe, 'str_col', inplace=None)
def test_invalid_return_col_flag(self):
    """A non-boolean return_col flag (None) should be rejected.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str(self.dataframe, 'str_col',
                            inplace=True, return_col=None)
def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
    """Run a set-similarity join scenario and verify it against a brute-force
    computation over the cartesian product.

    Args:
        scenario: ((ltable_path, l_key_attr, l_join_attr),
                   (rtable_path, r_key_attr, r_join_attr)).
        sim_measure_type: key into JOIN_FN_MAP selecting the join function.
        args: positional tail passed to the join function; by position:
            0 tokenizer, 1 threshold, 2 comp_op, 3 allow_empty,
            4 allow_missing, 5 l_out_attrs, 6 r_out_attrs,
            7 l_out_prefix, 8 r_out_prefix, 9 out_sim_score.
        convert_to_str: convert the join attributes to string first.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 4 and args[4]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # if allow_empty is False, drop rows whose join attribute tokenizes
    # to the empty set.
    if len(args) > 3 and (not args[3]):
        ltable_not_missing = ltable_not_missing[ltable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[l_join_attr]))),
            axis=1) > 0]
        rtable_not_missing = rtable_not_missing[rtable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[r_join_attr]))),
            axis=1) > 0]

    # generate cartesian product to be used as candset.
    # drop(columns=...) replaces the positional axis argument, which was
    # removed in pandas 2.0; the helper column is now dropped in place
    # instead of discarding the result of a non-inplace drop().
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop(columns='tmp_join_key')
    ltable_not_missing.drop(columns='tmp_join_key', inplace=True)
    rtable_not_missing.drop(columns='tmp_join_key', inplace=True)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: round(sim_func(args[0].tokenize(str(row[l_join_attr])),
                                   args[0].tokenize(str(row[r_join_attr]))),
                          4),
        axis=1)

    comp_fn = COMP_OP_MAP[DEFAULT_COMP_OP]
    # Check for comp_op in args.
    if len(args) > 2:
        comp_fn = COMP_OP_MAP[args[2]]

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), args[1]):
            expected_pairs.add(','.join((str(row[l_key_attr]),
                                         str(row[r_key_attr]))))
    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = args[0].get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable, l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr, *args)

    # the join must not permanently alter the tokenizer's return_set flag.
    assert_equal(args[0].get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 7:
        l_out_prefix = args[7]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 8:
        r_out_prefix = args[8]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 5:
        if args[5]:
            l_out_attrs = remove_redundant_attrs(args[5], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            r_out_attrs = remove_redundant_attrs(args[6], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (defaults to True when absent).
    if len(args) > 9:
        if args[9]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP,
                    args=(), convert_to_str=False):
    """Run an edit-distance join scenario and verify it against a
    brute-force computation over the cartesian product.

    Args:
        scenario: ((ltable_path, l_key_attr, l_join_attr),
                   (rtable_path, r_key_attr, r_join_attr)).
        tok: tokenizer used by the (approximate, q-gram based) join.
        threshold: edit-distance threshold.
        comp_op: comparison operator key into COMP_OP_MAP.
        args: positional tail for edit_distance_join; by position:
            0 allow_missing, 1 l_out_attrs, 2 r_out_attrs,
            3 l_out_prefix, 4 r_out_prefix, 5 out_sim_score.
        convert_to_str: convert the join attributes to string first.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset.
    # drop(columns=...) replaces the positional axis argument, which was
    # removed in pandas 2.0; the helper column is now dropped in place
    # instead of discarding the result of a non-inplace drop().
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop(columns='tmp_join_key')
    ltable_not_missing.drop(columns='tmp_join_key', inplace=True)
    rtable_not_missing.drop(columns='tmp_join_key', inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))
        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue
        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))
    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = edit_distance_join(ltable, rtable,
                                        l_key_attr, r_key_attr,
                                        l_join_attr, r_join_attr,
                                        threshold, comp_op,
                                        *args, tokenizer=tok)

    # the join must not permanently alter the tokenizer's return_set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (defaults to True when absent).
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP,
                    args=(), convert_to_str=False, data_limit=100000,
                    temp_dir=os.getcwd(),
                    output_file_path=default_output_file_path):
    """Run a disk-based edit-distance join scenario and verify its output
    file against both a brute-force computation and the in-memory join.

    Args:
        scenario: ((ltable_path, l_key_attr, l_join_attr),
                   (rtable_path, r_key_attr, r_join_attr)).
        tok: tokenizer used by the (approximate, q-gram based) join.
        threshold: edit-distance threshold.
        comp_op: comparison operator key into COMP_OP_MAP.
        args: positional tail for the join functions; by position:
            0 allow_missing, 1 l_out_attrs, 2 r_out_attrs,
            3 l_out_prefix, 4 r_out_prefix, 5 out_sim_score.
        convert_to_str: convert the join attributes to string first.
        data_limit: in-memory record limit for the disk-based join.
        temp_dir: directory for the disk join's temporary files.
            NOTE(review): os.getcwd() is evaluated once at definition
            time, not per call -- confirm this is intended.
        output_file_path: path of the CSV written by the disk join.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset.
    # drop(columns=...) replaces the positional axis argument, which was
    # removed in pandas 2.0; the helper column is now dropped in place
    # instead of discarding the result of a non-inplace drop().
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop(columns='tmp_join_key')
    ltable_not_missing.drop(columns='tmp_join_key', inplace=True)
    rtable_not_missing.drop(columns='tmp_join_key', inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))
        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue
        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))
    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # Removing any previously existing output file path.
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    # Use join function to process the input data. It returns a boolean.
    is_success = disk_edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, data_limit,
                                         comp_op, *args,
                                         tokenizer=tok,
                                         temp_dir=temp_dir,
                                         output_file_path=output_file_path)

    # Use edit distance join without the disk version to get the
    # dataframe to compare.
    no_disk_candset = edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, comp_op,
                                         *args, tokenizer=tok)
    # Deleting _id to make the schema consistent for comparison
    # (the disk version's output carries no _id column).
    if '_id' in no_disk_candset:
        del no_disk_candset['_id']

    # the join must not permanently alter the tokenizer's return_set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = []
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (defaults to True when absent).
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # Verify whether the output file path exists.
    assert_equal(True, os.path.exists(output_file_path))

    # verify whether the output table has the necessary attributes.
    actual_candset = pd.read_csv(output_file_path)

    # Comparing column header values.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)
    assert_list_equal(list(no_disk_candset.columns.values),
                      list(actual_candset.columns.values))

    # Creating sets for comparing the data tuples.
    actual_pairs = set()
    no_disk_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))
    for idx, row in no_disk_candset.iterrows():
        no_disk_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                    str(row[r_out_prefix + r_key_attr]))))

    # Verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(expected_pairs), len(no_disk_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    common_pairs_no_disk = no_disk_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
    assert_equal(len(common_pairs_no_disk), len(expected_pairs))
def test_invalid_flag_combination(self):
    """inplace=True together with return_col=True is contradictory
    (cannot both modify in place and return a new column) and should fail.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str(self.dataframe, 'str_col',
                            inplace=True, return_col=True)
def test_invalid_dataframe(self):
    """Passing a non-DataFrame (a plain list) should be rejected.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str([], 'test_col')
def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
    """Run a set-similarity join scenario and verify it against a brute-force
    computation over the cartesian product.

    Args:
        scenario: ((ltable_path, l_key_attr, l_join_attr),
                   (rtable_path, r_key_attr, r_join_attr)).
        sim_measure_type: key into JOIN_FN_MAP selecting the join function.
        args: positional tail passed to the join function; by position:
            0 tokenizer, 1 threshold, 2 comp_op, 3 allow_empty,
            4 allow_missing, 5 l_out_attrs, 6 r_out_attrs,
            7 l_out_prefix, 8 r_out_prefix, 9 out_sim_score.
        convert_to_str: convert the join attributes to string first.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 4 and args[4]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # if allow_empty is False, drop rows whose join attribute tokenizes
    # to the empty set.
    if len(args) > 3 and (not args[3]):
        ltable_not_missing = ltable_not_missing[ltable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[l_join_attr]))),
            axis=1) > 0]
        rtable_not_missing = rtable_not_missing[rtable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[r_join_attr]))),
            axis=1) > 0]

    # generate cartesian product to be used as candset.
    # drop(columns=...) replaces the positional axis argument, which was
    # removed in pandas 2.0; the helper column is now dropped in place
    # instead of discarding the result of a non-inplace drop().
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop(columns='tmp_join_key')
    ltable_not_missing.drop(columns='tmp_join_key', inplace=True)
    rtable_not_missing.drop(columns='tmp_join_key', inplace=True)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: round(sim_func(args[0].tokenize(str(row[l_join_attr])),
                                   args[0].tokenize(str(row[r_join_attr]))),
                          4),
        axis=1)

    comp_fn = COMP_OP_MAP[DEFAULT_COMP_OP]
    # Check for comp_op in args.
    if len(args) > 2:
        comp_fn = COMP_OP_MAP[args[2]]

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), args[1]):
            expected_pairs.add(','.join((str(row[l_key_attr]),
                                         str(row[r_key_attr]))))
    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = args[0].get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable, l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr, *args)

    # the join must not permanently alter the tokenizer's return_set flag.
    assert_equal(args[0].get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 7:
        l_out_prefix = args[7]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 8:
        r_out_prefix = args[8]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 5:
        if args[5]:
            l_out_attrs = remove_redundant_attrs(args[5], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            r_out_attrs = remove_redundant_attrs(args[6], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (defaults to True when absent).
    if len(args) > 9:
        if args[9]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_invalid_dataframe(self):
    """Passing a non-DataFrame (a plain list) should be rejected.

    NOTE(review): the expected exception is presumably declared via a
    @raises decorator in the full file -- confirm there.
    """
    dataframe_column_to_str([], 'test_col')