def test_edit_distance_join_invalid_tokenizer(self):
    """Passing a non-tokenizer object as the tokenizer should raise an error."""
    invalid_tokenizer = []
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       self.threshold,
                       tokenizer=invalid_tokenizer)
def test_edit_distance_join_invalid_threshold_below(self):
    """A negative edit-distance threshold should be rejected."""
    negative_threshold = -0.1
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       negative_threshold)
def test_edit_distance_join_numeric_r_join_attr(self):
    """Joining on a numeric right-table attribute should be rejected."""
    numeric_join_attr = 'B.int_attr'
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', numeric_join_attr,
                       self.threshold)
def test_edit_distance_join_invalid_rtable(self):
    """Passing a non-DataFrame as the right table should be rejected."""
    not_a_dataframe = []
    edit_distance_join(self.A, not_a_dataframe,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       self.threshold)
def test_edit_distance_join_invalid_r_join_attr(self):
    """A right join attribute missing from the right table should be rejected."""
    missing_attr = 'B.invalid_attr'
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', missing_attr,
                       self.threshold)
def test_edit_distance_join_invalid_r_join_attr(self):
    """A right join attribute missing from the right table should be rejected."""
    missing_attr = 'B.invalid_attr'
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', missing_attr,
                       self.threshold)
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP, args=(),
                    convert_to_str=False):
    """Run edit_distance_join on a scenario and verify its output against a
    naive cartesian-product computation of the expected pairs.

    Parameters
    ----------
    scenario : pair of (table_path, key_attr, join_attr) triples describing
        the left and right input tables.
    tok : tokenizer used by the join; also used here to discard expected
        pairs that share no common q-grams, since the join is approximate.
    threshold : edit distance threshold.
    comp_op : comparison operator string (e.g. '<=').
    args : optional positional args forwarded to edit_distance_join, in
        order (allow_missing, l_out_attrs, r_out_attrs, l_out_prefix,
        r_out_prefix, out_sim_score).
    convert_to_str : when True, coerce the join attributes to string first.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop(columns=['tmp_join_key'])
    # FIX: the original called drop('tmp_join_key', 1) without inplace=True,
    # which discards the returned copy (a silent no-op); the positional
    # 'axis' argument was also removed in pandas 2.0. Drop the temporary
    # join-key column in place using the keyword form instead.
    ltable_not_missing.drop(columns=['tmp_join_key'], inplace=True)
    rtable_not_missing.drop(columns=['tmp_join_key'], inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))

        # skip pairs where either join string is empty.
        if (len(str(row[l_join_attr])) == 0 or
                len(str(row[r_join_attr])) == 0):
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = edit_distance_join(ltable, rtable,
                                        l_key_attr, r_key_attr,
                                        l_join_attr, r_join_attr,
                                        threshold, comp_op,
                                        *args, tokenizer=tok)

    # the join must restore the tokenizer's return_set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args; the score column is emitted by
    # default when the flag is absent.
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def time_edit_distance_qg2_3(self):
    """Benchmark: edit distance join with threshold 3 (qgram-2 tokenizer)."""
    threshold = 3
    edit_distance_join(self.ltable, self.rtable,
                       self.l_id_attr, self.r_id_attr,
                       self.l_join_attr, self.r_join_attr,
                       threshold)
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP, args=(),
                    convert_to_str=False, data_limit=100000,
                    temp_dir=os.getcwd(),
                    output_file_path=default_output_file_path):
    """Run disk_edit_distance_join on a scenario and verify its CSV output
    against both a naive cartesian-product computation and the in-memory
    edit_distance_join result.

    Parameters
    ----------
    scenario : pair of (table_path, key_attr, join_attr) triples describing
        the left and right input tables.
    tok : tokenizer used by the join; also used here to discard expected
        pairs that share no common q-grams, since the join is approximate.
    threshold : edit distance threshold.
    comp_op : comparison operator string (e.g. '<=').
    args : optional positional args forwarded to the joins, in order
        (allow_missing, l_out_attrs, r_out_attrs, l_out_prefix,
        r_out_prefix, out_sim_score).
    convert_to_str : when True, coerce the join attributes to string first.
    data_limit : in-memory tuple limit passed to the disk-based join.
    temp_dir : scratch directory for the disk-based join.
        NOTE(review): os.getcwd() is evaluated once at import time, not per
        call — confirm this is intended.
    output_file_path : path where the disk-based join writes its result.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop(columns=['tmp_join_key'])
    # FIX: the original called drop('tmp_join_key', 1) without inplace=True,
    # which discards the returned copy (a silent no-op); the positional
    # 'axis' argument was also removed in pandas 2.0. Drop the temporary
    # join-key column in place using the keyword form instead.
    ltable_not_missing.drop(columns=['tmp_join_key'], inplace=True)
    rtable_not_missing.drop(columns=['tmp_join_key'], inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))

        # skip pairs where either join string is empty.
        if (len(str(row[l_join_attr])) == 0 or
                len(str(row[r_join_attr])) == 0):
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # Removing any previously existing output file path.
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    # Use join function to process the input data. It returns a boolean
    # success flag; the actual result is written to output_file_path.
    is_success = disk_edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, data_limit,
                                         comp_op, *args, tokenizer=tok,
                                         temp_dir=temp_dir,
                                         output_file_path=output_file_path)

    # Use edit distance join without the disk version to get the dataframe
    # to compare.
    no_disk_candset = edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, comp_op,
                                         *args, tokenizer=tok)

    # Deleting Id to make the schema consistent for comparison.
    if '_id' in no_disk_candset:
        del no_disk_candset['_id']

    # the join must restore the tokenizer's return_set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = []
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args; the score column is emitted by
    # default when the flag is absent.
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # Verify whether the current output file path exists.
    assert_equal(True, os.path.exists(output_file_path))

    # verify whether the output table has the necessary attributes.
    actual_candset = pd.read_csv(output_file_path)

    # Comparing column header values
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)
    assert_list_equal(list(no_disk_candset.columns.values),
                      list(actual_candset.columns.values))

    actual_pairs = set()
    no_disk_pairs = set()

    # Creating sets for comparing the data tuples
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))
    for idx, row in no_disk_candset.iterrows():
        no_disk_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                    str(row[r_out_prefix + r_key_attr]))))

    # Verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(expected_pairs), len(no_disk_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    common_pairs_no_disk = no_disk_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
    assert_equal(len(common_pairs_no_disk), len(expected_pairs))
def test_edit_distance_join_invalid_comp_op_ge(self):
    """The '>=' comparison operator is not valid for edit distance join."""
    invalid_comp_op = '>='
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       self.threshold, invalid_comp_op)
def test_edit_distance_join_invalid_r_out_attr(self):
    """A right output attribute missing from the right table should be rejected."""
    invalid_r_out_attrs = ['B.invalid_attr']
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       self.threshold, self.comp_op,
                       False, ['A.attr'], invalid_r_out_attrs)
def test_edit_distance_join_invalid_threshold_below(self):
    """A negative edit-distance threshold should be rejected."""
    negative_threshold = -0.1
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       negative_threshold)
def test_edit_distance_join_invalid_tokenizer(self):
    """Passing a non-tokenizer object as the tokenizer should raise an error."""
    invalid_tokenizer = []
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       self.threshold,
                       tokenizer=invalid_tokenizer)
def test_edit_distance_join_numeric_r_join_attr(self):
    """Joining on a numeric right-table attribute should be rejected."""
    numeric_join_attr = 'B.int_attr'
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', numeric_join_attr,
                       self.threshold)
def test_edit_distance_join_invalid_comp_op_ge(self):
    """The '>=' comparison operator is not valid for edit distance join."""
    invalid_comp_op = '>='
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       self.threshold, invalid_comp_op)
def time_edit_distance_qg2_3(self):
    """Benchmark: edit distance join with threshold 3 (qgram-2 tokenizer)."""
    threshold = 3
    edit_distance_join(self.ltable, self.rtable,
                       self.l_id_attr, self.r_id_attr,
                       self.l_join_attr, self.r_join_attr,
                       threshold)
def test_edit_distance_join_invalid_r_out_attr(self):
    """A right output attribute missing from the right table should be rejected."""
    invalid_r_out_attrs = ['B.invalid_attr']
    edit_distance_join(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       self.threshold, self.comp_op,
                       False, ['A.attr'], invalid_r_out_attrs)
def test_edit_distance_join_invalid_rtable(self):
    """Passing a non-DataFrame as the right table should be rejected."""
    not_a_dataframe = []
    edit_distance_join(self.A, not_a_dataframe,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       self.threshold)