def get_features(sim_measures=None, tokenizers=None):
    """Build a feature table pairing similarity measures with tokenizers.

    Parameters:
        sim_measures: optional list of similarity measure type names; when
            None, a default list covering token-based, string-based and
            length-based measures is used.
        tokenizers: optional dict mapping a short tokenizer name to a
            tokenizer object; when None, a default set of alphabetic,
            alphanumeric, numeric, whitespace and qgram tokenizers is used.

    Returns:
        A pandas DataFrame indexed by 'feature_name', with one row per
        (sim_measure, tokenizer) combination holding the tokenizer type,
        measure type, tokenizer object and similarity function.
    """
    features = []

    if sim_measures is None:
        sim_measures = ['JACCARD', 'COSINE', 'DICE', 'OVERLAP_COEFFICIENT',
                        'EDIT_DISTANCE', 'LEFT_LENGTH', 'RIGHT_LENGTH',
                        'LENGTH_SUM', 'LENGTH_DIFF']

    if tokenizers is None:
        tokenizers = {'alph': AlphabeticTokenizer(return_set=True),
                      'alph_num': AlphanumericTokenizer(return_set=True),
                      'num': NumericTokenizer(return_set=True),
                      'ws': WhitespaceTokenizer(return_set=True),
                      'qg2': QgramTokenizer(qval=2, return_set=True),
                      'qg3': QgramTokenizer(qval=3, return_set=True)}

    # Measures that operate directly on the raw strings (or their lengths)
    # and therefore need no tokenizer.
    non_token_measures = ['EDIT_DISTANCE', 'LEFT_LENGTH', 'RIGHT_LENGTH',
                          'LENGTH_SUM', 'LENGTH_DIFF']

    for sim_measure_type in sim_measures:
        if sim_measure_type in non_token_measures:
            features.append((sim_measure_type.lower(), 'none',
                             sim_measure_type, None,
                             get_sim_function(sim_measure_type)))
            continue
        # Pair every token-based measure with every tokenizer.
        for tok_name in tokenizers.keys():
            features.append((sim_measure_type.lower() + '_' + tok_name,
                             tok_name, sim_measure_type,
                             tokenizers[tok_name],
                             get_sim_function(sim_measure_type)))

    feature_table_header = ['feature_name', 'tokenizer_type',
                            'sim_measure_type', 'tokenizer', 'sim_function']
    feature_table = pd.DataFrame(features, columns=feature_table_header)
    feature_table = feature_table.set_index('feature_name')
    return feature_table
def test_invalid_candset(self):
    """Call apply_matcher with a non-DataFrame candset (a plain list)."""
    qgram_tok = QgramTokenizer(qval=2, return_set=True)
    jaccard_fn = get_sim_function('JACCARD')
    jaccard_threshold = 0.3
    apply_matcher([],
                  DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                  DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  qgram_tok, jaccard_fn, jaccard_threshold)
def test_invalid_tokenizer(self):
    """Call apply_matcher with a similarity function in the tokenizer slot."""
    jaccard_fn = get_sim_function('JACCARD')
    jaccard_threshold = 0.3
    empty_candset = pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID'])
    # the tokenizer argument is deliberately a sim function, not a tokenizer
    apply_matcher(empty_candset,
                  DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                  DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  jaccard_fn, jaccard_fn, jaccard_threshold)
def test_invalid_rtable(self):
    """Call apply_matcher with a non-DataFrame rtable (a plain list)."""
    qgram_tok = QgramTokenizer(qval=2, return_set=True)
    jaccard_fn = get_sim_function('JACCARD')
    jaccard_threshold = 0.3
    empty_candset = pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID'])
    apply_matcher(empty_candset,
                  DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                  DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                  self.ltable, [],
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  qgram_tok, jaccard_fn, jaccard_threshold)
def test_apply_matcher_with_join_attr_of_type_int(self):
    """End-to-end check of apply_matcher when the join attribute is numeric.

    Computes the expected matching pairs by brute force over the cartesian
    product (casting zipcodes to str before tokenizing), then verifies that
    OverlapFilter + apply_matcher reproduce exactly that pair set.
    """
    tok = QgramTokenizer(qval=2, return_set=True)
    sim_func = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='
    # int-typed join attributes; str() casts below make them tokenizable
    l_join_attr = 'A.zipcode'
    r_join_attr = 'B.zipcode'

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(tok.tokenize(str(row[l_join_attr])),
                             tok.tokenize(str(row[r_join_attr]))),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]
    # compute expected output pairs
    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join(
                (str(row[self.l_key_attr]), str(row[self.r_key_attr]))))

    # use overlap filter to obtain a candset.
    overlap_filter = OverlapFilter(tok, 1, comp_op)
    candset = overlap_filter.filter_tables(self.ltable, self.rtable,
                                           self.l_key_attr, self.r_key_attr,
                                           l_join_attr, r_join_attr)

    # apply a jaccard matcher to the candset
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.ltable, self.rtable,
        self.l_key_attr, self.r_key_attr,
        l_join_attr, r_join_attr,
        tok, sim_func, threshold)

    expected_output_attrs = [
        '_id',
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        '_sim_score'
    ]

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_invalid_tokenizer(self):
    """apply_matcher should reject a non-tokenizer in the tokenizer slot."""
    sim_function = get_sim_function('JACCARD')
    thresh = 0.3
    l_out_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    r_out_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr
    # pass sim_function where a tokenizer is expected
    apply_matcher(pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID']),
                  l_out_key, r_out_key,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  sim_function, sim_function, thresh)
def test_apply_matcher(self):
    """End-to-end check of apply_matcher with output attributes requested.

    Computes the expected pair set by brute force over the cartesian
    product, then verifies that OverlapFilter + apply_matcher (with
    l_out_attrs/r_out_attrs and out_sim_score=True) reproduce it.
    """
    tok = QgramTokenizer(qval=2, return_set=True)
    sim_func = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
        tok.tokenize(str(row[self.l_join_attr])),
        tok.tokenize(str(row[self.r_join_attr]))), axis=1)

    comp_fn = COMP_OP_MAP[comp_op]
    # compute expected output pairs
    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # use overlap filter to obtain a candset.
    overlap_filter = OverlapFilter(tok, 1, comp_op)
    candset = overlap_filter.filter_tables(self.ltable, self.rtable,
                                           self.l_key_attr, self.r_key_attr,
                                           self.l_join_attr, self.r_join_attr)

    # apply a jaccard matcher to the candset
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.ltable, self.rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr,
        tok, sim_func, threshold, comp_op, False,
        [self.l_join_attr], [self.r_join_attr],
        out_sim_score=True)

    expected_output_attrs = ['_id',
                             DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                             DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                             DEFAULT_L_OUT_PREFIX + self.l_join_attr,
                             DEFAULT_R_OUT_PREFIX + self.r_join_attr,
                             '_sim_score']

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_invalid_candset(self):
    """apply_matcher should reject a candset that is not a DataFrame."""
    tokenizer_2gram = QgramTokenizer(qval=2, return_set=True)
    sim_function = get_sim_function('JACCARD')
    thresh = 0.3
    l_out_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    r_out_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr
    apply_matcher([], l_out_key, r_out_key,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  tokenizer_2gram, sim_function, thresh)
def test_invalid_r_out_attr(self):
    """apply_matcher should reject an r_out_attrs entry absent from rtable."""
    qgram_tok = QgramTokenizer(qval=2, return_set=True)
    jaccard_fn = get_sim_function('JACCARD')
    jaccard_threshold = 0.3
    empty_candset = pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID'])
    apply_matcher(empty_candset,
                  DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                  DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  qgram_tok, jaccard_fn, jaccard_threshold,
                  r_out_attrs=['invalid_attr'])
def test_empty_candset(self):
    """apply_matcher should handle a candset with zero rows."""
    qgram_tok = QgramTokenizer(qval=2, return_set=True)
    jaccard_fn = get_sim_function('JACCARD')
    jaccard_threshold = 0.3
    l_out_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    r_out_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr
    empty_candset = pd.DataFrame(columns=[l_out_key, r_out_key])
    apply_matcher(empty_candset, l_out_key, r_out_key,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  qgram_tok, jaccard_fn, jaccard_threshold)
def test_empty_candset(self):
    """Matching an empty candidate set must not fail."""
    tokenizer_2gram = QgramTokenizer(qval=2, return_set=True)
    sim_function = get_sim_function('JACCARD')
    thresh = 0.3
    no_pairs = pd.DataFrame(
        columns=[DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                 DEFAULT_R_OUT_PREFIX + self.r_key_attr])
    apply_matcher(no_pairs,
                  DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                  DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  tokenizer_2gram, sim_function, thresh)
def _edit_dist_join_split(ltable, rtable,
                          l_key_attr, r_key_attr,
                          l_join_attr, r_join_attr,
                          tokenizer, threshold,
                          l_out_attrs, r_out_attrs,
                          l_out_prefix, r_out_prefix,
                          out_sim_score):
    """Perform an edit-distance join for one split of ltable and rtable.

    Builds a prefix index over l_join_attr, then for each rtable row uses
    prefix filtering plus a length filter to prune candidates before
    computing the actual edit distance. Returns a DataFrame of key pairs
    (plus requested output attributes and, optionally, '_sim_score').
    Rows whose join attribute is an empty string are skipped.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable, keyed by the ltable key attribute
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable, keyed by the rtable key attribute
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable_dict.values(), rtable_dict.values()],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # build a dictionary of l_join_attr lengths, used below for the
    # length filter (strings whose lengths differ by more than threshold
    # cannot be within edit distance threshold)
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = len(
            str(row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_dict.values(), l_key_attr_index,
                               l_join_attr_index, tokenizer,
                               sim_measure_type, threshold, token_ordering)
    prefix_index.build()

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        r_len = len(r_string)
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        # obtain candidates by applying the prefix filter
        candidates = find_candidates_prefix_filter(
            r_ordered_tokens, len(r_ordered_tokens),
            prefix_filter, prefix_index)
        for cand in candidates:
            # length filter: skip candidates whose length difference
            # already exceeds the threshold
            if r_len - threshold <= l_join_attr_dict[cand] <= r_len + threshold:
                edit_dist = sim_fn(str(ltable_dict[cand][l_join_attr_index]),
                                   r_string)
                if edit_dist <= threshold:
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable_dict[cand], r_row, cand, r_id,
                            l_out_attrs_indices, r_out_attrs_indices)
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)
                    else:
                        output_row = [cand, r_id]
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)
        prog_bar.update()

    output_header = get_output_header_from_tables(
        l_key_attr, r_key_attr, l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def test_apply_matcher_with_allow_missing(self):
    """End-to-end check of apply_matcher with allow_missing=True.

    Expected pairs are the brute-force threshold matches plus every pair
    where either join attribute is missing (NaN); verifies that the
    filter + matcher pipeline reproduces exactly that set.
    """
    tok = QgramTokenizer(qval=2, return_set=True)
    sim_func = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(tok.tokenize(str(row[self.l_join_attr])),
                             tok.tokenize(str(row[self.r_join_attr]))),
        axis=1)

    # compute expected output pairs
    comp_fn = COMP_OP_MAP[comp_op]
    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join(
                (str(row[self.l_key_attr]), str(row[self.r_key_attr]))))

    # find pairs that need to be included in output due to
    # the presence of missing value in one of the join attributes.
    missing_pairs = set()
    for l_idx, l_row in self.orig_ltable.iterrows():
        for r_idx, r_row in self.orig_rtable.iterrows():
            if (pd.isnull(l_row[self.l_join_attr]) or
                    pd.isnull(r_row[self.r_join_attr])):
                missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                            str(r_row[self.r_key_attr]))))

    # add the pairs containing missing value to the set of expected pairs.
    expected_pairs = expected_pairs.union(missing_pairs)

    # use overlap filter to obtain a candset with allow_missing set to True.
    overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
    candset = overlap_filter.filter_tables(
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    # apply a jaccard matcher to the candset with allow_missing set to True.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr,
        tok, sim_func, threshold, comp_op, True,
        out_sim_score=True)

    expected_output_attrs = [
        '_id',
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        '_sim_score'
    ]

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def _edit_distance_join_split(ltable_list, rtable_list,
                              l_columns, r_columns,
                              l_key_attr, r_key_attr,
                              l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op,
                              l_out_attrs, r_out_attrs,
                              l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Perform edit distance join for a split of ltable and rtable.

    Tables arrive as lists of rows plus their column-name lists. Builds a
    prefix index over l_join_attr, prunes candidates with prefix and
    length filters, then verifies survivors with the real edit distance.
    Returns a DataFrame of key pairs (plus requested output attributes
    and, optionally, '_sim_score').
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable_list, rtable_list],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # cache l_join_attr lengths, used below for the length filter
    l_join_attr_list = []
    for row in ltable_list:
        l_join_attr_list.append(len(row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_list, l_join_attr_index, tokenizer,
                               sim_measure_type, threshold, token_ordering)
    prefix_index.build(False)

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)

    comp_fn = COMP_OP_MAP[comp_op]
    sim_fn = get_sim_function(sim_measure_type)

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]
        r_len = len(r_string)

        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # obtain candidates by applying prefix filter.
        candidates = prefix_filter.find_candidates(r_ordered_tokens,
                                                   prefix_index)

        for cand in candidates:
            # length filter: strings differing in length by more than
            # threshold cannot match
            if r_len - threshold <= l_join_attr_list[cand] <= r_len + threshold:
                l_row = ltable_list[cand]
                # compute the actual edit distance
                edit_dist = sim_fn(l_row[l_join_attr_index], r_string)

                if comp_fn(edit_dist, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            l_row, r_row,
                            l_key_attr_index, r_key_attr_index,
                            l_out_attrs_indices, r_out_attrs_indices)
                    else:
                        output_row = [l_row[l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the edit distance
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(edit_dist)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
        l_key_attr, r_key_attr, l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _edit_distance_join_split(ltable_list, rtable_list,
                              l_columns, r_columns,
                              l_key_attr, r_key_attr,
                              l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op,
                              l_out_attrs, r_out_attrs,
                              l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Perform edit distance join for a split of ltable and rtable.

    Tables are passed as row lists with separate column-name lists. A
    prefix index is built over l_join_attr; candidates are pruned with
    prefix and length filters before the true edit distance is computed.
    Returns a DataFrame of key pairs (plus requested output attributes
    and, optionally, '_sim_score').
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable_list, rtable_list],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # cache l_join_attr lengths, consumed by the length filter below
    l_join_attr_list = []
    for row in ltable_list:
        l_join_attr_list.append(len(row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_list, l_join_attr_index, tokenizer,
                               sim_measure_type, threshold, token_ordering)
    prefix_index.build(False)

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)

    comp_fn = COMP_OP_MAP[comp_op]
    sim_fn = get_sim_function(sim_measure_type)

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]
        r_len = len(r_string)

        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # obtain candidates by applying prefix filter.
        candidates = prefix_filter.find_candidates(r_ordered_tokens,
                                                   prefix_index)

        for cand in candidates:
            # length filter: prune candidates whose length difference
            # already exceeds the threshold
            if r_len - threshold <= l_join_attr_list[cand] <= r_len + threshold:
                l_row = ltable_list[cand]
                # compute the actual edit distance
                edit_dist = sim_fn(l_row[l_join_attr_index], r_string)

                if comp_fn(edit_dist, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            l_row, r_row,
                            l_key_attr_index, r_key_attr_index,
                            l_out_attrs_indices, r_out_attrs_indices)
                    else:
                        output_row = [
                            l_row[l_key_attr_index],
                            r_row[r_key_attr_index]
                        ]

                    # if out_sim_score flag is set, append the edit distance
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(edit_dist)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def set_sim_join(ltable, rtable, l_columns, r_columns,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op,
                 allow_empty,
                 l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable.

    Builds a position index over l_join_attr (caching tokens and empty
    records), applies position filtering to each rtable row, verifies
    surviving candidates with the actual similarity function, and returns
    a DataFrame of key pairs plus any requested output attributes and,
    optionally, a '_sim_score' column.

    Fix: removed a leftover debug counter and a Python-2-only
    `print 'k : ', k` statement that is a SyntaxError under Python 3.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable, rtable],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in
    # l_join_attr multiple times when we need to compute the similarity
    # measure. Further we cache the empty record ids to handle the
    # allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty
        # set of tokens in the join attribute, then generate output pairs
        # joining the current rtable record with those records in ltable with
        # empty set of tokens in the join attribute. These ltable record ids
        # are cached in l_empty_records list which was constructed when
        # building the position index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable[l_id], r_row,
                        l_key_attr_index, r_key_attr_index,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                # two empty token sets are treated as a perfect match
                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                # reuse the tokens cached while building the index
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable[cand], r_row,
                            l_key_attr_index, r_key_attr_index,
                            l_out_attrs_indices, r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
        l_key_attr, r_key_attr, l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def test_valid_join(scenario, sim_measure_type, args):
    """Generic join test: compare a join function against brute force.

    scenario is a pair of (csv_path, key_attr, join_attr) triples for the
    left and right tables. args are the positional arguments forwarded to
    the join function; by position they are observed here as
    (tokenizer, threshold, l_out_attrs, r_out_attrs, l_out_prefix,
    r_out_prefix, out_sim_score) — NOTE(review): inferred from usage
    below, confirm against the join function signatures.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    # generate cartesian product to be used as candset
    ltable['tmp_join_key'] = 1
    rtable['tmp_join_key'] = 1
    cartprod = pd.merge(ltable[[l_key_attr, l_join_attr, 'tmp_join_key']],
                        rtable[[r_key_attr, r_join_attr, 'tmp_join_key']],
                        on='tmp_join_key').drop('tmp_join_key', 1)
    ltable.drop('tmp_join_key', 1)
    rtable.drop('tmp_join_key', 1)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
        tokenize(str(row[l_join_attr]), args[0], sim_measure_type),
        tokenize(str(row[r_join_attr]), args[0], sim_measure_type)),
        axis=1)

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if float(row['sim_score']) >= args[1]:
            expected_pairs.add(','.join(
                (str(row[l_key_attr]), str(row[r_key_attr]))))

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable, l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr, *args)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 4:
        l_out_prefix = args[4]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            for attr in args[2]:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_prefix in args.
    if len(args) > 5:
        r_out_prefix = args[5]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for r_out_attrs in args.
    if len(args) > 3:
        if args[3]:
            for attr in args[3]:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args.
    if len(args) > 6:
        if args[6]:
            expected_output_attrs.append('_sim_score')
    else:
        # sim score column is emitted by default when not specified
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                       allow_empty, allow_missing, args):
    """Check SuffixFilter.filter_tables keeps every true joining pair.

    A suffix filter may emit false positives but must never drop a true
    match, so the assertion is containment of the brute-force join output
    in the actual candset (not set equality). args are positional
    arguments for filter_tables; by position they are observed here as
    (ltable, rtable, l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix) —
    NOTE(review): inferred from usage, confirm against filter_tables.
    """
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold,
                                 allow_empty, allow_missing)
    sim_fn = get_sim_function(sim_measure_type)

    # compute the join output pairs by brute force
    join_output_pairs = set()
    for l_idx, l_row in args[0].iterrows():
        for r_idx, r_row in args[1].iterrows():
            # if allow_missing is set to True, then add pairs containing
            # missing value to the join output.
            if pd.isnull(l_row[args[4]]) or pd.isnull(r_row[args[5]]):
                if allow_missing:
                    join_output_pairs.add(','.join(
                        (str(l_row[args[2]]), str(r_row[args[3]]))))
                continue

            # edit distance compares raw strings with a <= threshold;
            # token-based measures compare token sets with >= threshold
            if sim_measure_type == 'EDIT_DISTANCE':
                l_join_val = str(l_row[args[4]])
                r_join_val = str(r_row[args[5]])
                comp_fn = COMP_OP_MAP['<=']
            else:
                l_join_val = tokenizer.tokenize(str(l_row[args[4]]))
                r_join_val = tokenizer.tokenize(str(r_row[args[5]]))
                comp_fn = COMP_OP_MAP['>=']

            if (len(l_join_val) == 0 and len(r_join_val) == 0 and
                    sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE']):
                if allow_empty:
                    join_output_pairs.add(','.join(
                        (str(l_row[args[2]]), str(r_row[args[3]]))))
                continue

            # if both attributes are not missing and not empty, then check
            # if the pair satisfies the join condition. If yes, then add it
            # to the join output.
            if comp_fn(sim_fn(l_join_val, r_join_val), threshold):
                join_output_pairs.add(','.join(
                    (str(l_row[args[2]]), str(r_row[args[3]]))))

    actual_candset = suffix_filter.filter_tables(*args)

    expected_output_attrs = ['_id']
    l_out_prefix = self.default_l_out_prefix
    r_out_prefix = self.default_r_out_prefix

    # Check for l_out_prefix in args.
    if len(args) > 8:
        l_out_prefix = args[8]
    expected_output_attrs.append(l_out_prefix + args[2])

    # Check for r_out_prefix in args.
    if len(args) > 9:
        r_out_prefix = args[9]
    expected_output_attrs.append(r_out_prefix + args[3])

    # Check for l_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            l_out_attrs = remove_redundant_attrs(args[6], args[2])
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 7:
        if args[7]:
            r_out_attrs = remove_redundant_attrs(args[7], args[3])
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(int(row[l_out_prefix + args[2]])),
                                   str(int(row[r_out_prefix + args[3]])))))

    # verify whether all the join output pairs are
    # present in the actual output pairs
    common_pairs = actual_pairs.intersection(join_output_pairs)
    assert_equal(len(common_pairs), len(join_output_pairs))
def _set_sim_join_split(ltable, rtable,
                        l_key_attr, r_key_attr,
                        l_join_attr, r_join_attr,
                        tokenizer, sim_measure_type, threshold,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix,
                        out_sim_score):
    """Perform a set similarity join for one split of ltable and rtable.

    Builds a position index over l_join_attr, prunes candidates for each
    rtable row with position filtering and a suffix-filter check, then
    verifies survivors with the actual similarity function. Returns a
    DataFrame of key pairs (plus requested output attributes and,
    optionally, '_sim_score'). Rows with an empty join string are skipped.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable_dict.values(), rtable_dict.values()],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # build a dictionary of tokenized l_join_attr so each ltable string is
    # tokenized and ordered only once
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer,
                     sim_measure_type),
            token_ordering)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens, sim_measure_type,
                                            threshold, tokenizer)

        # obtain candidates by applying the position filter
        candidate_overlap = find_candidates_position_filter(
            r_ordered_tokens, r_num_tokens, r_prefix_length,
            pos_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(
                    l_num_tokens, sim_measure_type, threshold, tokenizer)
                # apply the suffix filter before computing the actual
                # similarity score; it compares the parts of the token
                # lists beyond their prefixes
                if not suffix_filter._filter_suffix(
                        l_ordered_tokens[l_prefix_length:],
                        r_ordered_tokens[r_prefix_length:],
                        l_prefix_length, r_prefix_length,
                        l_num_tokens, r_num_tokens):
                    sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                    if sim_score >= threshold:
                        if has_output_attributes:
                            output_row = get_output_row_from_tables(
                                ltable_dict[cand], r_row, cand, r_id,
                                l_out_attrs_indices, r_out_attrs_indices)
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
                        else:
                            output_row = [cand, r_id]
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
        prog_bar.update()

    output_header = get_output_header_from_tables(
        l_key_attr, r_key_attr, l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                       allow_empty, allow_missing, args):
    """Check SuffixFilter.filter_tables against a brute-force join.

    Computes the exact join output over the cartesian product of the
    two input tables and asserts that every true join pair survives the
    suffix filter (the filter may keep extra pairs but must not drop
    true ones).

    The positional ``args`` tuple mirrors the filter_tables signature:
    args[0]/args[1] = ltable/rtable, args[2]/args[3] = key attrs,
    args[4]/args[5] = join attrs, args[6]/args[7] = output attrs,
    args[8]/args[9] = output prefixes.
    """
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold,
                                 allow_empty, allow_missing)
    sim_fn = get_sim_function(sim_measure_type)

    # compute the join output pairs by brute force
    join_output_pairs = set()
    for l_idx, l_row in args[0].iterrows():
        for r_idx, r_row in args[1].iterrows():
            # if allow_missing is set to True, then add pairs containing
            # missing value to the join output.
            if pd.isnull(l_row[args[4]]) or pd.isnull(r_row[args[5]]):
                if allow_missing:
                    join_output_pairs.add(','.join((str(l_row[args[2]]),
                                                    str(r_row[args[3]]))))
                continue

            # EDIT_DISTANCE compares raw strings with a <= threshold;
            # all other measures compare token sets with >= threshold.
            if sim_measure_type == 'EDIT_DISTANCE':
                l_join_val = str(l_row[args[4]])
                r_join_val = str(r_row[args[5]])
                comp_fn = COMP_OP_MAP['<=']
            else:
                l_join_val = tokenizer.tokenize(str(l_row[args[4]]))
                r_join_val = tokenizer.tokenize(str(r_row[args[5]]))
                comp_fn = COMP_OP_MAP['>=']

            # pairs where both sides tokenize to the empty set are joined
            # only when allow_empty is set (not meaningful for OVERLAP
            # and EDIT_DISTANCE).
            if (len(l_join_val) == 0 and len(r_join_val) == 0 and
                    sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE']):
                if allow_empty:
                    join_output_pairs.add(','.join((str(l_row[args[2]]),
                                                    str(r_row[args[3]]))))
                continue

            # if both attributes are not missing and not empty, then check
            # if the pair satisfies the join condition. If yes, then add it
            # to the join output.
            if comp_fn(sim_fn(l_join_val, r_join_val), threshold):
                join_output_pairs.add(','.join((str(l_row[args[2]]),
                                                str(r_row[args[3]]))))

    actual_candset = suffix_filter.filter_tables(*args)

    expected_output_attrs = ['_id']
    l_out_prefix = self.default_l_out_prefix
    r_out_prefix = self.default_r_out_prefix

    # Check for l_out_prefix in args.
    if len(args) > 8:
        l_out_prefix = args[8]
    expected_output_attrs.append(l_out_prefix + args[2])

    # Check for r_out_prefix in args.
    if len(args) > 9:
        r_out_prefix = args[9]
    expected_output_attrs.append(r_out_prefix + args[3])

    # Check for l_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            l_out_attrs = remove_redundant_attrs(args[6], args[2])
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 7:
        if args[7]:
            r_out_attrs = remove_redundant_attrs(args[7], args[3])
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(int(row[l_out_prefix + args[2]])),
                                   str(int(row[r_out_prefix + args[3]])))))

    # verify whether all the join output pairs are
    # present in the actual output pairs
    common_pairs = actual_pairs.intersection(join_output_pairs)
    assert_equal(len(common_pairs), len(join_output_pairs))
def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
    """Run a set-similarity join scenario and compare against brute force.

    Loads the two CSV tables named by ``scenario``, computes the expected
    pairs by applying the similarity function over the full cartesian
    product, runs the real join function, and asserts that output schema
    and pairs match.

    Positional ``args`` mirrors the join function signature:
    args[0] = tokenizer, args[1] = threshold, args[2] = comp_op,
    args[3] = allow_empty, args[4] = allow_missing,
    args[5]/args[6] = output attrs, args[7]/args[8] = output prefixes,
    args[9] = out_sim_score.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 4 and args[4]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # when allow_empty (args[3]) is False, drop rows whose join attribute
    # tokenizes to an empty set, since the join will not output them.
    if len(args) > 3 and (not args[3]):
        ltable_not_missing = ltable_not_missing[ltable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[l_join_attr]))), 1) > 0]
        rtable_not_missing = rtable_not_missing[rtable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[r_join_attr]))), 1) > 0]

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', 1)
    # NOTE(review): these two drop calls are not inplace and their results
    # are discarded — they are effectively no-ops (harmless here).
    ltable_not_missing.drop('tmp_join_key', 1)
    rtable_not_missing.drop('tmp_join_key', 1)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(lambda row: round(
        sim_func(args[0].tokenize(str(row[l_join_attr])),
                 args[0].tokenize(str(row[r_join_attr]))), 4),
        axis=1)

    comp_fn = COMP_OP_MAP[DEFAULT_COMP_OP]
    # Check for comp_op in args.
    if len(args) > 2:
        comp_fn = COMP_OP_MAP[args[2]]

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), args[1]):
            expected_pairs.add(','.join((str(row[l_key_attr]),
                                         str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = args[0].get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable,
                             l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr,
                             *args)

    # the join must restore the tokenizer's return_set flag.
    assert_equal(args[0].get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 7:
        l_out_prefix = args[7]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 8:
        r_out_prefix = args[8]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 5:
        if args[5]:
            l_out_attrs = remove_redundant_attrs(args[5], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            r_out_attrs = remove_redundant_attrs(args[6], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (defaults to True when absent).
    if len(args) > 9:
        if args[9]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def set_sim_join(ltable, rtable,
                 l_columns, r_columns,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op,
                 allow_empty,
                 l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable.

    A position filter prunes candidate pairs; only survivors have the
    actual similarity computed. Fix: removed a leftover debug counter
    (``k``) and its Python 2 ``print`` statement, which wrote to stdout
    on every call and is a SyntaxError under Python 3.

    Args:
        ltable, rtable: lists of rows (projected table splits).
        l_columns, r_columns: column-name lists used to resolve indices.
        l_key_attr, r_key_attr: key attribute names.
        l_join_attr, r_join_attr: join attribute names.
        tokenizer: tokenizer applied to the join attributes.
        sim_measure_type: measure identifier understood by
            get_sim_function.
        threshold: similarity threshold.
        comp_op: comparison operator key into COMP_OP_MAP.
        allow_empty: when True, records with empty token sets on both
            sides are output as matching pairs (with score 1.0).
        l_out_attrs, r_out_attrs: optional output attribute lists.
        l_out_prefix, r_out_prefix: prefixes for output column names.
        out_sim_score: when True, append a '_sim_score' column.
        show_progress: when True, display a pyprind progress bar.

    Returns:
        pandas DataFrame of matching pairs.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in
    # l_join_attr multiple times when we need to compute the similarity
    # measure. Further we cache the empty record ids to handle the
    # allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
                               tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty
        # set of tokens in the join attribute, then generate output pairs
        # joining the current rtable record with those records in ltable with
        # empty set of tokens in the join attribute. These ltable record ids
        # are cached in l_empty_records list which was constructed when
        # building the position index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                if out_sim_score:
                    # empty-vs-empty pairs are defined to have score 1.0
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable[cand], r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP,
                    args=(), convert_to_str=False):
    """Run an edit-distance join scenario and compare against brute force.

    Computes expected pairs by evaluating edit distance over the full
    cartesian product, then runs edit_distance_join and checks that the
    output schema and pairs match.

    Positional ``args`` mirrors the trailing join parameters:
    args[0] = allow_missing, args[1]/args[2] = output attrs,
    args[3]/args[4] = output prefixes, args[5] = out_sim_score.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', 1)
    # NOTE(review): these two drop calls are not inplace and their results
    # are discarded — effectively no-ops (harmless here).
    ltable_not_missing.drop('tmp_join_key', 1)
    rtable_not_missing.drop('tmp_join_key', 1)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))

        # skip pairs where either side is an empty string
        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = edit_distance_join(ltable, rtable,
                                        l_key_attr, r_key_attr,
                                        l_join_attr, r_join_attr,
                                        threshold, comp_op,
                                        *args, tokenizer=tok)

    # the join must restore the tokenizer's return_set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (defaults to True when absent).
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP,
                    args=(), convert_to_str=False, data_limit=100000,
                    temp_dir=os.getcwd(),
                    output_file_path=default_output_file_path):
    """Run a disk-based edit-distance join scenario and verify its output.

    Computes expected pairs by brute force over the cartesian product,
    runs disk_edit_distance_join (which writes its result to
    ``output_file_path``), runs the in-memory edit_distance_join for
    cross-checking, and asserts that both agree with the expected pairs
    and with each other.

    Positional ``args``: args[0] = allow_missing, args[1]/args[2] =
    output attrs, args[3]/args[4] = output prefixes, args[5] =
    out_sim_score.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', 1)
    # NOTE(review): these two drop calls are not inplace and their results
    # are discarded — effectively no-ops (harmless here).
    ltable_not_missing.drop('tmp_join_key', 1)
    rtable_not_missing.drop('tmp_join_key', 1)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))

        # skip pairs where either side is an empty string
        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # Removing any previously existing output file path.
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    # Use join function to process the input data. It returns the boolean
    # value.
    # NOTE(review): is_success is never checked; the test relies instead on
    # the output file existing below.
    is_success = disk_edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, data_limit,
                                         comp_op, *args,
                                         tokenizer=tok,
                                         temp_dir=temp_dir,
                                         output_file_path=output_file_path)

    # Use edit distance join without the disk version to get the dataframe
    # to compare.
    no_disk_candset = edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, comp_op,
                                         *args, tokenizer=tok)
    # Deleting Id to make the schema consistent for comparison.
    if '_id' in no_disk_candset:
        del no_disk_candset['_id']

    # the join must restore the tokenizer's return_set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    # the disk version's output carries no '_id' column.
    expected_output_attrs = []
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (defaults to True when absent).
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # Verify whether the current output file path exists.
    assert_equal(True, os.path.exists(output_file_path))

    # verify whether the output table has the necessary attributes.
    actual_candset = pd.read_csv(output_file_path)

    # Comparing column header values
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)
    assert_list_equal(list(no_disk_candset.columns.values),
                      list(actual_candset.columns.values))

    actual_pairs = set()
    no_disk_pairs = set()

    # Creating sets for comparing the data tuples
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))
    for idx, row in no_disk_candset.iterrows():
        no_disk_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                    str(row[r_out_prefix + r_key_attr]))))

    # Verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(expected_pairs), len(no_disk_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    common_pairs_no_disk = no_disk_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
    assert_equal(len(common_pairs_no_disk), len(expected_pairs))
def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
    """Run a set-similarity join scenario and compare against brute force.

    Same structure as the other set-sim join test variant: load CSV
    tables, compute expected pairs over the cartesian product, run the
    real join, and compare output schema and pairs.

    Positional ``args``: args[0] = tokenizer, args[1] = threshold,
    args[2] = comp_op, args[3] = allow_empty, args[4] = allow_missing,
    args[5]/args[6] = output attrs, args[7]/args[8] = output prefixes,
    args[9] = out_sim_score.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 4 and args[4]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # when allow_empty (args[3]) is False, drop rows whose join attribute
    # tokenizes to an empty set, since the join will not output them.
    if len(args) > 3 and (not args[3]):
        ltable_not_missing = ltable_not_missing[ltable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[l_join_attr]))), 1) > 0]
        rtable_not_missing = rtable_not_missing[rtable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[r_join_attr]))), 1) > 0]

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', 1)
    # NOTE(review): these two drop calls are not inplace and their results
    # are discarded — effectively no-ops (harmless here).
    ltable_not_missing.drop('tmp_join_key', 1)
    rtable_not_missing.drop('tmp_join_key', 1)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(lambda row: round(
        sim_func(args[0].tokenize(str(row[l_join_attr])),
                 args[0].tokenize(str(row[r_join_attr]))), 4),
        axis=1)

    comp_fn = COMP_OP_MAP[DEFAULT_COMP_OP]
    # Check for comp_op in args.
    if len(args) > 2:
        comp_fn = COMP_OP_MAP[args[2]]

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), args[1]):
            expected_pairs.add(','.join((str(row[l_key_attr]),
                                         str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = args[0].get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable,
                             l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr,
                             *args)

    # the join must restore the tokenizer's return_set flag.
    assert_equal(args[0].get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 7:
        l_out_prefix = args[7]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 8:
        r_out_prefix = args[8]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 5:
        if args[5]:
            l_out_attrs = remove_redundant_attrs(args[5], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            r_out_attrs = remove_redundant_attrs(args[6], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (defaults to True when absent).
    if len(args) > 9:
        if args[9]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_apply_matcher_with_allow_missing(self):
    """Check apply_matcher with allow_missing=True.

    Builds the expected pair set by brute force (Jaccard over the
    cached cartesian product, plus all pairs involving a missing join
    value), obtains a candset via an OverlapFilter with allow_missing,
    applies the matcher, and asserts the output schema and pairs.
    """
    tok = QgramTokenizer(qval=2, return_set=True)
    sim_func = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
        tok.tokenize(str(row[self.l_join_attr])),
        tok.tokenize(str(row[self.r_join_attr]))), axis=1)

    # compute expected output pairs
    comp_fn = COMP_OP_MAP[comp_op]
    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # find pairs that need to be included in output due to
    # the presence of missing value in one of the join attributes.
    missing_pairs = set()
    for l_idx, l_row in self.orig_ltable.iterrows():
        for r_idx, r_row in self.orig_rtable.iterrows():
            if (pd.isnull(l_row[self.l_join_attr]) or
                    pd.isnull(r_row[self.r_join_attr])):
                missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                            str(r_row[self.r_key_attr]))))

    # add the pairs containing missing value to the set of expected pairs.
    expected_pairs = expected_pairs.union(missing_pairs)

    # use overlap filter to obtain a candset with allow_missing set to True.
    overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
    candset = overlap_filter.filter_tables(
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    # apply a jaccard matcher to the candset with allow_missing set to True.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX+self.l_key_attr,
        DEFAULT_R_OUT_PREFIX+self.r_key_attr,
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr,
        tok, sim_func, threshold, comp_op, True,
        out_sim_score=True)

    expected_output_attrs = ['_id',
                             DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                             DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                             '_sim_score']

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))