def filter_pair(self, lstring, rstring):
    """Filter two strings with prefix filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # check for empty string
    if (not lstring) or (not rstring):
        return True

    ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
    rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    l_prefix_length = get_prefix_length(len(ordered_ltokens),
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(len(ordered_rtokens),
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    prefix_overlap = set(ordered_ltokens[0:l_prefix_length]).intersection(
                         set(ordered_rtokens[0:r_prefix_length]))

    # the pair survives the filter only if the two prefixes share a token
    if len(prefix_overlap) > 0:
        return False
    else:
        return True
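# For intuition, below is a minimal standalone sketch of the prefix-filter
# principle for the Jaccard measure (the helper names are hypothetical; in
# the code above this role is played by get_prefix_length and filter_pair).
# Two sets with Jaccard similarity >= t must share at least one token among
# the first n - ceil(t * n) + 1 tokens of each set, once both token lists
# are sorted by the same global token ordering.
from math import ceil

def _jaccard_prefix_length(num_tokens, threshold):
    # length of the prefix that must contain a shared token
    if num_tokens == 0:
        return 0
    return int(num_tokens - ceil(threshold * num_tokens) + 1)

def _prefix_filter_pair(ordered_ltokens, ordered_rtokens, threshold):
    # returns True if the pair can safely be dropped (disjoint prefixes)
    l_prefix = ordered_ltokens[:_jaccard_prefix_length(len(ordered_ltokens),
                                                       threshold)]
    r_prefix = ordered_rtokens[:_jaccard_prefix_length(len(ordered_rtokens),
                                                       threshold)]
    return len(set(l_prefix) & set(r_prefix)) == 0

# token lists already sorted by increasing global frequency
assert not _prefix_filter_pair(['aa', 'bb', 'cc', 'dd'],
                               ['aa', 'bb', 'cc', 'ee'], 0.8)  # kept
assert _prefix_filter_pair(['aa', 'bb', 'cc', 'dd'],
                           ['ee', 'ff', 'gg', 'hh'], 0.8)      # dropped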
def test_find_candidates(self):
    # test default case (presence of candidates)
    tokens = order_using_token_ordering(['aa', 'ef', 'ab', 'cd'],
                                        token_ordering)
    self.assertSetEqual(
        self.position_filter.find_candidates(tokens, len(tokens), 0.8),
        set([0, 3]))

    # test empty set of candidates
    tokens = order_using_token_ordering(['op', 'lp', 'mp'], token_ordering)
    self.assertSetEqual(
        self.position_filter.find_candidates(tokens, len(tokens), 0.8),
        set())

    # prefix index returns 2 candidates, whereas position index prunes them
    tokens = order_using_token_ordering(['aa', 'ef', 'lp'], token_ordering)
    self.assertSetEqual(
        self.position_filter.find_candidates(tokens, len(tokens), 0.8),
        set())

    # test empty list of probe tokens
    tokens = order_using_token_ordering([], token_ordering)
    self.assertSetEqual(
        self.position_filter.find_candidates(tokens, len(tokens), 0.8),
        set())
def filter_pair(self, lstring, rstring):
    """Filter two strings with suffix filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # check for empty string
    if (not lstring) or (not rstring):
        return True

    ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
    rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    l_num_tokens = len(ordered_ltokens)
    r_num_tokens = len(ordered_rtokens)

    l_prefix_length = get_prefix_length(l_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(r_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    return self._filter_suffix(ordered_ltokens[l_prefix_length:],
                               ordered_rtokens[r_prefix_length:],
                               l_prefix_length, r_prefix_length,
                               len(ltokens), len(rtokens))
def filter_pair(self, lstring, rstring):
    """Filter two strings with position filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # check for empty string
    if (not lstring) or (not rstring):
        return True

    ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
    rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    l_num_tokens = len(ordered_ltokens)
    r_num_tokens = len(ordered_rtokens)

    l_prefix_length = get_prefix_length(l_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(r_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    # map each token in the left prefix to its position
    l_prefix_dict = {}
    l_pos = 0
    for token in ordered_ltokens[0:l_prefix_length]:
        l_prefix_dict[token] = l_pos
        l_pos += 1

    overlap_threshold = get_overlap_threshold(l_num_tokens, r_num_tokens,
                                              self.sim_measure_type,
                                              self.threshold,
                                              self.tokenizer)
    current_overlap = 0
    r_pos = 0
    for token in ordered_rtokens[0:r_prefix_length]:
        l_pos = l_prefix_dict.get(token)
        if l_pos is not None:
            # the tokens remaining after the match bound the best
            # achievable overlap
            overlap_upper_bound = 1 + min(l_num_tokens - l_pos - 1,
                                          r_num_tokens - r_pos - 1)
            if (current_overlap + overlap_upper_bound) < overlap_threshold:
                return True
            current_overlap += 1
        r_pos += 1

    if current_overlap > 0:
        return False
    return True
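# The pruning condition above relies on two identities for the Jaccard
# measure, sketched standalone below (hypothetical helper names;
# get_overlap_threshold plays the first role in the code above).
from math import ceil

def _jaccard_overlap_threshold(l_num_tokens, r_num_tokens, threshold):
    # Jaccard(X, Y) >= t  <=>  |X & Y| >= t / (1 + t) * (|X| + |Y|)
    return int(ceil((threshold / (1 + threshold)) *
                    (l_num_tokens + r_num_tokens)))

def _overlap_upper_bound(l_num_tokens, r_num_tokens, l_pos, r_pos):
    # a prefix match at positions (l_pos, r_pos) plus, at best, all of
    # the remaining tokens on the shorter side
    return 1 + min(l_num_tokens - l_pos - 1, r_num_tokens - r_pos - 1)

# at threshold 0.8, two 5-token sets must overlap in all 5 tokens
assert _jaccard_overlap_threshold(5, 5, 0.8) == 5
# a first match at the last position of the left prefix caps the overlap at 1
assert _overlap_upper_bound(5, 5, 4, 0) == 1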
def filter_pair(self, lstring, rstring):
    """Checks if the input strings get dropped by the suffix filter.

    Args:
        lstring,rstring (string): input strings

    Returns:
        A flag indicating whether the string pair is dropped (boolean).
    """
    # If one of the inputs is missing, then check the allow_missing flag.
    # If it is set to True, then pass the pair. Else drop the pair.
    if pd.isnull(lstring) or pd.isnull(rstring):
        return (not self.allow_missing)

    # tokenize input strings
    ltokens = self.tokenizer.tokenize(lstring)
    rtokens = self.tokenizer.tokenize(rstring)

    l_num_tokens = len(ltokens)
    r_num_tokens = len(rtokens)

    if l_num_tokens == 0 and r_num_tokens == 0:
        if self.sim_measure_type == 'OVERLAP':
            return True
        elif self.sim_measure_type == 'EDIT_DISTANCE':
            return False
        else:
            return (not self.allow_empty)

    # order the tokens using the token ordering
    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    # compute prefix length
    l_prefix_length = get_prefix_length(l_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(r_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    if l_prefix_length <= 0 or r_prefix_length <= 0:
        return True

    return self._filter_suffix(ordered_ltokens[l_prefix_length:],
                               ordered_rtokens[r_prefix_length:],
                               l_prefix_length, r_prefix_length,
                               l_num_tokens, r_num_tokens)
def test_apply_filter(self):
    # prefix filter satisfies
    l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                          token_ordering)
    r_tokens = order_using_token_ordering(['fg', 'cd', 'aa'], token_ordering)
    self.assertTrue(
        self.prefix_filter.apply_filter(l_tokens, r_tokens,
                                        len(l_tokens), len(r_tokens), 0.8))

    l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                          token_ordering)
    r_tokens = order_using_token_ordering(['aa'], token_ordering)
    self.assertTrue(
        self.prefix_filter.apply_filter(l_tokens, r_tokens,
                                        len(l_tokens), len(r_tokens), 0.8))

    # prefix filter doesn't satisfy
    l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                          token_ordering)
    r_tokens = order_using_token_ordering(['fg'], token_ordering)
    self.assertFalse(
        self.prefix_filter.apply_filter(l_tokens, r_tokens,
                                        len(l_tokens), len(r_tokens), 0.8))

    # test empty list of tokens
    l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                          token_ordering)
    r_tokens = order_using_token_ordering([], token_ordering)
    self.assertFalse(
        self.prefix_filter.apply_filter(l_tokens, r_tokens,
                                        len(l_tokens), len(r_tokens), 0.8))
    self.assertFalse(
        self.prefix_filter.apply_filter(r_tokens, l_tokens,
                                        len(r_tokens), len(l_tokens), 0.8))
def build(self):
    for row in self.table:
        index_string = str(row[self.index_attr])
        # check for empty string
        if not index_string:
            continue
        index_attr_tokens = order_using_token_ordering(
            tokenize(index_string, self.tokenizer, self.sim_measure_type),
            self.token_ordering)
        num_tokens = len(index_attr_tokens)
        prefix_length = get_prefix_length(num_tokens,
                                          self.sim_measure_type,
                                          self.threshold,
                                          self.tokenizer)
        row_id = row[self.key_attr]
        pos = 0
        for token in index_attr_tokens[0:prefix_length]:
            if self.index.get(token) is None:
                self.index[token] = []
            self.index.get(token).append((row_id, pos))
            pos += 1
        self.size_map[row_id] = num_tokens
    return True
def build(self, cache_empty_records=True):
    """Build prefix index."""
    self.index = {}
    empty_records = []
    row_id = 0
    for row in self.table:
        # tokenize string and order the tokens using the token ordering
        index_string = row[self.index_attr]
        index_attr_tokens = order_using_token_ordering(
            self.tokenizer.tokenize(index_string), self.token_ordering)

        # compute prefix length
        num_tokens = len(index_attr_tokens)
        prefix_length = get_prefix_length(num_tokens,
                                          self.sim_measure_type,
                                          self.threshold,
                                          self.tokenizer)

        # update index
        for token in index_attr_tokens[0:prefix_length]:
            if self.index.get(token) is None:
                self.index[token] = []
            self.index.get(token).append(row_id)

        if cache_empty_records and num_tokens == 0:
            empty_records.append(row_id)

        row_id += 1

    return {'empty_records': empty_records}
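# A hand-built toy instance of the structure build() produces, assuming the
# Jaccard measure at threshold 0.6 (setdefault replaces the get()/append
# idiom above; the row id is the list position):
from math import ceil

_table_tokens = [['aa', 'bb', 'cc'],   # prefix length 2 -> 'aa', 'bb'
                 ['aa', 'dd'],         # prefix length 1 -> 'aa'
                 ['ee', 'ff', 'gg']]   # prefix length 2 -> 'ee', 'ff'
_index = {}
for _row_id, _tokens in enumerate(_table_tokens):
    _prefix_length = int(len(_tokens) - ceil(0.6 * len(_tokens)) + 1)
    for _token in _tokens[:_prefix_length]:
        _index.setdefault(_token, []).append(_row_id)

assert _index == {'aa': [0, 1], 'bb': [0], 'ee': [2], 'ff': [2]}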
def test_find_candidates(self):
    # test default case (presence of candidates)
    tokens = order_using_token_ordering(['aa', 'ef', 'lp'], token_ordering)
    self.assertSetEqual(
        self.prefix_filter.find_candidates(tokens, len(tokens), 0.8),
        set([0, 3]))

    # test empty set of candidates
    tokens = order_using_token_ordering(['op', 'lp', 'mp'], token_ordering)
    self.assertSetEqual(
        self.prefix_filter.find_candidates(tokens, len(tokens), 0.8),
        set())

    # test empty list of probe tokens
    tokens = order_using_token_ordering([], token_ordering)
    self.assertSetEqual(
        self.prefix_filter.find_candidates(tokens, len(tokens), 0.8),
        set())
def jaccard_join_auto(ltable, rtable,
                      l_id_attr, l_join_attr,
                      r_id_attr, r_join_attr,
                      threshold,
                      ltable_output_attrs=None,
                      rtable_output_attrs=None):
    matches_list = []
    sim_function = get_jaccard_fn()
    token_ordering = gen_token_ordering(ltable, l_join_attr)
    position_filter = PositionFilter(ltable, l_id_attr, l_join_attr,
                                     threshold, token_ordering,
                                     adaptive_prefix=True)
    position_filter.build_index()

    prog_bar = pyprind.ProgBar(len(rtable.index))

    l_row_dict = {}
    for idx, l_row in ltable.iterrows():
        l_id = l_row[l_id_attr]
        l_row_dict[l_id] = l_row

    r_row_dict = {}
    for idx, r_row in rtable.iterrows():
        r_id = r_row[r_id_attr]
        r_row_dict[r_id] = r_row

    for r_id in r_row_dict.keys():
        r_row = r_row_dict[r_id]
        r_tokens = order_using_token_ordering(list(r_row[r_join_attr]),
                                              token_ordering)
        r_num_tokens = len(r_tokens)

        l_cand_ids = position_filter.find_candidates(r_tokens, r_num_tokens,
                                                     threshold)
        for l_id in l_cand_ids:
            l_row = l_row_dict[l_id]
            if sim_function(l_row[l_join_attr],
                            r_row[r_join_attr]) >= threshold:
                match_dict = get_output_attributes(l_row, r_row,
                                                   l_id_attr, l_id,
                                                   r_id_attr, r_id,
                                                   ltable_output_attrs,
                                                   rtable_output_attrs)
                matches_list.append(match_dict)
        prog_bar.update()

    output_matches = pd.DataFrame(matches_list)
    return output_matches
def build(self, cache_empty_records=True, cache_tokens=False):
    """Build position index."""
    self.index = {}
    self.size_cache = []
    cached_tokens = []
    empty_records = []
    row_id = 0
    for row in self.table:
        # tokenize string and order the tokens using the token ordering
        index_string = row[self.index_attr]
        index_attr_tokens = order_using_token_ordering(
            self.tokenizer.tokenize(index_string), self.token_ordering)

        # compute prefix length
        num_tokens = len(index_attr_tokens)
        prefix_length = get_prefix_length(num_tokens,
                                          self.sim_measure_type,
                                          self.threshold,
                                          self.tokenizer)

        # update the index
        pos = 0
        for token in index_attr_tokens[0:prefix_length]:
            if self.index.get(token) is None:
                self.index[token] = []
            self.index.get(token).append((row_id, pos))
            pos += 1

        self.size_cache.append(num_tokens)

        # keep track of the max size and min size.
        if num_tokens < self.min_length:
            self.min_length = num_tokens
        if num_tokens > self.max_length:
            self.max_length = num_tokens

        # if the cache_tokens flag is set to True, then store the tokens.
        if cache_tokens:
            cached_tokens.append(index_attr_tokens)

        if cache_empty_records and num_tokens == 0:
            empty_records.append(row_id)

        row_id += 1

    return {'cached_tokens': cached_tokens,
            'empty_records': empty_records}
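# The position index extends the prefix index by storing (row_id, pos)
# pairs plus a size cache. A hand-built toy instance, mirroring the tuples
# appended above for the same toy table and threshold as the earlier
# prefix-index sketch:
_index = {'aa': [(0, 0), (1, 0)], 'bb': [(0, 1)],
          'ee': [(2, 0)], 'ff': [(2, 1)]}
_size_cache = [3, 2, 3]

# Probing with a token yields candidates together with the matched prefix
# position and the indexed set size, which is exactly what the position
# filter needs to upper-bound the achievable overlap without touching the
# full token lists.
for _row_id, _pos in _index.get('aa', []):
    print(_row_id, _pos, _size_cache[_row_id])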
def build_index(self):
    for row in self.table.itertuples():
        token_list = list(row[self.join_attr])
        ordered_token_list = order_using_token_ordering(token_list,
                                                        self.token_ordering)
        num_tokens = len(ordered_token_list)
        prefix_length = int(num_tokens -
                            ceil(self.threshold * num_tokens) +
                            self.prefix_scheme)
        i = 0
        for token in ordered_token_list:
            if i == prefix_length:
                break
            if self.prefix_index.get(token) is None:
                self.prefix_index[token] = []
            self.prefix_index[token].append(row[self.id_attr])
            i += 1
def build_delta_indexes(self):
    self.delta_indexes.append({})
    self.delta_indexes.append({})
    num_delta_indexes = 1
    for idx, row in self.table.iterrows():
        id = row[self.id_attr]
        token_list = list(row[self.join_attr])
        ordered_token_list = order_using_token_ordering(token_list,
                                                        self.token_ordering)
        num_tokens = len(ordered_token_list)
        if num_tokens < self.min_len:
            self.min_len = num_tokens
        if num_tokens > self.max_len:
            self.max_len = num_tokens
        self.size_map[id] = num_tokens

        t = ceil(self.threshold * num_tokens)
        one_prefix_length = int(num_tokens - t + 1)

        for i in xrange(0, one_prefix_length):
            if self.delta_indexes[1].get(ordered_token_list[i]) is None:
                self.delta_indexes[1][ordered_token_list[i]] = []
            self.delta_indexes[1][ordered_token_list[i]].append((id, i))

        j = 2
        for i in xrange(one_prefix_length, num_tokens):
            if j > t:
                break
            if num_delta_indexes < j:
                self.delta_indexes.append({})
                num_delta_indexes += 1
            if self.delta_indexes[j].get(ordered_token_list[i]) is None:
                self.delta_indexes[j][ordered_token_list[i]] = []
            self.delta_indexes[j][ordered_token_list[i]].append((id, i))
            j += 1

    self.avg_len = (self.min_len + self.max_len) / 2
    return True
def build_index(self):
    if self.adaptive_prefix:
        return self.build_delta_indexes()

    for idx, row in self.table.iterrows():
        id = row[self.id_attr]
        token_list = list(row[self.join_attr])
        ordered_token_list = order_using_token_ordering(token_list,
                                                        self.token_ordering)
        num_tokens = len(ordered_token_list)
        self.size_map[id] = num_tokens
        prefix_length = int(num_tokens -
                            ceil(self.threshold * num_tokens) +
                            self.prefix_scheme)
        i = 0
        for token in ordered_token_list:
            if i == prefix_length:
                break
            if self.position_index.get(token) is None:
                self.position_index[token] = []
            self.position_index[token].append((id, i))
            i += 1
    return True
def _edit_distance_join_split(ltable_list, rtable_list,
                              l_columns, r_columns,
                              l_key_attr, r_key_attr,
                              l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op,
                              l_out_attrs, r_out_attrs,
                              l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Perform edit distance join for a split of ltable and rtable"""
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_list, rtable_list],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # cache l_join_attr lengths
    l_join_attr_list = []
    for row in ltable_list:
        l_join_attr_list.append(len(row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_list, l_join_attr_index, tokenizer,
                               sim_measure_type, threshold, token_ordering)
    prefix_index.build(False)

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)

    comp_fn = COMP_OP_MAP[comp_op]
    sim_fn = get_sim_function(sim_measure_type)

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]
        r_len = len(r_string)

        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # obtain candidates by applying prefix filter.
        candidates = prefix_filter.find_candidates(r_ordered_tokens,
                                                   prefix_index)

        for cand in candidates:
            if r_len - threshold <= l_join_attr_list[cand] <= r_len + threshold:
                l_row = ltable_list[cand]

                # compute the actual edit distance
                edit_dist = sim_fn(l_row[l_join_attr_index], r_string)

                if comp_fn(edit_dist, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         l_row, r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [l_row[l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the edit distance
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(edit_dist)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
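# The length guard in the candidate loop above is the standard edit distance
# bound, sketched standalone here (hypothetical helper name): every single
# edit changes a string's length by at most one, so edit_distance(a, b) <= k
# implies abs(len(a) - len(b)) <= k.
def _within_length_bound(l_len, r_len, threshold):
    return r_len - threshold <= l_len <= r_len + threshold

assert _within_length_bound(9, 10, 2)        # lengths cannot rule it out
assert not _within_length_bound(7, 10, 2)    # at least 3 edits needed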
def filter_pair(self, lstring, rstring):
    """Checks if the input strings get dropped by the position filter.

    Args:
        lstring,rstring (string): input strings

    Returns:
        A flag indicating whether the string pair is dropped (boolean).
    """
    # If one of the inputs is missing, then check the allow_missing flag.
    # If it is set to True, then pass the pair. Else drop the pair.
    if pd.isnull(lstring) or pd.isnull(rstring):
        return (not self.allow_missing)

    # tokenize input strings
    ltokens = self.tokenizer.tokenize(lstring)
    rtokens = self.tokenizer.tokenize(rstring)

    l_num_tokens = len(ltokens)
    r_num_tokens = len(rtokens)

    if l_num_tokens == 0 and r_num_tokens == 0:
        if self.sim_measure_type == 'OVERLAP':
            return True
        elif self.sim_measure_type == 'EDIT_DISTANCE':
            return False
        else:
            return (not self.allow_empty)

    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    l_prefix_length = get_prefix_length(l_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(r_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    if l_prefix_length <= 0 or r_prefix_length <= 0:
        return True

    # map each token in the left prefix to its position
    l_prefix_dict = {}
    l_pos = 0
    for token in ordered_ltokens[0:l_prefix_length]:
        l_prefix_dict[token] = l_pos
        l_pos += 1

    overlap_threshold = get_overlap_threshold(l_num_tokens, r_num_tokens,
                                              self.sim_measure_type,
                                              self.threshold,
                                              self.tokenizer)
    current_overlap = 0
    r_pos = 0
    for token in ordered_rtokens[0:r_prefix_length]:
        l_pos = l_prefix_dict.get(token)
        if l_pos is not None:
            overlap_upper_bound = 1 + min(l_num_tokens - l_pos - 1,
                                          r_num_tokens - r_pos - 1)
            if (current_overlap + overlap_upper_bound) < overlap_threshold:
                return True
            current_overlap += 1
        r_pos += 1

    if current_overlap > 0:
        return False
    return True
def filter_tables(self, ltable, rtable,
                  l_key_attr, r_key_attr,
                  l_filter_attr, r_filter_attr,
                  l_out_attrs=None, r_out_attrs=None,
                  l_out_prefix='l_', r_out_prefix='r_'):
    """Filter tables with suffix filter.

    Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_filter_attr, r_filter_attr : String, filter attribute from ltable
            and rtable
        l_out_attrs, r_out_attrs : list of attributes to be included in the
            output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the
            attribute names of the output table

    Returns:
        result : Pandas data frame
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and filter attributes exist
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_filter_attr, ltable.columns,
                  'filter attribute', 'left table')
    validate_attr(r_filter_attr, rtable.columns,
                  'filter attribute', 'right table')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_filter_attr_index, r_filter_attr_index],
                         self.tokenizer, self.sim_measure_type)

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(ltable))

    for l_row in ltable_dict.values():
        l_id = l_row[l_key_attr_index]
        l_string = str(l_row[l_filter_attr_index])
        # check for empty string
        if not l_string:
            continue
        ltokens = tokenize(l_string, self.tokenizer, self.sim_measure_type)
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        l_num_tokens = len(ordered_ltokens)
        l_prefix_length = get_prefix_length(l_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)
        l_suffix = ordered_ltokens[l_prefix_length:]

        for r_row in rtable_dict.values():
            r_id = r_row[r_key_attr_index]
            r_string = str(r_row[r_filter_attr_index])
            # check for empty string
            if not r_string:
                continue
            rtokens = tokenize(r_string, self.tokenizer,
                               self.sim_measure_type)
            ordered_rtokens = order_using_token_ordering(rtokens,
                                                         token_ordering)
            r_num_tokens = len(ordered_rtokens)
            r_prefix_length = get_prefix_length(r_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold,
                                                self.tokenizer)

            if not self._filter_suffix(l_suffix,
                                       ordered_rtokens[r_prefix_length:],
                                       l_prefix_length, r_prefix_length,
                                       l_num_tokens, r_num_tokens):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable_dict[l_id], r_row, l_id, r_id,
                                     l_out_attrs_indices, r_out_attrs_indices)
                    output_rows.append(output_row)
                else:
                    output_rows.append([l_id, r_id])

        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    output_table.insert(0, '_id', range(0, len(output_table)))
    return output_table
def jaccard_join(ltable, rtable,
                 l_id_attr, l_join_attr,
                 r_id_attr, r_join_attr,
                 threshold,
                 ltable_output_attrs=None,
                 rtable_output_attrs=None,
                 filters=[]):
    if len(filters) == 0:
        return jaccard_join_auto(ltable, rtable,
                                 l_id_attr, l_join_attr,
                                 r_id_attr, r_join_attr,
                                 threshold,
                                 ltable_output_attrs, rtable_output_attrs)

    matches_list = []
    sim_function = get_jaccard_fn()
    (index_filters, non_index_filters,
     token_ordering, need_ordering) = analyze_filters(filters)

    l_join_attr_token_dict = {}

    prog_bar = pyprind.ProgBar(len(rtable.index))

    l_row_dict = {}
    for idx, l_row in ltable.iterrows():
        l_id = l_row[l_id_attr]
        l_row_dict[l_id] = l_row
        l_join_attr_token_dict[l_id] = order_using_token_ordering(
            list(l_row[l_join_attr]), token_ordering)

    r_row_dict = {}
    for idx, r_row in rtable.iterrows():
        r_id = r_row[r_id_attr]
        r_row_dict[r_id] = r_row

    for r_id in r_row_dict.keys():
        r_row = r_row_dict[r_id]
        r_tokens = order_using_token_ordering(list(r_row[r_join_attr]),
                                              token_ordering)
        r_num_tokens = len(r_tokens)

        l_cand_ids = apply_index_filters(r_tokens, r_num_tokens, threshold,
                                         index_filters)
        for l_id in l_cand_ids:
            l_row = l_row_dict[l_id]
            l_tokens = l_join_attr_token_dict[l_id]
            if apply_non_index_filters(l_tokens, r_tokens,
                                       len(l_tokens), r_num_tokens,
                                       threshold, non_index_filters):
                if sim_function(l_row[l_join_attr],
                                r_row[r_join_attr]) >= threshold:
                    match_dict = get_output_attributes(l_row, r_row,
                                                       l_id_attr, l_id,
                                                       r_id_attr, r_id,
                                                       ltable_output_attrs,
                                                       rtable_output_attrs)
                    matches_list.append(match_dict)
        prog_bar.update()

    output_matches = pd.DataFrame(matches_list)
    return output_matches
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         suffix_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_filter_attr_index, r_filter_attr_index],
                         suffix_filter.tokenizer,
                         suffix_filter.sim_measure_type)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.
    handle_empty = (suffix_filter.allow_empty and
                    suffix_filter.sim_measure_type not in ['OVERLAP',
                                                           'EDIT_DISTANCE'])

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(ltable))

    for l_row in ltable:
        l_string = l_row[l_filter_attr_index]

        ltokens = suffix_filter.tokenizer.tokenize(l_string)
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        l_num_tokens = len(ordered_ltokens)
        l_prefix_length = get_prefix_length(l_num_tokens,
                                            suffix_filter.sim_measure_type,
                                            suffix_filter.threshold,
                                            suffix_filter.tokenizer)
        l_suffix = ordered_ltokens[l_prefix_length:]

        for r_row in rtable:
            r_string = r_row[r_filter_attr_index]

            rtokens = suffix_filter.tokenizer.tokenize(r_string)
            ordered_rtokens = order_using_token_ordering(rtokens,
                                                         token_ordering)
            r_num_tokens = len(ordered_rtokens)

            # If allow_empty flag is set, then add the pair to the output.
            if handle_empty and l_num_tokens == 0 and r_num_tokens == 0:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     l_row, r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [l_row[l_key_attr_index],
                                  r_row[r_key_attr_index]]
                output_rows.append(output_row)
                continue

            r_prefix_length = get_prefix_length(
                                  r_num_tokens,
                                  suffix_filter.sim_measure_type,
                                  suffix_filter.threshold,
                                  suffix_filter.tokenizer)

            if l_prefix_length <= 0 or r_prefix_length <= 0:
                continue

            if not suffix_filter._filter_suffix(
                       l_suffix, ordered_rtokens[r_prefix_length:],
                       l_prefix_length, r_prefix_length,
                       l_num_tokens, r_num_tokens):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     l_row, r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [l_row[l_key_attr_index],
                                  r_row[r_key_attr_index]]
                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _edit_dist_join_split(ltable, rtable,
                          l_key_attr, r_key_attr,
                          l_join_attr, r_join_attr,
                          tokenizer, threshold,
                          l_out_attrs, r_out_attrs,
                          l_out_prefix, r_out_prefix, out_sim_score):
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of l_join_attr lengths
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = len(
            str(row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_dict.values(),
                               l_key_attr_index, l_join_attr_index,
                               tokenizer, sim_measure_type,
                               threshold, token_ordering)
    prefix_index.build()

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        r_len = len(r_string)
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)

        candidates = find_candidates_prefix_filter(
                         r_ordered_tokens, len(r_ordered_tokens),
                         prefix_filter, prefix_index)
        for cand in candidates:
            if r_len - threshold <= l_join_attr_dict[cand] <= r_len + threshold:
                edit_dist = sim_fn(str(ltable_dict[cand][l_join_attr_index]),
                                   r_string)
                if edit_dist <= threshold:
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable_dict[cand], r_row,
                                         cand, r_id,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)
                    else:
                        output_row = [cand, r_id]
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)
        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _filter_tables_split(ltable, rtable,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         prefix_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix):
    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_filter_attr_index, r_filter_attr_index],
                         prefix_filter.tokenizer,
                         prefix_filter.sim_measure_type)

    # Build prefix index on l_filter_attr
    prefix_index = PrefixIndex(ltable_dict.values(),
                               l_key_attr_index, l_filter_attr_index,
                               prefix_filter.tokenizer,
                               prefix_filter.sim_measure_type,
                               prefix_filter.threshold, token_ordering)
    prefix_index.build()

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_filter_attr_tokens = tokenize(r_string,
                                        prefix_filter.tokenizer,
                                        prefix_filter.sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)

        # probe prefix index and find candidates
        candidates = _find_candidates(r_ordered_tokens,
                                      len(r_ordered_tokens),
                                      prefix_filter, prefix_index)

        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                 ltable_dict[cand], r_row, cand, r_id,
                                 l_out_attrs_indices, r_out_attrs_indices)
                output_rows.append(output_row)
            else:
                output_rows.append([cand, r_id])

        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def set_sim_join(ltable, rtable,
                 l_columns, r_columns,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op,
                 allow_empty,
                 l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable"""
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in
    # l_join_attr multiple times when we need to compute the similarity
    # measure. Further, we cache the empty record ids to handle the
    # allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has an
        # empty set of tokens in the join attribute, then generate output
        # pairs joining the current rtable record with those records in
        # ltable with an empty set of tokens in the join attribute. These
        # ltable record ids are cached in the l_empty_records list, which
        # was constructed when building the position index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable[cand], r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _set_sim_join_split(ltable, rtable,
                        l_key_attr, r_key_attr,
                        l_join_attr, r_join_attr,
                        tokenizer, sim_measure_type, threshold,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix, out_sim_score):
    """Perform set similarity join for a split of ltable and rtable"""
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of tokenized l_join_attr
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer,
                     sim_measure_type),
            token_ordering)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            sim_measure_type,
                                            threshold,
                                            tokenizer)

        candidate_overlap = find_candidates_position_filter(
                                r_ordered_tokens, r_num_tokens,
                                r_prefix_length, pos_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(l_num_tokens,
                                                    sim_measure_type,
                                                    threshold,
                                                    tokenizer)
                if not suffix_filter._filter_suffix(
                           l_ordered_tokens[l_prefix_length:],
                           r_ordered_tokens[r_prefix_length:],
                           l_prefix_length, r_prefix_length,
                           l_num_tokens, r_num_tokens):
                    sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                    if sim_score >= threshold:
                        if has_output_attributes:
                            output_row = get_output_row_from_tables(
                                             ltable_dict[cand], r_row,
                                             cand, r_id,
                                             l_out_attrs_indices,
                                             r_out_attrs_indices)
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
                        else:
                            output_row = [cand, r_id]
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
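# For set measures, the sim_fn resolved by get_sim_function reduces, in the
# Jaccard case, to the sketch below (hypothetical helper name). The cascade
# above only pays this full-set cost for pairs that survive both the
# position filter and the suffix filter; for the empty-empty case the join
# code above reports a score of 1.0, matching this definition.
def _jaccard(l_tokens, r_tokens):
    l_set, r_set = set(l_tokens), set(r_tokens)
    if not l_set and not r_set:
        return 1.0
    return len(l_set & r_set) / float(len(l_set | r_set))

assert _jaccard(['aa', 'bb', 'cc'], ['aa', 'bb', 'dd']) == 0.5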
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         position_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_filter_attr_index, r_filter_attr_index],
                         position_filter.tokenizer,
                         position_filter.sim_measure_type)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.
    handle_empty = (position_filter.allow_empty and
                    position_filter.sim_measure_type not in ['OVERLAP',
                                                             'EDIT_DISTANCE'])

    # Build position index on l_filter_attr
    position_index = PositionIndex(ltable, l_filter_attr_index,
                                   position_filter.tokenizer,
                                   position_filter.sim_measure_type,
                                   position_filter.threshold, token_ordering)
    # While building the index, we cache the record ids with an empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = position_index.build(handle_empty)
    l_empty_records = cached_data['empty_records']

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_filter_attr_index]

        r_filter_attr_tokens = position_filter.tokenizer.tokenize(r_string)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)

        # If allow_empty flag is set and the current rtable record has an
        # empty set of tokens in the filter attribute, then generate output
        # pairs joining the current rtable record with those records in
        # ltable with an empty set of tokens in the filter attribute. These
        # ltable record ids are cached in the l_empty_records list, which
        # was constructed when building the position index.
        if handle_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                output_rows.append(output_row)
            continue

        candidate_overlap = position_filter.find_candidates(r_ordered_tokens,
                                                            position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[cand], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[cand][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table