def filter_pair(self, lstring, rstring):
    """Filter two strings with suffix filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # An empty string can never reach the similarity threshold,
    # so drop the pair immediately.
    if not lstring or not rstring:
        return True

    ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
    rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

    # Impose a global token ordering and sort both token lists by it.
    ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, ordering)

    l_size = len(ordered_ltokens)
    r_size = len(ordered_rtokens)

    # Compute how many leading tokens form each string's prefix.
    l_prefix_len = get_prefix_length(l_size, self.sim_measure_type,
                                     self.threshold, self.tokenizer)
    r_prefix_len = get_prefix_length(r_size, self.sim_measure_type,
                                     self.threshold, self.tokenizer)

    # Delegate to the suffix-filtering routine on the tokens that
    # remain after stripping each prefix.
    return self._filter_suffix(ordered_ltokens[l_prefix_len:],
                               ordered_rtokens[r_prefix_len:],
                               l_prefix_len, r_prefix_len,
                               l_size, r_size)
def filter_pair(self, lstring, rstring):
    """Filter two strings with prefix filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # check for empty string
    if (not lstring) or (not rstring):
        return True

    ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
    rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    l_prefix_length = get_prefix_length(len(ordered_ltokens),
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(len(ordered_rtokens),
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    # Drop the pair exactly when the two prefixes share no token.
    # isdisjoint short-circuits on the first common token and avoids
    # materializing a second set and an intersection set, unlike
    # set(...).intersection(set(...)) followed by a length check.
    return set(ordered_ltokens[0:l_prefix_length]).isdisjoint(
        ordered_rtokens[0:r_prefix_length])
def filter_pair(self, lstring, rstring):
    """Filter two strings with position filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # check for empty string
    if (not lstring) or (not rstring):
        return True

    ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
    rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    l_num_tokens = len(ordered_ltokens)
    r_num_tokens = len(ordered_rtokens)

    l_prefix_length = get_prefix_length(l_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(r_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    # Map each token in the left prefix to its position.
    # BUG FIX: the original never advanced l_pos, so every prefix token
    # was recorded at position 0, which inflated overlap_upper_bound
    # below and weakened the filter's pruning power.
    l_prefix_dict = {}
    for l_pos, token in enumerate(ordered_ltokens[0:l_prefix_length]):
        l_prefix_dict[token] = l_pos

    # Minimum overlap the pair must achieve to satisfy the threshold.
    overlap_threshold = get_overlap_threshold(l_num_tokens, r_num_tokens,
                                              self.sim_measure_type,
                                              self.threshold,
                                              self.tokenizer)
    current_overlap = 0
    r_pos = 0
    for token in ordered_rtokens[0:r_prefix_length]:
        l_pos = l_prefix_dict.get(token)
        if l_pos is not None:
            # Upper bound on total overlap: the match just found plus
            # the tokens remaining after the matched positions.
            overlap_upper_bound = 1 + min(l_num_tokens - l_pos - 1,
                                          r_num_tokens - r_pos - 1)
            # Even the best case cannot reach the threshold: drop.
            if (current_overlap + overlap_upper_bound) < overlap_threshold:
                return True
            current_overlap += 1
        r_pos += 1

    # Survive only if at least one prefix token matched.
    if current_overlap > 0:
        return False
    return True
def filter_pair(self, lstring, rstring):
    """Checks if the input strings get dropped by the suffix filter.

    Args:
        lstring,rstring (string): input strings

    Returns:
        A flag indicating whether the string pair is dropped (boolean).
    """
    # A missing value passes only when allow_missing is set.
    if pd.isnull(lstring) or pd.isnull(rstring):
        return not self.allow_missing

    # tokenize input strings
    ltokens = self.tokenizer.tokenize(lstring)
    rtokens = self.tokenizer.tokenize(rstring)
    l_num_tokens = len(ltokens)
    r_num_tokens = len(rtokens)

    # Both sides tokenize to nothing: the outcome depends on the
    # similarity measure in use.
    if l_num_tokens == 0 and r_num_tokens == 0:
        if self.sim_measure_type == 'OVERLAP':
            return True
        if self.sim_measure_type == 'EDIT_DISTANCE':
            return False
        return not self.allow_empty

    # Order both token lists under a shared global token ordering.
    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    # compute prefix length
    l_prefix_length = get_prefix_length(l_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(r_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    # A non-positive prefix on either side means the pair cannot
    # qualify; drop it.
    if l_prefix_length <= 0 or r_prefix_length <= 0:
        return True

    # Apply the suffix filter to the tokens beyond each prefix.
    return self._filter_suffix(ordered_ltokens[l_prefix_length:],
                               ordered_rtokens[r_prefix_length:],
                               l_prefix_length, r_prefix_length,
                               l_num_tokens, r_num_tokens)
def find_candidates(self, probe_tokens, position_index):
    # Probe the position index to find candidate records for the input
    # probe tokens, returning a dict mapping candidate id -> overlap
    # count (-1 marks a candidate pruned by the position filter).
    if not position_index.index:
        return {}

    probe_num_tokens = len(probe_tokens)

    # Size-filter bounds, clamped to the range of sizes actually
    # present in the index.
    size_lower_bound = max(get_size_lower_bound(probe_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold),
                           position_index.min_length)
    size_upper_bound = min(get_size_upper_bound(probe_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold),
                           position_index.max_length)

    # cache overlap threshold lower bound values to avoid recomputing them
    # multiple times when probing the position index.
    overlap_threshold_cache = {}
    for size in xrange(size_lower_bound, size_upper_bound + 1):
        overlap_threshold_cache[size] = get_overlap_threshold(
                                            size, probe_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)

    # Number of leading probe tokens that must be checked.
    probe_prefix_length = get_prefix_length(probe_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)

    # probe position index and find candidates
    candidate_overlap = {}
    probe_pos = 0
    for token in probe_tokens[0:probe_prefix_length]:
        for (cand, cand_pos) in position_index.probe(token):
            current_overlap = candidate_overlap.get(cand, 0)
            # -1 means this candidate was already pruned; skip it.
            if current_overlap != -1:
                cand_num_tokens = position_index.size_cache[cand]
                # only consider candidates satisfying the size filter
                # condition.
                if size_lower_bound <= cand_num_tokens <= size_upper_bound:
                    # Best-case remaining overlap is limited by the
                    # shorter remaining token run of the two records.
                    if (probe_num_tokens - probe_pos <=
                            cand_num_tokens - cand_pos):
                        overlap_upper_bound = probe_num_tokens - probe_pos
                    else:
                        overlap_upper_bound = cand_num_tokens - cand_pos
                    # only consider candidates for which the overlap upper
                    # bound is at least the required overlap.
                    if (current_overlap + overlap_upper_bound >=
                            overlap_threshold_cache[cand_num_tokens]):
                        candidate_overlap[cand] = current_overlap + 1
                    else:
                        # Mark as pruned so later tokens skip it.
                        candidate_overlap[cand] = -1
        probe_pos += 1
    return candidate_overlap
def build(self, cache_empty_records=True):
    """Build prefix index.

    Args:
        cache_empty_records : boolean; when True, record the row ids of
            rows whose attribute tokenizes to zero tokens.

    Returns:
        A dict with key 'empty_records' mapping to the list of row ids
        with empty token lists (empty when caching is disabled).
    """
    self.index = {}
    empty_records = []
    # enumerate replaces the manual row_id counter.
    for row_id, row in enumerate(self.table):
        # tokenize string and order the tokens using the token ordering
        index_string = row[self.index_attr]
        index_attr_tokens = order_using_token_ordering(
            self.tokenizer.tokenize(index_string), self.token_ordering)

        # compute prefix length
        num_tokens = len(index_attr_tokens)
        prefix_length = get_prefix_length(
                            num_tokens,
                            self.sim_measure_type, self.threshold,
                            self.tokenizer)

        # update index: setdefault performs one dict lookup instead of
        # the get-check-get-append sequence in the original.
        for token in index_attr_tokens[0:prefix_length]:
            self.index.setdefault(token, []).append(row_id)

        if cache_empty_records and num_tokens == 0:
            empty_records.append(row_id)

    return {'empty_records': empty_records}
def build(self, cache_empty_records=True):
    """Build prefix index.

    Args:
        cache_empty_records : boolean; when True, record the row ids of
            rows whose attribute tokenizes to zero tokens.

    Returns:
        A dict with key 'empty_records' mapping to the list of row ids
        with empty token lists (empty when caching is disabled).
    """
    self.index = {}
    empty_records = []
    # enumerate replaces the manual row_id counter.
    for row_id, row in enumerate(self.table):
        # tokenize string and order the tokens using the token ordering
        index_string = row[self.index_attr]
        index_attr_tokens = order_using_token_ordering(
            self.tokenizer.tokenize(index_string), self.token_ordering)

        # compute prefix length
        num_tokens = len(index_attr_tokens)
        prefix_length = get_prefix_length(num_tokens,
                                          self.sim_measure_type,
                                          self.threshold,
                                          self.tokenizer)

        # update index: setdefault performs one dict lookup instead of
        # the get-check-get-append sequence in the original.
        for token in index_attr_tokens[0:prefix_length]:
            self.index.setdefault(token, []).append(row_id)

        if cache_empty_records and num_tokens == 0:
            empty_records.append(row_id)

    return {'empty_records': empty_records}
def build(self):
    """Build a position index keyed by the table's key attribute.

    Populates self.index (token -> list of (row_id, position) pairs for
    tokens in each record's prefix) and self.size_map (row_id -> token
    count).

    Returns:
        True on completion.
    """
    for row in self.table:
        index_string = str(row[self.index_attr])
        # check for empty string
        if not index_string:
            continue

        # tokenize and order the tokens under the global token ordering
        index_attr_tokens = order_using_token_ordering(tokenize(
                                index_string,
                                self.tokenizer,
                                self.sim_measure_type),
                            self.token_ordering)
        num_tokens = len(index_attr_tokens)
        prefix_length = get_prefix_length(
                            num_tokens,
                            self.sim_measure_type, self.threshold,
                            self.tokenizer)

        row_id = row[self.key_attr]
        # enumerate replaces the manual pos counter; setdefault performs
        # one dict lookup instead of get-check-get-append.
        for pos, token in enumerate(index_attr_tokens[0:prefix_length]):
            self.index.setdefault(token, []).append((row_id, pos))

        self.size_map[row_id] = num_tokens

    return True
def _find_candidates(tokens, num_tokens, prefix_filter, prefix_index):
    # Gather every record id indexed under any token in the probe's prefix.
    prefix_length = get_prefix_length(num_tokens,
                                      prefix_filter.sim_measure_type,
                                      prefix_filter.threshold,
                                      prefix_filter.tokenizer)
    candidates = set()
    for token in tokens[0:prefix_length]:
        # set.update absorbs all ids returned by the index probe at once.
        candidates.update(prefix_index.probe(token))
    return candidates
def find_candidates(self, probe_tokens, prefix_index):
    # Probe the prefix index and return the set of candidate record ids
    # sharing at least one prefix token with the probe tokens.
    if not prefix_index.index:
        # Nothing indexed: no candidates to return.
        return set()

    prefix_length = get_prefix_length(len(probe_tokens),
                                      self.sim_measure_type,
                                      self.threshold,
                                      self.tokenizer)
    candidates = set()
    for token in probe_tokens[0:prefix_length]:
        for cand in prefix_index.probe(token):
            candidates.add(cand)
    return candidates
def build(self, cache_empty_records=True, cache_tokens=False):
    """Build position index.

    Args:
        cache_empty_records : boolean; when True, record row ids whose
            attribute tokenizes to zero tokens.
        cache_tokens : boolean; when True, also return each row's
            ordered token list.

    Returns:
        A dict with keys 'cached_tokens' (list of token lists, empty
        unless cache_tokens is True) and 'empty_records' (list of row
        ids with empty token lists).
    """
    self.index = {}
    self.size_cache = []
    cached_tokens = []
    empty_records = []
    # enumerate replaces the manual row_id counter.
    for row_id, row in enumerate(self.table):
        # tokenize string and order the tokens using the token ordering
        index_string = row[self.index_attr]
        index_attr_tokens = order_using_token_ordering(
            self.tokenizer.tokenize(index_string), self.token_ordering)

        # compute prefix length
        num_tokens = len(index_attr_tokens)
        prefix_length = get_prefix_length(
                            num_tokens,
                            self.sim_measure_type, self.threshold,
                            self.tokenizer)

        # update the index: enumerate replaces the manual pos counter;
        # setdefault performs one dict lookup instead of the
        # get-check-get-append sequence in the original.
        for pos, token in enumerate(index_attr_tokens[0:prefix_length]):
            self.index.setdefault(token, []).append((row_id, pos))

        self.size_cache.append(num_tokens)

        # keep track of the max size and min size.
        self.min_length = min(self.min_length, num_tokens)
        self.max_length = max(self.max_length, num_tokens)

        # if cache_tokens flag is set to True, then store the tokens.
        if cache_tokens:
            cached_tokens.append(index_attr_tokens)

        if cache_empty_records and num_tokens == 0:
            empty_records.append(row_id)

    return {'cached_tokens': cached_tokens,
            'empty_records': empty_records}
def find_candidates(self, probe_tokens, prefix_index):
    # Probe the prefix index and return candidate ids for the probe tokens.
    if not prefix_index.index:
        # Empty index yields no candidates.
        return set()

    probe_prefix_length = get_prefix_length(len(probe_tokens),
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)
    # Collect every id indexed under any token of the probe's prefix.
    return {cand
            for token in probe_tokens[0:probe_prefix_length]
            for cand in prefix_index.probe(token)}
def filter_pair(self, lstring, rstring):
    """Checks if the input strings get dropped by the position filter.

    Args:
        lstring,rstring (string): input strings

    Returns:
        A flag indicating whether the string pair is dropped (boolean).
    """
    # If one of the inputs is missing, then check the allow_missing flag.
    # If it is set to True, then pass the pair. Else drop the pair.
    if pd.isnull(lstring) or pd.isnull(rstring):
        return (not self.allow_missing)

    # tokenize input strings
    ltokens = self.tokenizer.tokenize(lstring)
    rtokens = self.tokenizer.tokenize(rstring)

    l_num_tokens = len(ltokens)
    r_num_tokens = len(rtokens)

    if l_num_tokens == 0 and r_num_tokens == 0:
        if self.sim_measure_type == 'OVERLAP':
            return True
        elif self.sim_measure_type == 'EDIT_DISTANCE':
            return False
        else:
            return (not self.allow_empty)

    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    l_prefix_length = get_prefix_length(l_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(r_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    if l_prefix_length <= 0 or r_prefix_length <= 0:
        return True

    # Map each token in the left prefix to its position.
    # BUG FIX: the original never advanced l_pos, so every prefix token
    # was recorded at position 0, which inflated overlap_upper_bound
    # below and weakened the filter's pruning power.
    l_prefix_dict = {}
    for l_pos, token in enumerate(ordered_ltokens[0:l_prefix_length]):
        l_prefix_dict[token] = l_pos

    # Minimum overlap the pair must achieve to satisfy the threshold.
    overlap_threshold = get_overlap_threshold(l_num_tokens, r_num_tokens,
                                              self.sim_measure_type,
                                              self.threshold,
                                              self.tokenizer)
    current_overlap = 0
    r_pos = 0
    for token in ordered_rtokens[0:r_prefix_length]:
        l_pos = l_prefix_dict.get(token)
        if l_pos is not None:
            # Upper bound on total overlap: the match just found plus
            # the tokens remaining after the matched positions.
            overlap_upper_bound = 1 + min(l_num_tokens - l_pos - 1,
                                          r_num_tokens - r_pos - 1)
            # Even the best case cannot reach the threshold: drop.
            if (current_overlap + overlap_upper_bound) < overlap_threshold:
                return True
            current_overlap += 1
        r_pos += 1

    # Survive only if at least one prefix token matched.
    if current_overlap > 0:
        return False
    return True
def filter_tables(self, ltable, rtable,
                  l_key_attr, r_key_attr,
                  l_filter_attr, r_filter_attr,
                  l_out_attrs=None, r_out_attrs=None,
                  l_out_prefix='l_', r_out_prefix='r_'):
    """Filter tables with suffix filter.

    Compares every row pair across the two tables (nested loop) and
    keeps a pair only when the suffix filter does not drop it.

    Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_filter_attr, r_filter_attr : String, filter attribute from ltable
            and rtable
        l_out_attrs, r_out_attrs : list of attribtues to be included in the
            output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the
            attribute names of the output table

    Returns:
        result : Pandas data frame
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and filter attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_filter_attr, ltable.columns,
                  'filter attribute', 'left table')
    validate_attr(r_filter_attr, rtable.columns,
                  'filter attribute', 'right table')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # find column indices of key attr, filter attr and
    # output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(
                              l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and
    # output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(
                              r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # generate token ordering using tokens in l_filter_attr
    # and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_filter_attr_index, r_filter_attr_index],
                         self.tokenizer, self.sim_measure_type)

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(ltable))

    # Outer loop over the left table; tokenization and prefix work for
    # the left row is hoisted out of the inner loop.
    for l_row in ltable_dict.values():
        l_id = l_row[l_key_attr_index]
        l_string = str(l_row[l_filter_attr_index])
        # check for empty string
        if not l_string:
            continue

        ltokens = tokenize(l_string, self.tokenizer, self.sim_measure_type)
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        l_num_tokens = len(ordered_ltokens)
        l_prefix_length = get_prefix_length(l_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)
        l_suffix = ordered_ltokens[l_prefix_length:]

        for r_row in rtable_dict.values():
            r_id = r_row[r_key_attr_index]
            r_string = str(r_row[r_filter_attr_index])
            # check for empty string
            if not r_string:
                continue

            rtokens = tokenize(r_string, self.tokenizer,
                               self.sim_measure_type)
            ordered_rtokens = order_using_token_ordering(rtokens,
                                                         token_ordering)
            r_num_tokens = len(ordered_rtokens)
            r_prefix_length = get_prefix_length(r_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold,
                                                self.tokenizer)
            # Keep the pair only when the suffix filter passes it.
            if not self._filter_suffix(
                       l_suffix,
                       ordered_rtokens[r_prefix_length:],
                       l_prefix_length, r_prefix_length,
                       l_num_tokens, r_num_tokens):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable_dict[l_id], r_row,
                                     l_id, r_id,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                    output_rows.append(output_row)
                else:
                    output_rows.append([l_id, r_id])

        prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    # prepend a surrogate id column
    output_table.insert(0, '_id', range(0, len(output_table)))
    return output_table
def _filter_tables_split(ltable, rtable,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         position_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix):
    """Apply the position filter over a split of the input tables.

    Builds a position index on ltable's filter attribute, probes it with
    each rtable row, and emits pairs whose prefix overlap is positive.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = []
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_filter_attr_index, r_filter_attr_index],
                         position_filter.tokenizer,
                         position_filter.sim_measure_type)

    # Build position index on l_filter_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_filter_attr_index,
                                   position_filter.tokenizer,
                                   position_filter.sim_measure_type,
                                   position_filter.threshold,
                                   token_ordering)
    position_index.build()

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable))

    # Probe the index with each right-table row.
    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string
        if not r_string:
            continue

        r_filter_attr_tokens = tokenize(r_string,
                                        position_filter.tokenizer,
                                        position_filter.sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            position_filter.sim_measure_type,
                                            position_filter.threshold,
                                            position_filter.tokenizer)

        # candidate_overlap maps candidate id -> overlap count
        # (non-positive values mean the candidate was pruned).
        candidate_overlap = _find_candidates(r_ordered_tokens, r_num_tokens,
                                             r_prefix_length,
                                             position_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable_dict[cand], r_row,
                                     cand, r_id,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                    output_rows.append(output_row)
                else:
                    output_rows.append([cand, r_id])

        prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _set_sim_join_split(ltable, rtable,
                        l_key_attr, r_key_attr,
                        l_join_attr, r_join_attr,
                        tokenizer, sim_measure_type, threshold,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix,
                        out_sim_score):
    """Perform set similarity join for a split of ltable and rtable"""
    # Filter-and-verify pipeline: position filter -> suffix filter ->
    # exact similarity computation.

    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of tokenized l_join_attr
    # (keyed by the left row's key value, reused in the verify step)
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer,
                     sim_measure_type), token_ordering)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string
        if not r_string:
            continue

        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            sim_measure_type,
                                            threshold,
                                            tokenizer)

        # Step 1: position filter produces candidates with overlap counts.
        candidate_overlap = find_candidates_position_filter(
                                r_ordered_tokens, r_num_tokens,
                                r_prefix_length, pos_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(
                                      l_num_tokens,
                                      sim_measure_type,
                                      threshold,
                                      tokenizer)
                # Step 2: suffix filter prunes further; step 3 verifies
                # with the exact similarity function.
                if not suffix_filter._filter_suffix(
                           l_ordered_tokens[l_prefix_length:],
                           r_ordered_tokens[r_prefix_length:],
                           l_prefix_length, r_prefix_length,
                           l_num_tokens, r_num_tokens):
                    sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                    if sim_score >= threshold:
                        if has_output_attributes:
                            output_row = get_output_row_from_tables(
                                             ltable_dict[cand], r_row,
                                             cand, r_id,
                                             l_out_attrs_indices,
                                             r_out_attrs_indices)
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
                        else:
                            output_row = [cand, r_id]
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)

        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         suffix_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    """Apply the suffix filter over a split of the input tables.

    Nested-loop comparison of every (l_row, r_row) pair; a pair is kept
    when the suffix filter passes it, or when both token lists are empty
    and the allow_empty flag applies.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_filter_attr_index, r_filter_attr_index],
                         suffix_filter.tokenizer,
                         suffix_filter.sim_measure_type)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.
    handle_empty = (suffix_filter.allow_empty and
                    suffix_filter.sim_measure_type not in ['OVERLAP',
                                                           'EDIT_DISTANCE'])

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(ltable))

    # Outer loop over the left split; left-side tokenization and prefix
    # computation are hoisted out of the inner loop.
    for l_row in ltable:
        l_string = l_row[l_filter_attr_index]

        ltokens = suffix_filter.tokenizer.tokenize(l_string)
        ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
        l_num_tokens = len(ordered_ltokens)
        l_prefix_length = get_prefix_length(l_num_tokens,
                                            suffix_filter.sim_measure_type,
                                            suffix_filter.threshold,
                                            suffix_filter.tokenizer)
        l_suffix = ordered_ltokens[l_prefix_length:]

        for r_row in rtable:
            r_string = r_row[r_filter_attr_index]

            rtokens = suffix_filter.tokenizer.tokenize(r_string)
            ordered_rtokens = order_using_token_ordering(rtokens,
                                                         token_ordering)
            r_num_tokens = len(ordered_rtokens)

            # If allow_empty flag is set, then add the pair to the output.
            if handle_empty and l_num_tokens == 0 and r_num_tokens == 0:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     l_row, r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [l_row[l_key_attr_index],
                                  r_row[r_key_attr_index]]
                output_rows.append(output_row)
                continue

            r_prefix_length = get_prefix_length(
                                  r_num_tokens,
                                  suffix_filter.sim_measure_type,
                                  suffix_filter.threshold,
                                  suffix_filter.tokenizer)

            # A non-positive prefix on either side means the pair
            # cannot qualify; skip it.
            if l_prefix_length <= 0 or r_prefix_length <= 0:
                continue

            # Keep the pair only when the suffix filter passes it.
            if not suffix_filter._filter_suffix(
                       l_suffix,
                       ordered_rtokens[r_prefix_length:],
                       l_prefix_length, r_prefix_length,
                       l_num_tokens, r_num_tokens):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     l_row, r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [l_row[l_key_attr_index],
                                  r_row[r_key_attr_index]]
                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table