def _edit_distance_join_split(ltable_list, rtable_list, l_columns, r_columns, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, comp_op, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score, show_progress): """Perform edit distance join for a split of ltable and rtable""" # find column indices of key attr, join attr and output attrs in ltable l_key_attr_index = l_columns.index(l_key_attr) l_join_attr_index = l_columns.index(l_join_attr) l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs) # find column indices of key attr, join attr and output attrs in rtable r_key_attr_index = r_columns.index(r_key_attr) r_join_attr_index = r_columns.index(r_join_attr) r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs) sim_measure_type = 'EDIT_DISTANCE' # generate token ordering using tokens in l_join_attr # and r_join_attr token_ordering = gen_token_ordering_for_tables( [ltable_list, rtable_list], [l_join_attr_index, r_join_attr_index], tokenizer, sim_measure_type) # cache l_join_attr lengths l_join_attr_list = [] for row in ltable_list: l_join_attr_list.append(len(row[l_join_attr_index])) # Build prefix index on l_join_attr prefix_index = PrefixIndex(ltable_list, l_join_attr_index, tokenizer, sim_measure_type, threshold, token_ordering) prefix_index.build(False) prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold) comp_fn = COMP_OP_MAP[comp_op] sim_fn = get_sim_function(sim_measure_type) output_rows = [] has_output_attributes = (l_out_attrs is not None or r_out_attrs is not None) if show_progress: prog_bar = pyprind.ProgBar(len(rtable_list)) for r_row in rtable_list: r_string = r_row[r_join_attr_index] r_len = len(r_string) r_ordered_tokens = order_using_token_ordering( tokenizer.tokenize(r_string), token_ordering) # obtain candidates by applying prefix filter. candidates = prefix_filter.find_candidates(r_ordered_tokens, prefix_index) for cand in candidates: if r_len - threshold <= l_join_attr_list[cand] <= r_len + threshold: l_row = ltable_list[cand] # compute the actual edit distance edit_dist = sim_fn(l_row[l_join_attr_index], r_string) if comp_fn(edit_dist, threshold): if has_output_attributes: output_row = get_output_row_from_tables( l_row, r_row, l_key_attr_index, r_key_attr_index, l_out_attrs_indices, r_out_attrs_indices) else: output_row = [ l_row[l_key_attr_index], r_row[r_key_attr_index] ] # if out_sim_score flag is set, append the edit distance # score to the output record. if out_sim_score: output_row.append(edit_dist) output_rows.append(output_row) if show_progress: prog_bar.update() output_header = get_output_header_from_tables(l_key_attr, r_key_attr, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix) if out_sim_score: output_header.append("_sim_score") # generate a dataframe from the list of output rows output_table = pd.DataFrame(output_rows, columns=output_header) return output_table
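# The candidate check in the loop above relies on a standard length bound for
# edit distance. A minimal, self-contained restatement of that pruning rule
# (the names here are illustrative, not part of the library):

def _length_filter_survives(l_string, r_string, threshold):
    # |len(l) - len(r)| <= threshold is a necessary condition for
    # edit_distance(l, r) <= threshold, so pairs outside this band can be
    # skipped before computing the (expensive) distance.
    return abs(len(l_string) - len(r_string)) <= threshold

assert _length_filter_survives('data', 'date', 1)
assert not _length_filter_survives('data', 'database', 1)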
def _overlap_coefficient_join_split(ltable_list, rtable_list, l_columns, r_columns, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, comp_op, allow_empty, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score, show_progress): """Perform overlap coefficient join for a split of ltable and rtable""" # find column indices of key attr, join attr and output attrs in ltable l_key_attr_index = l_columns.index(l_key_attr) l_join_attr_index = l_columns.index(l_join_attr) l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs) # find column indices of key attr, join attr and output attrs in rtable r_key_attr_index = r_columns.index(r_key_attr) r_join_attr_index = r_columns.index(r_join_attr) r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs) # Build inverted index over ltable inverted_index = InvertedIndex(ltable_list, l_join_attr_index, tokenizer, cache_size_flag=True) # While building the index, we cache the record ids with empty set of # tokens. This is needed to handle the allow_empty flag. cached_data = inverted_index.build(allow_empty) l_empty_records = cached_data['empty_records'] overlap_filter = OverlapFilter(tokenizer, 1) comp_fn = COMP_OP_MAP[comp_op] output_rows = [] has_output_attributes = (l_out_attrs is not None or r_out_attrs is not None) if show_progress: prog_bar = pyprind.ProgBar(len(rtable_list)) for r_row in rtable_list: r_string = r_row[r_join_attr_index] r_join_attr_tokens = tokenizer.tokenize(r_string) r_num_tokens = len(r_join_attr_tokens) # If allow_empty flag is set and the current rtable record has empty set # of tokens in the join attribute, then generate output pairs joining # the current rtable record with those records in ltable with empty set # of tokens in the join attribute. These ltable record ids are cached in # l_empty_records list which was constructed when building the inverted # index. if allow_empty and r_num_tokens == 0: for l_id in l_empty_records: if has_output_attributes: output_row = get_output_row_from_tables( ltable_list[l_id], r_row, l_key_attr_index, r_key_attr_index, l_out_attrs_indices, r_out_attrs_indices) else: output_row = [ltable_list[l_id][l_key_attr_index], r_row[r_key_attr_index]] if out_sim_score: output_row.append(1.0) output_rows.append(output_row) continue # probe inverted index and find overlap of candidates candidate_overlap = overlap_filter.find_candidates( r_join_attr_tokens, inverted_index) for cand, overlap in iteritems(candidate_overlap): # compute the actual similarity score sim_score = (float(overlap) / float(min(r_num_tokens, inverted_index.size_cache[cand]))) if comp_fn(sim_score, threshold): if has_output_attributes: output_row = get_output_row_from_tables( ltable_list[cand], r_row, l_key_attr_index, r_key_attr_index, l_out_attrs_indices, r_out_attrs_indices) else: output_row = [ltable_list[cand][l_key_attr_index], r_row[r_key_attr_index]] # if out_sim_score flag is set, append the overlap coefficient # score to the output record. if out_sim_score: output_row.append(sim_score) output_rows.append(output_row) if show_progress: prog_bar.update() output_header = get_output_header_from_tables(l_key_attr, r_key_attr, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix) if out_sim_score: output_header.append("_sim_score") output_table = pd.DataFrame(output_rows, columns=output_header) return output_table
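# A minimal sketch of the similarity computed above: the overlap coefficient
# of two token sets is |A intersect B| / min(|A|, |B|). The inverted index
# supplies the intersection size (`overlap`) and the ltable set size
# (size_cache), so only the division remains inside the candidate loop.

def _overlap_coefficient(l_tokens, r_tokens):
    l_set, r_set = set(l_tokens), set(r_tokens)
    if not l_set or not r_set:
        return 0.0
    return len(l_set & r_set) / float(min(len(l_set), len(r_set)))

print(_overlap_coefficient(['data', 'base'], ['data', 'bank']))  # 0.5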
def get_pairs_with_missing_value_disk(ltable, rtable, l_key_attr, r_key_attr,
                                      l_join_attr, r_join_attr, temp_dir,
                                      data_limit_per_core,
                                      missing_pairs_file_name,
                                      l_out_attrs=None, r_out_attrs=None,
                                      l_out_prefix='l_', r_out_prefix='r_',
                                      out_sim_score=False, show_progress=True):
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # find ltable records with missing value in l_join_attr
    ltable_missing = ltable[pd.isnull(ltable[l_join_attr])]

    # find ltable records which do not contain missing value in l_join_attr
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])]

    # find rtable records with missing value in r_join_attr
    rtable_missing = rtable[pd.isnull(rtable[r_join_attr])]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        print('Finding pairs with missing value...')
        prog_bar = pyprind.ProgBar(len(ltable_missing) + len(rtable_missing))

    # For each ltable record with missing value in l_join_attr,
    # output a pair corresponding to every record in rtable.
    for l_row in ltable_missing.itertuples(index=False):
        for r_row in rtable.itertuples(index=False):
            if has_output_attributes:
                record = get_output_row_from_tables(
                             l_row, r_row, l_key_attr_index, r_key_attr_index,
                             l_out_attrs_indices, r_out_attrs_indices)
            else:
                record = [l_row[l_key_attr_index], r_row[r_key_attr_index]]
            output_rows.append(record)

            # Flush the buffered rows to disk if they exceed the permissible
            # in-memory data limit.
            if len(output_rows) > data_limit_per_core:
                df = pd.DataFrame(output_rows)
                with open(missing_pairs_file_name, 'a+') as myfile:
                    df.to_csv(myfile, header=False, index=False)
                output_rows = []

        if show_progress:
            prog_bar.update()

    # if any rows are left in the buffer, flush them to disk so that no
    # output is lost.
    if len(output_rows) > 0:
        df = pd.DataFrame(output_rows)
        with open(missing_pairs_file_name, 'a+') as myfile:
            df.to_csv(myfile, header=False, index=False)
        output_rows = []

    # For each rtable record with missing value in r_join_attr,
    # output a pair corresponding to every record in ltable which
    # doesn't have a missing value in l_join_attr.
    for r_row in rtable_missing.itertuples(index=False):
        for l_row in ltable_not_missing.itertuples(index=False):
            if has_output_attributes:
                record = get_output_row_from_tables(
                             l_row, r_row, l_key_attr_index, r_key_attr_index,
                             l_out_attrs_indices, r_out_attrs_indices)
            else:
                record = [l_row[l_key_attr_index], r_row[r_key_attr_index]]
            if out_sim_score:
                record.append(np.nan)
            output_rows.append(record)

            # Flush the buffered rows to disk if they exceed the permissible
            # in-memory data limit.
            if len(output_rows) > data_limit_per_core:
                df = pd.DataFrame(output_rows)
                with open(missing_pairs_file_name, 'a+') as myfile:
                    df.to_csv(myfile, header=False, index=False)
                output_rows = []

        if show_progress:
            prog_bar.update()

    # if any rows are left in the buffer, flush them to disk so that no
    # output is lost.
    if len(output_rows) > 0:
        df = pd.DataFrame(output_rows)
        with open(missing_pairs_file_name, 'a+') as myfile:
            df.to_csv(myfile, header=False, index=False)

    return True
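# A minimal sketch of the buffered flush-to-disk pattern used above, assuming
# only pandas (the file name below is illustrative): rows accumulate in
# memory and are appended to a CSV whenever the buffer exceeds a limit, which
# bounds peak memory when the output is very large.
import pandas as pd

def _flush_rows(rows, file_name):
    pd.DataFrame(rows).to_csv(file_name, mode='a', header=False, index=False)

buffer, limit = [], 2
for pair in [(1, 'a'), (2, 'b'), (3, 'c')]:
    buffer.append(pair)
    if len(buffer) > limit:
        _flush_rows(buffer, 'missing_pairs.csv')
        buffer = []
if buffer:  # flush whatever is left so no rows are lost
    _flush_rows(buffer, 'missing_pairs.csv')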
def set_sim_join(ltable, rtable, l_columns, r_columns,
                 l_key_attr, r_key_attr, l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op, allow_empty,
                 l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable"""
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index, tokenizer,
                                   sim_measure_type, threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in
    # l_join_attr multiple times when we need to compute the similarity
    # measure. Further, we cache the empty record ids to handle the
    # allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
                               tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has an
        # empty set of tokens in the join attribute, then generate output
        # pairs joining the current rtable record with those records in
        # ltable with an empty set of tokens in the join attribute. These
        # ltable record ids are cached in l_empty_records list which was
        # constructed when building the position index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable[cand], r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
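# A minimal sketch of the kind of function `get_sim_function` presumably
# returns for a set similarity measure such as JACCARD (an assumption; the
# real helper lives elsewhere in the package). The join above calls it on the
# cached ltable tokens and the ordered rtable tokens.

def _jaccard(l_tokens, r_tokens):
    l_set, r_set = set(l_tokens), set(r_tokens)
    if not l_set and not r_set:
        return 1.0
    return len(l_set & r_set) / float(len(l_set | r_set))

print(_jaccard(['a', 'b', 'c'], ['b', 'c', 'd']))  # 0.5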
def get_pairs_with_missing_value(ltable, rtable,
                                 l_key_attr, r_key_attr,
                                 l_join_attr, r_join_attr,
                                 l_out_attrs=None, r_out_attrs=None,
                                 l_out_prefix='l_', r_out_prefix='r_',
                                 out_sim_score=False, show_progress=True):
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # find ltable records with missing value in l_join_attr
    ltable_missing = ltable[pd.isnull(ltable[l_join_attr])]

    # find ltable records which do not contain missing value in l_join_attr
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])]

    # find rtable records with missing value in r_join_attr
    rtable_missing = rtable[pd.isnull(rtable[r_join_attr])]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        print('Finding pairs with missing value...')
        prog_bar = pyprind.ProgBar(len(ltable_missing) + len(rtable_missing))

    # For each ltable record with missing value in l_join_attr,
    # output a pair corresponding to every record in rtable.
    for l_row in ltable_missing.itertuples(index=False):
        for r_row in rtable.itertuples(index=False):
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                 l_row, r_row, l_key_attr_index,
                                 r_key_attr_index, l_out_attrs_indices,
                                 r_out_attrs_indices)
            else:
                output_row = [l_row[l_key_attr_index],
                              r_row[r_key_attr_index]]
            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    # For each rtable record with missing value in r_join_attr,
    # output a pair corresponding to every record in ltable which
    # doesn't have a missing value in l_join_attr.
    for r_row in rtable_missing.itertuples(index=False):
        for l_row in ltable_not_missing.itertuples(index=False):
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                 l_row, r_row, l_key_attr_index,
                                 r_key_attr_index, l_out_attrs_indices,
                                 r_out_attrs_indices)
            else:
                output_row = [l_row[l_key_attr_index],
                              r_row[r_key_attr_index]]
            if out_sim_score:
                output_row.append(np.nan)
            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
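# A minimal sketch of the missing / not-missing split performed above with
# pd.isnull and pd.notnull (the frame and column names are illustrative).
import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'name': ['foo', None, 'bar']})
missing = df[pd.isnull(df['name'])]       # rows paired with every opposite row
not_missing = df[pd.notnull(df['name'])]
print(len(missing), len(not_missing))     # 1 2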
def _apply_matcher_split(candset,
                         candset_l_key_attr, candset_r_key_attr,
                         ltable, rtable,
                         l_key_attr, r_key_attr,
                         l_match_attr, r_match_attr,
                         tokenizer, sim_function,
                         threshold, comp_op, allow_missing,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix,
                         out_sim_score, show_progress, l_tokens, r_tokens):
    # find column indices of key attr, match attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_match_attr_index = l_columns.index(l_match_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, match attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_match_attr_index = r_columns.index(r_match_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_match_attr_index, remove_null=False)

    # Build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_match_attr_index, remove_null=False)

    # Find indices of l_key_attr and r_key_attr in candset
    candset_columns = list(candset.columns.values)
    candset_l_key_attr_index = candset_columns.index(candset_l_key_attr)
    candset_r_key_attr_index = candset_columns.index(candset_r_key_attr)

    comp_fn = COMP_OP_MAP[comp_op]
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(len(candset))

    tokenize_flag = False
    if tokenizer is not None:
        tokenize_flag = True
        use_cache = False
        # check if we have cached the tokens.
        if l_tokens is not None and r_tokens is not None:
            use_cache = True

    for candset_row in candset.itertuples(index=False):
        l_id = candset_row[candset_l_key_attr_index]
        r_id = candset_row[candset_r_key_attr_index]

        l_row = ltable_dict[l_id]
        r_row = rtable_dict[r_id]

        l_apply_col_value = l_row[l_match_attr_index]
        r_apply_col_value = r_row[r_match_attr_index]

        allow_pair = False
        # Check if one of the inputs is missing. If yes, check the
        # allow_missing flag. If it is True, then add the pair to the output.
        # Else, continue. If neither input is missing, then proceed to apply
        # the sim_function.
        if pd.isnull(l_apply_col_value) or pd.isnull(r_apply_col_value):
            if allow_missing:
                allow_pair = True
                # pd.np was removed in recent pandas versions; use numpy's
                # nan directly.
                sim_score = np.nan
            else:
                continue
        else:
            if tokenize_flag:
                # If we have cached the tokens, we use them directly. Else,
                # we tokenize the values.
if use_cache: l_apply_col_value = l_tokens[l_id] r_apply_col_value = r_tokens[r_id] else: l_apply_col_value = tokenizer.tokenize(l_apply_col_value) r_apply_col_value = tokenizer.tokenize(r_apply_col_value) sim_score = sim_function(l_apply_col_value, r_apply_col_value) allow_pair = comp_fn(sim_score, threshold) if allow_pair: if has_output_attributes: output_row = get_output_row_from_tables( l_row, r_row, l_key_attr_index, r_key_attr_index, l_out_attrs_indices, r_out_attrs_indices) output_row.insert(0, candset_row[0]) else: output_row = [candset_row[0], l_id, r_id] if out_sim_score: output_row.append(sim_score) output_rows.append(output_row) if show_progress: prog_bar.update() output_header = get_output_header_from_tables( l_key_attr, r_key_attr, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix) output_header.insert(0, '_id') if out_sim_score: output_header.append("_sim_score") # generate a dataframe from the list of output rows output_table = pd.DataFrame(output_rows, columns=output_header) return output_table
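# A minimal sketch of what COMP_OP_MAP presumably looks like (an assumption;
# the real mapping is defined elsewhere in the package): comparison-operator
# strings mapped to functions from the operator module, so that
# comp_fn(sim_score, threshold) above is a single call.
import operator

COMP_OP_MAP_SKETCH = {
    '>=': operator.ge,
    '>': operator.gt,
    '<=': operator.le,
    '<': operator.lt,
    '=': operator.eq,
    '!=': operator.ne,
}

comp_fn = COMP_OP_MAP_SKETCH['>=']
print(comp_fn(0.8, 0.7))  # True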
def _filter_tables_split(ltable, rtable, l_columns, r_columns, l_key_attr, r_key_attr, l_filter_attr, r_filter_attr, suffix_filter, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, show_progress): # find column indices of key attr, filter attr and output attrs in ltable l_key_attr_index = l_columns.index(l_key_attr) l_filter_attr_index = l_columns.index(l_filter_attr) l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs) # find column indices of key attr, filter attr and output attrs in rtable r_key_attr_index = r_columns.index(r_key_attr) r_filter_attr_index = r_columns.index(r_filter_attr) r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs) # generate token ordering using tokens in l_filter_attr and r_filter_attr token_ordering = gen_token_ordering_for_tables( [ltable, rtable], [l_filter_attr_index, r_filter_attr_index], suffix_filter.tokenizer, suffix_filter.sim_measure_type) # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures. handle_empty = (suffix_filter.allow_empty and suffix_filter.sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE']) output_rows = [] has_output_attributes = (l_out_attrs is not None or r_out_attrs is not None) if show_progress: prog_bar = pyprind.ProgBar(len(ltable)) for l_row in ltable: l_string = l_row[l_filter_attr_index] ltokens = suffix_filter.tokenizer.tokenize(l_string) ordered_ltokens = order_using_token_ordering(ltokens, token_ordering) l_num_tokens = len(ordered_ltokens) l_prefix_length = get_prefix_length(l_num_tokens, suffix_filter.sim_measure_type, suffix_filter.threshold, suffix_filter.tokenizer) l_suffix = ordered_ltokens[l_prefix_length:] for r_row in rtable: r_string = r_row[r_filter_attr_index] rtokens = suffix_filter.tokenizer.tokenize(r_string) ordered_rtokens = order_using_token_ordering(rtokens, token_ordering) r_num_tokens = len(ordered_rtokens) # If allow_empty flag is set, then add the pair to the output. if handle_empty and l_num_tokens == 0 and r_num_tokens == 0: if has_output_attributes: output_row = get_output_row_from_tables( l_row, r_row, l_key_attr_index, r_key_attr_index, l_out_attrs_indices, r_out_attrs_indices) else: output_row = [l_row[l_key_attr_index], r_row[r_key_attr_index]] output_rows.append(output_row) continue r_prefix_length = get_prefix_length(r_num_tokens, suffix_filter.sim_measure_type, suffix_filter.threshold, suffix_filter.tokenizer) if l_prefix_length <= 0 or r_prefix_length <= 0: continue if not suffix_filter._filter_suffix(l_suffix, ordered_rtokens[r_prefix_length:], l_prefix_length, r_prefix_length, l_num_tokens, r_num_tokens): if has_output_attributes: output_row = get_output_row_from_tables( l_row, r_row, l_key_attr_index, r_key_attr_index, l_out_attrs_indices, r_out_attrs_indices) else: output_row = [l_row[l_key_attr_index], r_row[r_key_attr_index]] output_rows.append(output_row) if show_progress: prog_bar.update() output_header = get_output_header_from_tables( l_key_attr, r_key_attr, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix) # generate a dataframe from the list of output rows output_table = pd.DataFrame(output_rows, columns=output_header) return output_table
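# A minimal sketch of the prefix-length rule behind `get_prefix_length` for
# JACCARD (an assumption about the helper; other measures use different
# formulas): if two token sets of sizes n and m satisfy jaccard >= t, their
# prefixes of length n - ceil(t * n) + 1 under the global token ordering must
# share at least one token, which is why only the suffixes beyond the prefix
# need the expensive check above.
from math import ceil

def _jaccard_prefix_length(num_tokens, threshold):
    if num_tokens == 0:
        return 0
    return int(num_tokens - ceil(threshold * num_tokens) + 1)

print(_jaccard_prefix_length(10, 0.8))  # 3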
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         size_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.
    handle_empty = (size_filter.allow_empty and
                    size_filter.sim_measure_type not in ['OVERLAP',
                                                         'EDIT_DISTANCE'])

    # Build size index over ltable
    size_index = SizeIndex(ltable, l_filter_attr_index, size_filter.tokenizer)
    # While building the index, we cache the record ids with an empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = size_index.build(handle_empty)
    l_empty_records = cached_data['empty_records']

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_filter_attr_index]
        r_num_tokens = len(size_filter.tokenizer.tokenize(r_string))

        # If allow_empty flag is set and the current rtable record has an
        # empty set of tokens in the filter attribute, then generate output
        # pairs joining the current rtable record with those records in
        # ltable with an empty set of tokens in the filter attribute. These
        # ltable record ids are cached in l_empty_records list which was
        # constructed when building the size index.
        if handle_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                output_rows.append(output_row)
            continue

        # probe size index and find candidates
        candidates = size_filter.find_candidates(r_num_tokens, size_index)

        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                 ltable[cand], r_row,
                                 l_key_attr_index, r_key_attr_index,
                                 l_out_attrs_indices, r_out_attrs_indices)
            else:
                output_row = [ltable[cand][l_key_attr_index],
                              r_row[r_key_attr_index]]
            output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
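# A minimal sketch of the size bounds a size filter checks for JACCARD (an
# assumption about the semantics of `find_candidates`; other measures use
# different bounds): if jaccard(A, B) >= t, then
# ceil(t * |A|) <= |B| <= floor(|A| / t), so only index entries whose token
# count falls inside that window can qualify as candidates.
from math import ceil, floor

def _jaccard_size_bounds(num_tokens, threshold):
    return (int(ceil(threshold * num_tokens)),
            int(floor(num_tokens / threshold)))

print(_jaccard_size_bounds(10, 0.8))  # (8, 12)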
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         overlap_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix,
                         out_sim_score, show_progress):
    # Find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # Find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable, l_filter_attr_index,
                                   overlap_filter.tokenizer)
    inverted_index.build(False)

    comp_fn = COMP_OP_MAP[overlap_filter.comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_filter_attr_index]
        r_filter_attr_tokens = overlap_filter.tokenizer.tokenize(r_string)

        # probe inverted index and find overlap of candidates
        candidate_overlap = overlap_filter.find_candidates(
                                r_filter_attr_tokens, inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            if comp_fn(overlap, overlap_filter.overlap_size):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[cand], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[cand][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                if out_sim_score:
                    output_row.append(overlap)
                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
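# A minimal sketch of the probe that `find_candidates` presumably performs
# against the inverted index (an assumption about its internals): each probe
# token's posting list of ltable record ids is scanned, and per-candidate
# overlap counts are accumulated.
from collections import defaultdict

def _find_candidates_sketch(probe_tokens, index):
    # index: token -> list of record ids containing that token
    overlap = defaultdict(int)
    for token in set(probe_tokens):
        for rec_id in index.get(token, []):
            overlap[rec_id] += 1
    return overlap

index = {'data': [0, 2], 'base': [0], 'bank': [1]}
print(dict(_find_candidates_sketch(['data', 'base'], index)))  # {0: 2, 2: 1}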
def _filter_tables_split(ltable, rtable,
                         l_columns, r_columns,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         position_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix, show_progress):
    # find column indices of key attr, filter attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_filter_attr_index, r_filter_attr_index],
                         position_filter.tokenizer,
                         position_filter.sim_measure_type)

    # ignore allow_empty flag for OVERLAP and EDIT_DISTANCE measures.
    handle_empty = (position_filter.allow_empty and
                    position_filter.sim_measure_type not in ['OVERLAP',
                                                             'EDIT_DISTANCE'])

    # Build position index on l_filter_attr
    position_index = PositionIndex(ltable, l_filter_attr_index,
                                   position_filter.tokenizer,
                                   position_filter.sim_measure_type,
                                   position_filter.threshold, token_ordering)
    # While building the index, we cache the record ids with an empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = position_index.build(handle_empty)
    l_empty_records = cached_data['empty_records']

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_filter_attr_index]
        r_filter_attr_tokens = position_filter.tokenizer.tokenize(r_string)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)

        # If allow_empty flag is set and the current rtable record has an
        # empty set of tokens in the filter attribute, then generate output
        # pairs joining the current rtable record with those records in
        # ltable with an empty set of tokens in the filter attribute. These
        # ltable record ids are cached in l_empty_records list which was
        # constructed when building the position index.
        if handle_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                output_rows.append(output_row)
            continue

        # obtain candidates by applying the position filter.
        candidate_overlap = position_filter.find_candidates(r_ordered_tokens,
                                                            position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[cand], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[cand][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
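# A minimal sketch of the global token ordering that
# `gen_token_ordering_for_tables` presumably computes (an assumption about
# the helper): tokens are ranked by ascending corpus frequency so that rare
# tokens land in the prefix, keeping posting lists short and candidate sets
# small.
from collections import Counter

def _gen_token_ordering_sketch(token_lists):
    freq = Counter(tok for tokens in token_lists for tok in tokens)
    # rarest first; break frequency ties deterministically by token value
    ranked = sorted(freq, key=lambda tok: (freq[tok], tok))
    return {tok: rank for rank, tok in enumerate(ranked)}

ordering = _gen_token_ordering_sketch([['a', 'b'], ['b', 'c'], ['b']])
print(sorted(['b', 'a', 'c'], key=ordering.get))  # ['a', 'c', 'b']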