# NOTE: this snippet omits its imports; it assumes at least pandas (as pd),
# the py_stringmatching tokenizer classes used below, and the library's
# get_sim_function helper are in scope.
def get_features(sim_measures=None, tokenizers=None):
    features = []
    if sim_measures is None:
        sim_measures = [
            'JACCARD',
            'COSINE',
            'DICE',
            'OVERLAP_COEFFICIENT',
            'EDIT_DISTANCE',
            'LEFT_LENGTH',
            'RIGHT_LENGTH',
            'LENGTH_SUM',
            'LENGTH_DIFF'
        ]
    if tokenizers is None:
        tokenizers = {
            'alph': AlphabeticTokenizer(return_set=True),
            'alph_num': AlphanumericTokenizer(return_set=True),
            'num': NumericTokenizer(return_set=True),
            'ws': WhitespaceTokenizer(return_set=True),
            'qg2': QgramTokenizer(qval=2, return_set=True),
            'qg3': QgramTokenizer(qval=3, return_set=True)
        }
    for sim_measure_type in sim_measures:
        if sim_measure_type in [
                'EDIT_DISTANCE', 'LEFT_LENGTH', 'RIGHT_LENGTH', 'LENGTH_SUM',
                'LENGTH_DIFF'
        ]:
            features.append(
                (sim_measure_type.lower(), 'none', sim_measure_type, None,
                 get_sim_function(sim_measure_type)))
            continue
        for tok_name in tokenizers.keys():
            features.append((sim_measure_type.lower() + '_' + tok_name,
                             tok_name, sim_measure_type, tokenizers[tok_name],
                             get_sim_function(sim_measure_type)))

    feature_table_header = [
        'feature_name', 'tokenizer_type', 'sim_measure_type', 'tokenizer',
        'sim_function'
    ]
    feature_table = pd.DataFrame(features, columns=feature_table_header)
    feature_table = feature_table.set_index('feature_name')

    return feature_table
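# A minimal, self-contained sketch of how a feature table like the one
# get_features() returns can be consumed. The whitespace tokenizer and
# Jaccard function below are illustrative stand-ins, not the
# py_stringmatching implementations.
import pandas as pd

def ws_tokenize(s):
    return set(s.split())

def jaccard(x, y):
    return 1.0 if not x and not y else len(x & y) / len(x | y)

features = [('jaccard_ws', 'ws', 'JACCARD', ws_tokenize, jaccard)]
feature_table = pd.DataFrame(
    features,
    columns=['feature_name', 'tokenizer_type', 'sim_measure_type',
             'tokenizer', 'sim_function']).set_index('feature_name')

for name, row in feature_table.iterrows():
    score = row['sim_function'](row['tokenizer']('data integration systems'),
                                row['tokenizer']('data integration'))
    print(name, round(score, 3))  # jaccard_ws 0.667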
    def test_invalid_candset(self):
        # passing a list instead of a candset dataframe should fail
        # apply_matcher's input validation.
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        apply_matcher([], DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                      DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable,
                      self.rtable, self.l_key_attr, self.r_key_attr,
                      self.l_join_attr, self.r_join_attr, tok, sim_func,
                      threshold)

    def test_invalid_tokenizer(self):
        # passing a similarity function in place of a tokenizer should fail
        # apply_matcher's input validation.
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        apply_matcher(pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID']),
                      DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                      DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable,
                      self.rtable, self.l_key_attr, self.r_key_attr,
                      self.l_join_attr, self.r_join_attr, sim_func, sim_func,
                      threshold)

    def test_invalid_rtable(self):
        # passing a list instead of a dataframe for rtable should fail
        # apply_matcher's input validation.
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        apply_matcher(pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID']),
                      DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                      DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable, [],
                      self.l_key_attr, self.r_key_attr, self.l_join_attr,
                      self.r_join_attr, tok, sim_func, threshold)
    def test_apply_matcher_with_join_attr_of_type_int(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='
        l_join_attr = 'A.zipcode'
        r_join_attr = 'B.zipcode'

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(
            lambda row: sim_func(tok.tokenize(str(row[l_join_attr])),
                                 tok.tokenize(str(row[r_join_attr]))),
            axis=1)

        comp_fn = COMP_OP_MAP[comp_op]
        # compute expected output pairs
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join(
                    (str(row[self.l_key_attr]), str(row[self.r_key_attr]))))

        # use overlap filter to obtain a candset.
        overlap_filter = OverlapFilter(tok, 1, comp_op)
        candset = overlap_filter.filter_tables(self.ltable, self.rtable,
                                               self.l_key_attr,
                                               self.r_key_attr, l_join_attr,
                                               r_join_attr)

        # apply a jaccard matcher to the candset
        output_candset = apply_matcher(
            candset, DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable, self.rtable,
            self.l_key_attr, self.r_key_attr, l_join_attr, r_join_attr, tok,
            sim_func, threshold)

        expected_output_attrs = [
            '_id', DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr, '_sim_score'
        ]

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join(
                (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                 str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
    def test_apply_matcher(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
                tok.tokenize(str(row[self.l_join_attr])),
                tok.tokenize(str(row[self.r_join_attr]))),
            axis=1)

        comp_fn = COMP_OP_MAP[comp_op]
        # compute expected output pairs
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                             str(row[self.r_key_attr]))))

        # use overlap filter to obtain a candset.
        overlap_filter = OverlapFilter(tok, 1, comp_op)
        candset = overlap_filter.filter_tables(self.ltable, self.rtable,
                                               self.l_key_attr,
                                               self.r_key_attr,
                                               self.l_join_attr,
                                               self.r_join_attr)

        # apply a jaccard matcher to the candset
        output_candset = apply_matcher(
            candset,
            DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr,
            self.ltable, self.rtable, self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr, tok, sim_func, threshold,
            comp_op, False,
            [self.l_join_attr], [self.r_join_attr], out_sim_score=True)

        expected_output_attrs = ['_id',
                                 DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                                 DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                                 DEFAULT_L_OUT_PREFIX + self.l_join_attr,
                                 DEFAULT_R_OUT_PREFIX + self.r_join_attr,
                                 '_sim_score']

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join((str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                                       str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
    def test_invalid_r_out_attr(self):
        # requesting an output attribute that does not exist in rtable
        # should fail apply_matcher's input validation.
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        apply_matcher(pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID']),
                      DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                      DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable,
                      self.rtable, self.l_key_attr, self.r_key_attr,
                      self.l_join_attr, self.r_join_attr, tok, sim_func,
                      threshold, r_out_attrs=['invalid_attr'])

    def test_empty_candset(self):
        # an empty candset should be handled gracefully.
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        empty_candset = pd.DataFrame(columns=[
            DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr
        ])
        apply_matcher(empty_candset, DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                      DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable,
                      self.rtable, self.l_key_attr, self.r_key_attr,
                      self.l_join_attr, self.r_join_attr, tok, sim_func,
                      threshold)
def _edit_dist_join_split(ltable, rtable,
                          l_key_attr, r_key_attr,
                          l_join_attr, r_join_attr,
                          tokenizer,
                          threshold,
                          l_out_attrs, r_out_attrs,
                          l_out_prefix, r_out_prefix,
                          out_sim_score):
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(),
                          rtable_dict.values()],
                         [l_join_attr_index,
                          r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of l_join_attr lengths
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = len(
            str(row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_dict.values(),
                               l_key_attr_index, l_join_attr_index,
                               tokenizer, sim_measure_type, threshold,
                               token_ordering)
    prefix_index.build()

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        r_len = len(r_string)
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        candidates = find_candidates_prefix_filter(
                         r_ordered_tokens, len(r_ordered_tokens),
                         prefix_filter, prefix_index) 
        for cand in candidates:
            if r_len - threshold <= l_join_attr_dict[cand] <= r_len + threshold:
                edit_dist = sim_fn(str(ltable_dict[cand][l_join_attr_index]),
                                   r_string)
                if edit_dist <= threshold:
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable_dict[cand], r_row,
                                         cand, r_id,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)
                    else:
                        output_row = [cand, r_id]
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)

        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
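# Why the candidate length check above is safe before computing the actual
# distance: Levenshtein distance is bounded below by the absolute length
# difference, so any pair with abs(len(l) - len(r)) > threshold can be
# pruned without running the DP. A minimal, self-contained sketch (the
# textbook DP below is illustrative, not the library's sim function):
def levenshtein(s, t):
    prev = list(range(len(t) + 1))
    for i, cs in enumerate(s, 1):
        curr = [i]
        for j, ct in enumerate(t, 1):
            curr.append(min(prev[j] + 1,                # deletion
                            curr[j - 1] + 1,            # insertion
                            prev[j - 1] + (cs != ct)))  # substitution
        prev = curr
    return prev[-1]

threshold = 2
for l, r in [('table', 'tables'), ('table', 'tabulations'), ('table', 'cable')]:
    if abs(len(l) - len(r)) > threshold:
        print(l, r, '-> pruned by the length filter')
    else:
        print(l, r, '->', levenshtein(l, r))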
    def test_apply_matcher_with_allow_missing(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(
            lambda row: sim_func(tok.tokenize(str(row[self.l_join_attr])),
                                 tok.tokenize(str(row[self.r_join_attr]))),
            axis=1)

        # compute expected output pairs
        comp_fn = COMP_OP_MAP[comp_op]
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join(
                    (str(row[self.l_key_attr]), str(row[self.r_key_attr]))))

        # find pairs that need to be included in output due to
        # the presence of missing value in one of the join attributes.
        missing_pairs = set()
        for l_idx, l_row in self.orig_ltable.iterrows():
            for r_idx, r_row in self.orig_rtable.iterrows():
                if (pd.isnull(l_row[self.l_join_attr])
                        or pd.isnull(r_row[self.r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                                str(r_row[self.r_key_attr]))))

        # add the pairs containing missing value to the set of expected pairs.
        expected_pairs = expected_pairs.union(missing_pairs)

        # use overlap filter to obtain a candset with allow_missing set to True.
        overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
        candset = overlap_filter.filter_tables(
            self.orig_ltable, self.orig_rtable, self.l_key_attr,
            self.r_key_attr, self.l_join_attr, self.r_join_attr)

        # apply a jaccard matcher to the candset with allow_missing set to True.
        output_candset = apply_matcher(candset,
                                       DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                                       DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                                       self.orig_ltable,
                                       self.orig_rtable,
                                       self.l_key_attr,
                                       self.r_key_attr,
                                       self.l_join_attr,
                                       self.r_join_attr,
                                       tok,
                                       sim_func,
                                       threshold,
                                       comp_op,
                                       True,
                                       out_sim_score=True)

        expected_output_attrs = [
            '_id', DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr, '_sim_score'
        ]

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join(
                (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                 str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
def _edit_distance_join_split(ltable_list, rtable_list,
                              l_columns, r_columns,
                              l_key_attr, r_key_attr,
                              l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op,
                              l_out_attrs, r_out_attrs,
                              l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Perform edit distance join for a split of ltable and rtable"""
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_list, rtable_list],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # cache l_join_attr lengths
    l_join_attr_list = []
    for row in ltable_list:
        l_join_attr_list.append(len(row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_list, l_join_attr_index,
                               tokenizer, sim_measure_type, threshold,
                               token_ordering)
    prefix_index.build(False)

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)

    comp_fn = COMP_OP_MAP[comp_op]
    sim_fn = get_sim_function(sim_measure_type)

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]
        r_len = len(r_string)

        r_ordered_tokens = order_using_token_ordering(
                tokenizer.tokenize(r_string), token_ordering)

        # obtain candidates by applying prefix filter. 
        candidates = prefix_filter.find_candidates(r_ordered_tokens,
                                                   prefix_index)

        for cand in candidates:
            if r_len - threshold <= l_join_attr_list[cand] <= r_len + threshold:
                l_row = ltable_list[cand]

                # compute the actual edit distance                           
                edit_dist = sim_fn(l_row[l_join_attr_index], r_string)

                if comp_fn(edit_dist, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         l_row, r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [l_row[l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the edit distance 
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(edit_dist)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
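# The idea behind PrefixIndex/PrefixFilter used above, as a self-contained
# sketch: order every token set by one global ordering (rarest token first)
# and compare only short prefixes. If two sets must share at least o tokens
# to qualify and their prefixes of length len(set) - o + 1 are disjoint,
# the pair can be pruned: by the pigeonhole principle the suffixes alone
# cannot contain o common tokens.
from collections import Counter

records = [{'a', 'b', 'c', 'd'}, {'b', 'c', 'd', 'e'}, {'x', 'y', 'z', 'a'}]
freq = Counter(tok for rec in records for tok in rec)

def ordered(rec):
    return sorted(rec, key=lambda t: (freq[t], t))  # rarest tokens first

def prefix(rec, o):
    return set(ordered(rec)[:len(rec) - o + 1])

o = 3  # require at least 3 common tokens
x, y, z = records
print(bool(prefix(x, o) & prefix(y, o)))  # True: kept (actual overlap is 3)
print(bool(prefix(x, o) & prefix(z, o)))  # False: pruned (actual overlap is 1)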
def set_sim_join(ltable, rtable, l_columns, r_columns, l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr, tokenizer, sim_measure_type,
                 threshold, comp_op, allow_empty, l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix, out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable"""

    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable, rtable], [l_join_attr_index, r_join_attr_index], tokenizer,
        sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index, tokenizer,
                                   sim_measure_type, threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in
    # l_join_attr multiple times when we need to compute the similarity measure.
    # Further we cache the empty record ids to handle the allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))
    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining
        # the current rtable record with those records in ltable with empty set
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the position
        # index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable[l_id], r_row, l_key_attr_index,
                        r_key_attr_index, l_out_attrs_indices,
                        r_out_attrs_indices)
                else:
                    output_row = [
                        ltable[l_id][l_key_attr_index], r_row[r_key_attr_index]
                    ]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable[cand], r_row, l_key_attr_index,
                            r_key_attr_index, l_out_attrs_indices,
                            r_out_attrs_indices)
                    else:
                        output_row = [
                            ltable[cand][l_key_attr_index],
                            r_row[r_key_attr_index]
                        ]

                    # if out_sim_score flag is set, append the similarity score
                    # to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()
    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
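# How a worker like set_sim_join is reached in practice: callers go through
# the library's public join functions. A hedged usage sketch, assuming the
# py_stringsimjoin top-level jaccard_join API and the py_stringmatching
# tokenizers are installed; the toy tables are made up for illustration.
import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as ssj

A = pd.DataFrame({'A.ID': [1, 2], 'A.name': ['data science', 'data systems']})
B = pd.DataFrame({'B.ID': [1, 2], 'B.name': ['data science', 'web systems']})

ws = sm.WhitespaceTokenizer(return_set=True)
out = ssj.jaccard_join(A, B, 'A.ID', 'B.ID', 'A.name', 'B.name',
                       ws, 0.3, out_sim_score=True)
print(out)  # candidate pairs with whitespace-token Jaccard >= 0.3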
def test_valid_join(scenario, sim_measure_type, args):
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    # generate cartesian product to be used as candset
    ltable['tmp_join_key'] = 1
    rtable['tmp_join_key'] = 1
    cartprod = pd.merge(ltable[[l_key_attr, l_join_attr, 'tmp_join_key']],
                        rtable[[r_key_attr, r_join_attr, 'tmp_join_key']],
                        on='tmp_join_key').drop('tmp_join_key', axis=1)
    ltable.drop('tmp_join_key', axis=1, inplace=True)
    rtable.drop('tmp_join_key', axis=1, inplace=True)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(
            tokenize(str(row[l_join_attr]), args[0], sim_measure_type),
            tokenize(str(row[r_join_attr]), args[0], sim_measure_type)),
        axis=1)

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if float(row['sim_score']) >= args[1]:
            expected_pairs.add(','.join(
                (str(row[l_key_attr]), str(row[r_key_attr]))))

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable, l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr, *args)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 4:
        l_out_prefix = args[4]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            for attr in args[2]:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_prefix in args.
    if len(args) > 5:
        r_out_prefix = args[5]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for r_out_attrs in args.
    if len(args) > 3:
        if args[3]:
            for attr in args[3]:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args.
    if len(args) > 6:
        if args[6]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
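# An aside on the tmp_join_key idiom above: it predates native cross joins.
# With pandas >= 1.2 (an assumption about the environment, not code from
# the library under test) the same cartesian product can be built directly:
import pandas as pd

ltable = pd.DataFrame({'id': [1, 2], 'name': ['ann', 'bob']})
rtable = pd.DataFrame({'id': [10, 20], 'name': ['ann', 'rob']})
cartprod = pd.merge(ltable, rtable, how='cross', suffixes=('_l', '_r'))
print(len(cartprod))  # 4 = 2 x 2 pairs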
    def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                           allow_empty, allow_missing, args):
        suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold,
                                     allow_empty, allow_missing)

        sim_fn = get_sim_function(sim_measure_type)
        # compute the join output pairs
        join_output_pairs = set()
        for l_idx, l_row in args[0].iterrows():
            for r_idx, r_row in args[1].iterrows():
                # if allow_missing is set to True, then add pairs containing
                # missing value to the join output.
                if pd.isnull(l_row[args[4]]) or pd.isnull(r_row[args[5]]):
                    if allow_missing:
                        join_output_pairs.add(','.join(
                            (str(l_row[args[2]]), str(r_row[args[3]]))))
                    continue

                if sim_measure_type == 'EDIT_DISTANCE':
                    l_join_val = str(l_row[args[4]])
                    r_join_val = str(r_row[args[5]])
                    comp_fn = COMP_OP_MAP['<=']
                else:
                    l_join_val = tokenizer.tokenize(str(l_row[args[4]]))
                    r_join_val = tokenizer.tokenize(str(r_row[args[5]]))
                    comp_fn = COMP_OP_MAP['>=']

                if (len(l_join_val) == 0 and len(r_join_val) == 0 and
                        sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE']):
                    if allow_empty:
                        join_output_pairs.add(','.join(
                            (str(l_row[args[2]]), str(r_row[args[3]]))))
                    continue

                # if both attributes are not missing and not empty, then check
                # if the pair satisfies the join condition. If yes, then add it
                # to the join output.
                if comp_fn(sim_fn(l_join_val, r_join_val), threshold):
                    join_output_pairs.add(','.join(
                        (str(l_row[args[2]]), str(r_row[args[3]]))))

        actual_candset = suffix_filter.filter_tables(*args)

        expected_output_attrs = ['_id']
        l_out_prefix = self.default_l_out_prefix
        r_out_prefix = self.default_r_out_prefix

        # Check for l_out_prefix in args.
        if len(args) > 8:
            l_out_prefix = args[8]
        expected_output_attrs.append(l_out_prefix + args[2])

        # Check for r_out_prefix in args.
        if len(args) > 9:
            r_out_prefix = args[9]
        expected_output_attrs.append(r_out_prefix + args[3])

        # Check for l_out_attrs in args.
        if len(args) > 6:
            if args[6]:
                l_out_attrs = remove_redundant_attrs(args[6], args[2])
                for attr in l_out_attrs:
                    expected_output_attrs.append(l_out_prefix + attr)

        # Check for r_out_attrs in args.
        if len(args) > 7:
            if args[7]:
                r_out_attrs = remove_redundant_attrs(args[7], args[3])
                for attr in r_out_attrs:
                    expected_output_attrs.append(r_out_prefix + attr)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_candset.columns.values),
                          expected_output_attrs)

        actual_pairs = set()
        for idx, row in actual_candset.iterrows():
            actual_pairs.add(','.join((str(int(row[l_out_prefix + args[2]])),
                                       str(int(row[r_out_prefix + args[3]])))))

        # verify whether all the join output pairs are
        # present in the actual output pairs
        common_pairs = actual_pairs.intersection(join_output_pairs)
        assert_equal(len(common_pairs), len(join_output_pairs))
def _set_sim_join_split(ltable, rtable,
                        l_key_attr, r_key_attr,
                        l_join_attr, r_join_attr,
                        tokenizer,
                        sim_measure_type,
                        threshold,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix,
                        out_sim_score):
    """Perform set similarity join for a split of ltable and rtable"""

    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(),
                          rtable_dict.values()],
                         [l_join_attr_index,
                          r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of tokenized l_join_attr
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer, sim_measure_type),
            token_ordering)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            sim_measure_type,
                                            threshold, tokenizer)     

        candidate_overlap = find_candidates_position_filter(
                                r_ordered_tokens, r_num_tokens, r_prefix_length,
                                pos_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(
                                      l_num_tokens,
                                      sim_measure_type,
                                      threshold, tokenizer)
                if not suffix_filter._filter_suffix(
                           l_ordered_tokens[l_prefix_length:],
                           r_ordered_tokens[r_prefix_length:],
                           l_prefix_length,
                           r_prefix_length,
                           l_num_tokens, r_num_tokens):
                    sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                    if sim_score >= threshold:
                        if has_output_attributes:
                            output_row = get_output_row_from_tables(
                                             ltable_dict[cand], r_row,
                                             cand, r_id,
                                             l_out_attrs_indices,
                                             r_out_attrs_indices)
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
                        else:
                            output_row = [cand, r_id]
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
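# Where the prefix length used by get_prefix_length comes from, for the
# Jaccard case: if jaccard(x, y) >= t then |x ∩ y| >= t * |x ∪ y| >= t * |x|,
# so x must contribute at least ceil(t * |x|) shared tokens, and its ordered
# prefix of length |x| - ceil(t * |x|) + 1 must contain at least one of
# them. A minimal sketch of that formula:
from math import ceil

def jaccard_prefix_length(num_tokens, threshold):
    return num_tokens - int(ceil(threshold * num_tokens)) + 1

for n in (4, 10):
    print(n, jaccard_prefix_length(n, 0.8))  # 4 -> 1, 10 -> 3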
def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 4 and args[4]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr])
                        or pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join(
                        (str(l_row[l_key_attr]), str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    if len(args) > 3 and (not args[3]):
        ltable_not_missing = ltable_not_missing[ltable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[l_join_attr]))), 1) > 0]
        rtable_not_missing = rtable_not_missing[rtable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[r_join_attr]))), 1) > 0]

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', axis=1)
    ltable_not_missing.drop('tmp_join_key', axis=1, inplace=True)
    rtable_not_missing.drop('tmp_join_key', axis=1, inplace=True)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: round(
            sim_func(args[0].tokenize(str(row[l_join_attr])),
                     args[0].tokenize(str(row[r_join_attr]))), 4),
        axis=1)

    comp_fn = COMP_OP_MAP[DEFAULT_COMP_OP]
    # Check for comp_op in args.
    if len(args) > 2:
        comp_fn = COMP_OP_MAP[args[2]]

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), args[1]):
            expected_pairs.add(','.join(
                (str(row[l_key_attr]), str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = args[0].get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable, l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr, *args)

    assert_equal(args[0].get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 7:
        l_out_prefix = args[7]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 8:
        r_out_prefix = args[8]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 5:
        if args[5]:
            l_out_attrs = remove_redundant_attrs(args[5], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            r_out_attrs = remove_redundant_attrs(args[6], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args.
    if len(args) > 9:
        if args[9]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
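# COMP_OP_MAP, used throughout these tests, is plausibly just a mapping
# from comparison-operator strings to functions. A minimal stand-in built
# on the operator module (an assumption about its contents, not the
# library's actual definition):
import operator

COMP_OP_MAP = {'<': operator.lt, '<=': operator.le,
               '>': operator.gt, '>=': operator.ge,
               '=': operator.eq, '!=': operator.ne}

comp_fn = COMP_OP_MAP['>=']
print(comp_fn(0.42, 0.3))  # True: this pair satisfies the threshold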
def set_sim_join(ltable, rtable,
                 l_columns, r_columns,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op,
                 allow_empty,
                 l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable"""

    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in 
    # l_join_attr multiple times when we need to compute the similarity measure.
    # Further we cache the empty record ids to handle the allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))
    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
                tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining 
        # the current rtable record with those records in ltable with empty set 
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the position
        # index.
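        # (Note: two records that both have empty token sets are treated as
        # an exact match here; each such pair is emitted with a similarity
        # score of 1.0 below.)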
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.            
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable[cand], r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity score    
                    # to the output record.  
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()
    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
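# A minimal usage sketch of set_sim_join on a single split (illustrative
# only; the toy dataframes and attribute names below are assumptions, and
# in the library this function is normally driven by a higher-level join
# wrapper rather than called directly):
#
#     import pandas as pd
#     from py_stringmatching import WhitespaceTokenizer
#
#     A = pd.DataFrame({'id': [1, 2], 'name': ['data science', 'big data']})
#     B = pd.DataFrame({'id': [3, 4], 'name': ['data analysis', 'big apple']})
#     tok = WhitespaceTokenizer(return_set=True)
#
#     pairs = set_sim_join(A.values.tolist(), B.values.tolist(),
#                          list(A.columns), list(B.columns),
#                          'id', 'id', 'name', 'name',
#                          tok, 'JACCARD', 0.3, '>=', True,
#                          None, None, 'l_', 'r_', True, False)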
def test_valid_join(scenario,
                    tok,
                    threshold,
                    comp_op=DEFAULT_COMP_OP,
                    args=(),
                    convert_to_str=False):
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr])
                        or pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join(
                        (str(l_row[l_key_attr]), str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', axis=1)
    ltable_not_missing.drop('tmp_join_key', axis=1, inplace=True)
    rtable_not_missing.drop('tmp_join_key', axis=1, inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))

        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
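        # For example, with 2-gram tokens, 'abcd' and 'badc' are within edit
        # distance 3 of each other yet share no common 2-gram, so a
        # filter-based join would miss this pair even if the threshold
        # admitted it.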
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join(
                    (str(row[l_key_attr]), str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = edit_distance_join(ltable,
                                        rtable,
                                        l_key_attr,
                                        r_key_attr,
                                        l_join_attr,
                                        r_join_attr,
                                        threshold,
                                        comp_op,
                                        *args,
                                        tokenizer=tok)

    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args.
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP, args=(),
                    convert_to_str=False, data_limit=100000,
                    temp_dir=os.getcwd(),
                    output_file_path=default_output_file_path):
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:                                                          
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)              
        dataframe_column_to_str(rtable, r_join_attr, inplace=True) 

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                    pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', axis=1)
    ltable_not_missing.drop('tmp_join_key', axis=1, inplace=True)
    rtable_not_missing.drop('tmp_join_key', axis=1, inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))

        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()
    
    # Remove any previously existing output file.
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    # Use the disk-based join function to process the input data. It returns
    # a boolean indicating success.
    is_success = disk_edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, data_limit,
                                         comp_op, *args,
                                         tokenizer=tok, temp_dir=temp_dir,
                                         output_file_path=output_file_path)
    # Use the in-memory edit distance join to obtain a dataframe to compare
    # against.
    no_disk_candset = edit_distance_join(ltable, rtable,
                                        l_key_attr, r_key_attr,
                                        l_join_attr, r_join_attr,
                                        threshold, comp_op,
                                        *args, tokenizer=tok)
    # Drop the '_id' column to make the schemas consistent for comparison.
    if '_id' in no_disk_candset:
        del no_disk_candset['_id']

    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = []
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args. 
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # Verify that the output file was created.
    assert_equal(True, os.path.exists(output_file_path))

    # load the output table produced by the disk-based join.
    actual_candset = pd.read_csv(output_file_path)

    # verify whether the output tables have the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)
    assert_list_equal(list(no_disk_candset.columns.values),
                      list(actual_candset.columns.values))

    actual_pairs = set()
    no_disk_pairs = set()

    # Create sets of key pairs for comparing the output tuples.
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    for idx, row in no_disk_candset.iterrows():
        no_disk_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                    str(row[r_out_prefix + r_key_attr]))))
   
    # Verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(expected_pairs), len(no_disk_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    common_pairs_no_disk = no_disk_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
    assert_equal(len(common_pairs_no_disk), len(expected_pairs))
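# A hypothetical invocation of the disk-based test helper above (the paths,
# attribute names and data limit are illustrative only, not from the test
# suite):
#
#     scenario = (('data/table_A.csv', 'A.ID', 'A.name'),
#                 ('data/table_B.csv', 'B.ID', 'B.name'))
#     test_valid_join(scenario, QgramTokenizer(qval=2, return_set=True),
#                     threshold=3, comp_op='<=', data_limit=1000)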
def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 4 and args[4]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows(): 
                if (pd.isnull(l_row[l_join_attr]) or
                    pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new dataframes
    # consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # if the allow_empty flag is not set, remove rows whose join attribute
    # tokenizes to an empty set of tokens.
    if len(args) > 3 and (not args[3]):
        ltable_not_missing = ltable_not_missing[ltable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[l_join_attr]))),
            axis=1) > 0]
        rtable_not_missing = rtable_not_missing[rtable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[r_join_attr]))),
            axis=1) > 0]

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', axis=1)
    ltable_not_missing.drop('tmp_join_key', axis=1, inplace=True)
    rtable_not_missing.drop('tmp_join_key', axis=1, inplace=True)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: round(sim_func(args[0].tokenize(str(row[l_join_attr])),
                                   args[0].tokenize(str(row[r_join_attr]))),
                          4),
        axis=1)
   
    comp_fn = COMP_OP_MAP[DEFAULT_COMP_OP]
    # Check for comp_op in args.
    if len(args) > 2:
        comp_fn = COMP_OP_MAP[args[2]]

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), args[1]):
            expected_pairs.add(','.join((str(row[l_key_attr]),
                                         str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = args[0].get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable,
                             l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr,
                             *args)

    assert_equal(args[0].get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 7:
        l_out_prefix = args[7]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 8:
        r_out_prefix = args[8]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 5:
        if args[5]:
            l_out_attrs = remove_redundant_attrs(args[5], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            r_out_attrs = remove_redundant_attrs(args[6], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args. 
    if len(args) > 9:
        if args[9]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))
   
    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
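# A hypothetical invocation of the helper above (paths and attribute names
# are illustrative; the args tuple follows the join functions' positional
# parameter order documented earlier):
#
#     tok = WhitespaceTokenizer(return_set=True)
#     scenario = (('data/table_A.csv', 'A.ID', 'A.name'),
#                 ('data/table_B.csv', 'B.ID', 'B.name'))
#     test_valid_join(scenario, 'JACCARD',
#                     (tok, 0.7, '>=', True, False,
#                      ['A.name'], ['B.name'], 'l_', 'r_', True))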
    def test_apply_matcher_with_allow_missing(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(
            lambda row: sim_func(tok.tokenize(str(row[self.l_join_attr])),
                                 tok.tokenize(str(row[self.r_join_attr]))),
            axis=1)

        # compute expected output pairs
        comp_fn = COMP_OP_MAP[comp_op]
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                             str(row[self.r_key_attr]))))

        # find pairs that need to be included in output due to
        # the presence of missing value in one of the join attributes.
        missing_pairs = set()
        for l_idx, l_row in self.orig_ltable.iterrows():
            for r_idx, r_row in self.orig_rtable.iterrows():
                if (pd.isnull(l_row[self.l_join_attr]) or
                    pd.isnull(r_row[self.r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                                str(r_row[self.r_key_attr]))))

        # add the pairs containing missing value to the set of expected pairs.
        expected_pairs = expected_pairs.union(missing_pairs)

        # use overlap filter to obtain a candset with allow_missing set to True. 
        overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
        candset = overlap_filter.filter_tables(
            self.orig_ltable, self.orig_rtable,
            self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr)

        # apply a jaccard matcher to the candset with allow_missing set to True.
        output_candset = apply_matcher(candset,
            DEFAULT_L_OUT_PREFIX+self.l_key_attr, DEFAULT_R_OUT_PREFIX+self.r_key_attr,
            self.orig_ltable, self.orig_rtable, self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr, tok, sim_func, threshold,
            comp_op, True, out_sim_score=True)

        expected_output_attrs = ['_id',
                                 DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                                 DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                                 '_sim_score']

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join((str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                                       str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))