def test_overlap_join_using_tokenizer_with_return_set_false(self):
     A = pd.DataFrame([{'id':1, 'attr':'hello'}])
     B = pd.DataFrame([{'id':1, 'attr':'he ll'}])
     qg2_tok = QgramTokenizer(2)
     assert_equal(qg2_tok.get_return_set(), False)
     c = overlap_join(A, B, 'id', 'id', 'attr', 'attr', qg2_tok, 1)
     assert_equal(len(c), 1)
     assert_equal(qg2_tok.get_return_set(), False)        
    def test_apply_matcher_with_join_attr_of_type_int(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='
        l_join_attr = 'A.zipcode'
        r_join_attr = 'B.zipcode'

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(
            lambda row: sim_func(tok.tokenize(str(row[l_join_attr])),
                                 tok.tokenize(str(row[r_join_attr]))),
            axis=1)

        comp_fn = COMP_OP_MAP[comp_op]
        # compute expected output pairs
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join(
                    (str(row[self.l_key_attr]), str(row[self.r_key_attr]))))

        # use overlap filter to obtain a candset.
        overlap_filter = OverlapFilter(tok, 1, comp_op)
        candset = overlap_filter.filter_tables(self.ltable, self.rtable,
                                               self.l_key_attr,
                                               self.r_key_attr, l_join_attr,
                                               r_join_attr)

        # apply a jaccard matcher to the candset
        output_candset = apply_matcher(
            candset, DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable, self.rtable,
            self.l_key_attr, self.r_key_attr, l_join_attr, r_join_attr, tok,
            sim_func, threshold)

        expected_output_attrs = [
            '_id', DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr, '_sim_score'
        ]

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join(
                (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                 str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
    def test_apply_matcher(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
                tok.tokenize(str(row[self.l_join_attr])),
                tok.tokenize(str(row[self.r_join_attr]))),
            axis=1)

        comp_fn = COMP_OP_MAP[comp_op]
        # compute expected output pairs
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                             str(row[self.r_key_attr]))))

        # use overlap filter to obtain a candset.
        overlap_filter = OverlapFilter(tok, 1, comp_op)
        candset = overlap_filter.filter_tables(self.ltable, self.rtable,
                              self.l_key_attr, self.r_key_attr,
                              self.l_join_attr, self.r_join_attr)

        # apply a jaccard matcher to the candset
        output_candset = apply_matcher(candset,
            DEFAULT_L_OUT_PREFIX+self.l_key_attr, DEFAULT_R_OUT_PREFIX+self.r_key_attr,
            self.ltable, self.rtable, self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr, tok, sim_func, threshold,
            comp_op, False,
            [self.l_join_attr], [self.r_join_attr], out_sim_score=True)

        expected_output_attrs=['_id',
                               DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                               DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                               DEFAULT_L_OUT_PREFIX + self.l_join_attr,
                               DEFAULT_R_OUT_PREFIX + self.r_join_attr,
                               '_sim_score']

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join((str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                                       str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
Example 5
def get_features(sim_measures=None, tokenizers=None):
    features = []
    ws_tok = WhitespaceTokenizer(return_set=True)
    if sim_measures is None:
        sim_measures = [
            'JACCARD',
            'COSINE',
            'DICE',
            'OVERLAP_COEFFICIENT',
            'EDIT_DISTANCE',
            'LEFT_LENGTH',
            'RIGHT_LENGTH',
            'LENGTH_SUM',
            'LENGTH_DIFF'
        ]
    if tokenizers is None:
        tokenizers = {
            'alph': AlphabeticTokenizer(return_set=True),
            'alph_num': AlphanumericTokenizer(return_set=True),
            'num': NumericTokenizer(return_set=True),
            'ws': WhitespaceTokenizer(return_set=True),
            'qg2': QgramTokenizer(qval=2, return_set=True),
            'qg3': QgramTokenizer(qval=3, return_set=True)
        }
    for sim_measure_type in sim_measures:
        if sim_measure_type in [
                'EDIT_DISTANCE', 'LEFT_LENGTH', 'RIGHT_LENGTH', 'LENGTH_SUM',
                'LENGTH_DIFF'
        ]:
            features.append(
                (sim_measure_type.lower(), 'none', sim_measure_type, None,
                 get_sim_function(sim_measure_type)))
            continue
        for tok_name in tokenizers.keys():
            #            if sim_measure_type == 'COSINE' and tok_name == 'qg3':
            #                continue
            features.append((sim_measure_type.lower() + '_' + tok_name,
                             tok_name, sim_measure_type, tokenizers[tok_name],
                             get_sim_function(sim_measure_type)))

    feature_table_header = [
        'feature_name', 'tokenizer_type', 'sim_measure_type', 'tokenizer',
        'sim_function'
    ]
    feature_table = pd.DataFrame(features, columns=feature_table_header)
    feature_table = feature_table.set_index('feature_name')

    return feature_table
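A quick way to sanity-check the feature table built by get_features is to call it with a small subset of measures and tokenizers and inspect the rows. This is a minimal sketch; it assumes the same imports used above (pandas, the py_stringmatching tokenizers, get_sim_function) are in scope.

small_ft = get_features(
    sim_measures=['JACCARD', 'EDIT_DISTANCE'],
    tokenizers={'ws': WhitespaceTokenizer(return_set=True),
                'qg2': QgramTokenizer(qval=2, return_set=True)})
# Expect one row per (measure, tokenizer) pair for JACCARD and a single
# tokenizer-free row for EDIT_DISTANCE, i.e. three rows in total.
print(small_ft[['tokenizer_type', 'sim_measure_type']])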
def edit_distance_join(ltable,
                       rtable,
                       l_key_attr,
                       r_key_attr,
                       l_join_attr,
                       r_join_attr,
                       threshold,
                       comp_op='<=',
                       allow_missing=False,
                       l_out_attrs=None,
                       r_out_attrs=None,
                       l_out_prefix='l_',
                       r_out_prefix='r_',
                       out_sim_score=True,
                       n_jobs=1,
                       show_progress=True,
                       tokenizer=QgramTokenizer(qval=2)):
    from py_stringsimjoin import __use_cython__
    if __use_cython__:
        from py_stringsimjoin.join.edit_distance_join_cy import edit_distance_join_cy
        return edit_distance_join_cy(ltable, rtable, l_key_attr, r_key_attr,
                                     l_join_attr, r_join_attr, threshold,
                                     comp_op, allow_missing, l_out_attrs,
                                     r_out_attrs, l_out_prefix, r_out_prefix,
                                     out_sim_score, n_jobs, show_progress,
                                     tokenizer)
    else:
        from py_stringsimjoin.join.edit_distance_join_py import edit_distance_join_py
        return edit_distance_join_py(ltable, rtable, l_key_attr, r_key_attr,
                                     l_join_attr, r_join_attr, threshold,
                                     comp_op, allow_missing, l_out_attrs,
                                     r_out_attrs, l_out_prefix, r_out_prefix,
                                     out_sim_score, n_jobs, show_progress,
                                     tokenizer)
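For context, a tiny end-to-end call of the edit_distance_join wrapper above could look like the following. This is a minimal sketch, assuming py_stringsimjoin and pandas are installed; the tables and column names are made up for illustration.

import pandas as pd
import py_stringsimjoin as ssj

A = pd.DataFrame([{'id': 1, 'name': 'data'}, {'id': 2, 'name': 'dagger'}])
B = pd.DataFrame([{'id': 1, 'name': 'date'}, {'id': 2, 'name': 'dagger'}])

# find pairs whose names are within edit distance 1 of each other
pairs = ssj.edit_distance_join(A, B, 'id', 'id', 'name', 'name', 1,
                               l_out_attrs=['name'], r_out_attrs=['name'])
print(pairs)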
    def test_candset_with_join_attr_of_type_int(self):
        A = pd.DataFrame([{'l_id': 1, 'l_attr':1990},
                          {'l_id': 2, 'l_attr':2000},
                          {'l_id': 3, 'l_attr':0},
                          {'l_id': 4, 'l_attr':-1},
                          {'l_id': 5, 'l_attr':1986}])
        B = pd.DataFrame([{'r_id': 1, 'r_attr':2001},
                          {'r_id': 2, 'r_attr':1992},
                          {'r_id': 3, 'r_attr':1886},
                          {'r_id': 4, 'r_attr':2007},
                          {'r_id': 5, 'r_attr':2012}])

        dataframe_column_to_str(A, 'l_attr', inplace=True)              
        dataframe_column_to_str(B, 'r_attr', inplace=True)  

        A['tmp_join_key'] = 1
        B['tmp_join_key'] = 1
        C = pd.merge(A[['l_id', 'tmp_join_key']],
                     B[['r_id', 'tmp_join_key']],
                     on='tmp_join_key').drop('tmp_join_key', axis=1)

        qg2_tok = QgramTokenizer(2, return_set=True)
        expected_pairs = set(['1,2', '1,3', '2,1', '2,4', '2,5',
                              '4,1', '5,2', '5,3'])
        self.test_filter_candset(qg2_tok, 1, '>=', False,
                                 (C, 'l_id', 'r_id',
                                  A, B, 'l_id', 'r_id',
                                  'l_attr', 'r_attr'),
                                 expected_pairs)
Example 8
    def apply_filterable_rule(self, rule_name, l_df, r_df, l_key, r_key,
                              l_output_attrs, r_output_attrs, l_output_prefix,
                              r_output_prefix, verbose, show_progress,
                              n_chunks):
        candset = None
        conjunct_list = self.rule_str[rule_name]
        for conjunct in conjunct_list:
            is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, th = parse_conjunct(
                conjunct, self.rule_ft[rule_name])

            if l_tok == 'dlm_dc0':
                tokenizer = WhitespaceTokenizer(return_set=True)
            elif l_tok == 'qgm_3':
                tokenizer = QgramTokenizer(qval=3, return_set=True)

            if sim_fn == 'jaccard':
                join_fn = ssj.jaccard_join
            elif sim_fn == 'cosine':
                join_fn = ssj.cosine_join
            elif sim_fn == 'dice':
                join_fn = ssj.dice_join
            elif sim_fn == 'overlap_coeff':
                join_fn = ssj.overlap_coefficient_join
            elif sim_fn == 'lev_dist':
                join_fn = ssj.edit_distance_join

            if join_fn == ssj.edit_distance_join:
                comp_op = '<='
                if op == '>=':
                    comp_op = '<'
            else:
                comp_op = '>='
                if op == '<=':
                    comp_op = '>'

            ssj.dataframe_column_to_str(l_df, l_attr, inplace=True)
            ssj.dataframe_column_to_str(r_df, r_attr, inplace=True)

            if join_fn == ssj.edit_distance_join:
                c_df = join_fn(l_df, r_df, l_key, r_key, l_attr, r_attr,
                               float(th), comp_op, True, l_output_attrs,
                               r_output_attrs, l_output_prefix,
                               r_output_prefix, False, n_chunks, show_progress)
            else:
                c_df = join_fn(l_df, r_df,
                               l_key, r_key, l_attr, r_attr, tokenizer,
                               float(th), comp_op, True, True, l_output_attrs,
                               r_output_attrs, l_output_prefix,
                               r_output_prefix, False, n_chunks, show_progress)
            if candset is not None:
                # union the candset of this conjunct with the existing candset
                candset = pd.concat([candset, c_df]).drop_duplicates(
                    [l_output_prefix + l_key,
                     r_output_prefix + r_key]).reset_index(drop=True)
            else:
                # candset from the first conjunct of the rule
                candset = c_df
        return candset
 def test_set_padding(self):
     tok = QgramTokenizer()
     self.assertEqual(tok.get_padding(), True)
     self.assertEqual(
         tok.tokenize('database'),
         ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
     tok.set_padding(False)
     self.assertEqual(tok.get_padding(), False)
     self.assertEqual(tok.tokenize('database'),
                      ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])
 def test_invalid_candset(self):
     tok = QgramTokenizer(qval=2, return_set=True)
     sim_func = get_sim_function('JACCARD')
     threshold = 0.3
     apply_matcher([], DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                   DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable,
                   self.rtable, self.l_key_attr, self.r_key_attr,
                   self.l_join_attr, self.r_join_attr, tok, sim_func,
                   threshold)
 def test_invalid_rtable(self):
     tok = QgramTokenizer(qval=2, return_set=True)
     sim_func = get_sim_function('JACCARD')
     threshold = 0.3
     apply_matcher(pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID']),
                   DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                   DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable, [],
                   self.l_key_attr, self.r_key_attr, self.l_join_attr,
                   self.r_join_attr, tok, sim_func, threshold)
 def test_set_suffix_pad(self):
     tok = QgramTokenizer()
     self.assertEqual(tok.get_suffix_pad(), '$')
     self.assertEqual(
         tok.tokenize('database'),
         ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
     tok.set_suffix_pad('!')
     self.assertEqual(tok.get_suffix_pad(), '!')
     self.assertEqual(
         tok.tokenize('database'),
         ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!'])
    def __init__(self, vals):
        self.vals = sorted(vals, key=lambda x: x.lower(), reverse=True)
        self.val_map = vals if isinstance(vals, dict) else None
        # whether to show debugging info or not
        self.show = False

        jarowinkler_sim = JaroWinkler()
        levenshtein_sim = Levenshtein()
        qgtok = QgramTokenizer(qval=3, padding=True)
        jaccard_sim = Jaccard()
        Jaccard3Gram = lambda x, y: jaccard_sim.get_sim_score(
            qgtok.tokenize(x), qgtok.tokenize(y))

        self.str_sims = {
            'Jaro-Winkler': jarowinkler_sim.get_sim_score,
            'Levenshtein': levenshtein_sim.get_sim_score,
            '3gram Jaccard': Jaccard3Gram
        }

        self.linkages = ['single', 'average', 'complete']
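The '3gram Jaccard' entry above composes a padded q-gram tokenizer with py_stringmatching's Jaccard measure. A minimal sketch of that composition, assuming py_stringmatching is installed:

from py_stringmatching import Jaccard, QgramTokenizer

qg3_tok = QgramTokenizer(qval=3, padding=True)
jaccard_sim = Jaccard()

def jaccard_3gram(x, y):
    # tokenize both strings into padded 3-grams and compare the token collections
    return jaccard_sim.get_sim_score(qg3_tok.tokenize(x), qg3_tok.tokenize(y))

print(jaccard_3gram('database', 'databases'))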
 def test_empty_candset(self):
     tok = QgramTokenizer(qval=2, return_set=True)
     sim_func = get_sim_function('JACCARD')
     threshold = 0.3
     empty_candset = pd.DataFrame(columns=[
         DEFAULT_L_OUT_PREFIX + self.l_key_attr, DEFAULT_R_OUT_PREFIX +
         self.r_key_attr
     ])
     apply_matcher(empty_candset, DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                   DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable,
                   self.rtable, self.l_key_attr, self.r_key_attr,
                   self.l_join_attr, self.r_join_attr, tok, sim_func,
                   threshold)
Example 15
    def test_candset_with_numeric_r_filter_attr(self):
        A = pd.DataFrame([{'l_id': 1, 'l_attr': '1990'}])
        B = pd.DataFrame([{'r_id': 1, 'r_attr': 2001}])

        A['tmp_join_key'] = 1
        B['tmp_join_key'] = 1
        C = pd.merge(A[['l_id', 'tmp_join_key']],
                     B[['r_id', 'tmp_join_key']],
                     on='tmp_join_key').drop('tmp_join_key', axis=1)

        qg2_tok = QgramTokenizer(2, return_set=True)
        overlap_filter = OverlapFilter(qg2_tok)
        overlap_filter.filter_candset(C, 'l_id', 'r_id', A, B, 'l_id', 'r_id',
                                      'l_attr', 'r_attr')
Example 16
    def test_jac_qg2_with_filter_attr_of_type_int(self):
        A = pd.DataFrame([{
            'l_id': 1,
            'l_attr': 1990
        }, {
            'l_id': 2,
            'l_attr': 2000
        }, {
            'l_id': 3,
            'l_attr': 0
        }, {
            'l_id': 4,
            'l_attr': -1
        }, {
            'l_id': 5,
            'l_attr': 1986
        }])
        B = pd.DataFrame([{
            'r_id': 1,
            'r_attr': 2001
        }, {
            'r_id': 2,
            'r_attr': 1992
        }, {
            'r_id': 3,
            'r_attr': 1886
        }, {
            'r_id': 4,
            'r_attr': 2007
        }, {
            'r_id': 5,
            'r_attr': 2012
        }])

        dataframe_column_to_str(A, 'l_attr', inplace=True)
        dataframe_column_to_str(B, 'r_attr', inplace=True)

        qg2_tok = QgramTokenizer(2, return_set=True)
        expected_pairs = set([
            '1,1', '1,2', '1,3', '1,4', '1,5', '2,1', '2,2', '2,3', '2,4',
            '2,5', '5,1', '5,2', '5,3', '5,4', '5,5'
        ])
        self.test_filter_tables(qg2_tok, 'JACCARD', 0.8, False, False,
                                (A, B, 'l_id', 'r_id', 'l_attr', 'r_attr'),
                                expected_pairs)
Example 17
    def test_edit_distance_qg2_2(self):
        A = pd.DataFrame([{
            'l_id': 1,
            'l_attr': '19990'
        }, {
            'l_id': 2,
            'l_attr': '200'
        }, {
            'l_id': 3,
            'l_attr': '0'
        }, {
            'l_id': 4,
            'l_attr': ''
        }, {
            'l_id': 5,
            'l_attr': np.nan
        }])
        B = pd.DataFrame([{
            'r_id': 1,
            'r_attr': '200155'
        }, {
            'r_id': 2,
            'r_attr': '190'
        }, {
            'r_id': 3,
            'r_attr': '2010'
        }, {
            'r_id': 4,
            'r_attr': ''
        }, {
            'r_id': 5,
            'r_attr': np.nan
        }, {
            'r_id': 6,
            'r_attr': '18950'
        }])

        qg2_tok = QgramTokenizer(2)
        expected_pairs = set(
            ['1,2', '1,6', '2,1', '2,2', '2,3', '3,2', '3,3', '4,4'])
        self.test_filter_tables(qg2_tok, 'EDIT_DISTANCE', 2, False, False,
                                (A, B, 'l_id', 'r_id', 'l_attr', 'r_attr'),
                                expected_pairs)
    def test_jac_qg2_with_filter_attr_of_type_int(self):
        A = pd.DataFrame([{'l_id': 1, 'l_attr':1990},
                          {'l_id': 2, 'l_attr':2000},
                          {'l_id': 3, 'l_attr':0},
                          {'l_id': 4, 'l_attr':-1},
                          {'l_id': 5, 'l_attr':1986}])
        B = pd.DataFrame([{'r_id': 1, 'r_attr':2001},
                          {'r_id': 2, 'r_attr':1992},
                          {'r_id': 3, 'r_attr':1886},
                          {'r_id': 4, 'r_attr':2007},
                          {'r_id': 5, 'r_attr':2012}])

        dataframe_column_to_str(A, 'l_attr', inplace=True)                      
        dataframe_column_to_str(B, 'r_attr', inplace=True)

        qg2_tok = QgramTokenizer(2, return_set=True)
        self.test_filter_tables(qg2_tok, 'JACCARD', 0.3, False, False,
                                (A, B,
                                'l_id', 'r_id', 'l_attr', 'r_attr'))
    def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                     rem_stop_words=False, q_val=None, word_level=True, overlap_size=1,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     allow_missing=False, verbose=False, show_progress=True,
                     n_ltable_chunks=1, n_rtable_chunks=1):

        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

        Blocks two tables based on the overlap of token sets of attribute
        values. Finds tuple pairs from left and right tables such that the overlap
        between (a) the set of tokens obtained by tokenizing the value of
        attribute l_overlap_attr of a tuple from the left table, and (b) the
        set of tokens obtained by tokenizing the value of attribute
        r_overlap_attr of a tuple from the right table, is above a certain
        threshold.

        Args:
            ltable (DataFrame): The left input table.

            rtable (DataFrame): The right input table.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
             (e.g., a, an, the) should be removed from the token sets of the
             overlap attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes
             values are to be tokenized as qgrams (defaults to None).

            word_level (boolean): A flag to indicate whether the overlap
             attributes should be tokenized as words (i.e, using whitespace
             as delimiter) (defaults to True).

            overlap_size (int): The minimum number of tokens that must
             overlap (defaults to 1).
            l_output_attrs (list): A list of attribute names from the left
                table to be included in the output candidate set (defaults
                to None).
            r_output_attrs (list): A list of attribute names from the right
                table to be included in the output candidate set  (defaults
                to None).

            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.

            verbose (boolean): A flag to indicate whether the debug
                information should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                be displayed to the user (defaults to True).

            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            


        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).
        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.

            AssertionError: If `rtable` is not of type pandas
                DataFrame.

            AssertionError: If `l_overlap_attr` is not of type string.

            AssertionError: If `r_overlap_attr` is not of type string.

            AssertionError: If `l_output_attrs` is not of type of
             list.

            AssertionError: If `r_output_attrs` is not of type of
             list.

            AssertionError: If the values in `l_output_attrs` is not of type
             string.

            AssertionError: If the values in `r_output_attrs` is not of type
             string.

            AssertionError: If `l_output_prefix` is not of type
             string.

            AssertionError: If `r_output_prefix` is not of type
             string.

            AssertionError: If `q_val` is not of type int.

            AssertionError: If `word_level` is not of type boolean.

            AssertionError: If `overlap_size` is not of type int.

            AssertionError: If `verbose` is not of type
             boolean.

            AssertionError: If `allow_missing` is not of type boolean.

            AssertionError: If `show_progress` is not of type
             boolean.

            AssertionError: If `n_ltable_chunks` is not of type
             int.

            AssertionError: If `n_rtable_chunks` is not of type
             int.

            AssertionError: If `l_overlap_attr` is not in the ltable
             columns.

            AssertionError: If `r_overlap_attr` is not in the rtable columns.

            AssertionError: If `l_output_attrs` are not in the ltable.

            AssertionError: If `r_output_attrs` are not in the rtable.

            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.

            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.

        Examples:
            >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = DaskOverlapBlocker()
            # Use all cores
            # Use word-level tokenizer
            >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1, n_ltable_chunks=-1, n_rtable_chunks=-1)
            # Use q-gram tokenizer
            >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2, n_ltable_chunks=-1, n_rtable_chunks=-1)
            # Include all possible missing values
            >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True, n_ltable_chunks=-1, n_rtable_chunks=-1)
        """
        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
            "RISK.")

        # Input validations
        self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                          r_output_attrs, l_output_prefix,
                                          r_output_prefix, verbose, n_ltable_chunks, n_rtable_chunks)
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level, overlap_size)
        self.validate_allow_missing(allow_missing)
        self.validate_show_progress(show_progress)
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)
        self.validate_word_level_qval(word_level, q_val)

        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

        # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)


        # validate input table chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int,
                             'Parameter n_rtable_chunks')
        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        if n_ltable_chunks == -1:
            n_ltable_chunks = multiprocessing.cpu_count()


        ltable_chunks = pd.np.array_split(ltable, n_ltable_chunks)

        # preprocess/tokenize ltable
        if word_level:
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        preprocessed_tokenized_ltbl = []

        # Construct DAG for preprocessing/tokenizing ltable chunks
        start_row_id = 0
        for i in range(len(ltable_chunks)):
            result = delayed(self.process_tokenize_block_attr)(
                ltable_chunks[i][l_overlap_attr], start_row_id,
                rem_stop_words, tokenizer)
            preprocessed_tokenized_ltbl.append(result)
            start_row_id += len(ltable_chunks[i])
        preprocessed_tokenized_ltbl = delayed(wrap)(preprocessed_tokenized_ltbl)

        # Execute the DAG
        if show_progress:
            with ProgressBar():
                logger.info('Preprocessing/tokenizing ltable')
                preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
                    scheduler="processes", num_workers=multiprocessing.cpu_count())
        else:
            preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())

        ltable_processed_dict = {}
        for i in range(len(preprocessed_tokenized_ltbl_vals)):
            ltable_processed_dict.update(preprocessed_tokenized_ltbl_vals[i])

        # build inverted index
        inverted_index = self.build_inverted_index(ltable_processed_dict)

        if n_rtable_chunks == -1:
            n_rtable_chunks = multiprocessing.cpu_count()

        rtable_chunks = pd.np.array_split(rtable, n_rtable_chunks)

        # Construct the DAG for probing
        probe_result = []
        start_row_id = 0
        for i in range(len(rtable_chunks)):
            result = delayed(self.probe)(rtable_chunks[i][r_overlap_attr],
                                         inverted_index, start_row_id, rem_stop_words,
                                         tokenizer, overlap_size)
            probe_result.append(result)
            start_row_id += len(rtable_chunks[i])
        probe_result = delayed(wrap)(probe_result)

        # Execute the DAG for probing
        if show_progress:
            with ProgressBar():
                logger.info('Probing using rtable')
                probe_result = probe_result.compute(
                    scheduler="processes", num_workers=multiprocessing.cpu_count())
        else:
            probe_result = probe_result.compute(scheduler="processes",
                                                num_workers=multiprocessing.cpu_count())

        # construct a minimal dataframe that can be used to add more attributes
        flat_list = [item for sublist in probe_result for item in sublist]
        tmp = pd.DataFrame(flat_list, columns=['fk_ltable_rid', 'fk_rtable_rid'])
        fk_ltable = ltable.iloc[tmp.fk_ltable_rid][l_key].values
        fk_rtable = rtable.iloc[tmp.fk_rtable_rid][r_key].values
        id_vals = list(range(len(flat_list)))

        candset = pd.DataFrame.from_dict(
            {'_id': id_vals, l_output_prefix+l_key: fk_ltable, r_output_prefix+r_key: fk_rtable})


        # set the properties for the candidate set
        cm.set_key(candset, '_id')
        cm.set_fk_ltable(candset, 'ltable_'+l_key)
        cm.set_fk_rtable(candset, 'rtable_'+r_key)
        cm.set_ltable(candset, ltable)
        cm.set_rtable(candset, rtable)

        ret_candset = gh.add_output_attributes(candset, l_output_attrs=l_output_attrs,
                                               r_output_attrs=r_output_attrs,
                                               l_output_prefix=l_output_prefix,
                                               r_output_prefix=r_output_prefix,
                                               validate=False)



        # handle missing values
        if allow_missing:
            missing_value_pairs = get_pairs_with_missing_value(ltable, rtable, l_key,
                                                           r_key, l_overlap_attr,
                                                           r_overlap_attr,
                                                           l_output_attrs,
                                                           r_output_attrs,
                                                           l_output_prefix,
                                                           r_output_prefix, False, False)
            missing_value_pairs.insert(0, '_id', range(len(ret_candset),
                                                       len(ret_candset)+len(missing_value_pairs)))

            if len(missing_value_pairs) > 0:
                ret_candset = pd.concat([ret_candset, missing_value_pairs], ignore_index=True, sort=False)
                cm.set_key(ret_candset, '_id')
                cm.set_fk_ltable(ret_candset, 'ltable_' + l_key)
                cm.set_fk_rtable(ret_candset, 'rtable_' + r_key)
                cm.set_ltable(ret_candset, ltable)
                cm.set_rtable(ret_candset, rtable)

        # Return the final candidate set to user.
        return ret_candset
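The core of the overlap blocking performed by block_tables above is: tokenize the blocking attribute of every left tuple, build an inverted index from token to left row ids, then probe that index with the right tuples and keep pairs that share at least overlap_size tokens. Below is a minimal pure-Python sketch of that idea (a hypothetical helper, without the validation, chunking, and Dask parallelism of the real command); it assumes py_stringmatching is installed.

from py_stringmatching import WhitespaceTokenizer

def overlap_block(l_values, r_values, overlap_size=1):
    tok = WhitespaceTokenizer(return_set=True)
    # inverted index over the left values: token -> set of left row ids
    inverted_index = {}
    for l_id, val in enumerate(l_values):
        for token in tok.tokenize(str(val)):
            inverted_index.setdefault(token, set()).add(l_id)
    # probe the index with the right values, counting shared tokens per pair
    pairs = []
    for r_id, val in enumerate(r_values):
        counts = {}
        for token in tok.tokenize(str(val)):
            for l_id in inverted_index.get(token, ()):
                counts[l_id] = counts.get(l_id, 0) + 1
        pairs.extend((l_id, r_id) for l_id, cnt in counts.items()
                     if cnt >= overlap_size)
    return pairs

# two pairs survive: ('main st', '12 main st') and ('lake view ave', 'oak ave')
print(overlap_block(['main st', 'lake view ave'], ['12 main st', 'oak ave']))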
Example 20
 def test_edit_dist_qg2_no_padding_empty(self):
     self.test_filter_pair('', '', QgramTokenizer(2, padding=False),
                           'EDIT_DISTANCE', 1, False, False, False)
Example 21
 def setUp(self):
     self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)
     self.qg2 = QgramTokenizer(2)
Example 22
def edit_distance_join(ltable,
                       rtable,
                       l_key_attr,
                       r_key_attr,
                       l_join_attr,
                       r_join_attr,
                       threshold,
                       comp_op='<=',
                       allow_missing=False,
                       l_out_attrs=None,
                       r_out_attrs=None,
                       l_out_prefix='l_',
                       r_out_prefix='r_',
                       out_sim_score=True,
                       n_jobs=1,
                       show_progress=True,
                       tokenizer=QgramTokenizer(qval=2)):
    """Join two tables using edit distance measure.

    Finds tuple pairs from the left and right tables such that the edit
    distance between the join attribute values satisfies the condition
    given by the input threshold and comparison operator. For example, if
    the comparison operator is '<=', it finds tuple pairs whose edit
    distance between the join attribute values is less than or equal to
    the input threshold.

    Note:
        Currently, this method only computes an approximate join result. This is
        because, to perform the join we transform an edit distance measure 
        between strings into an overlap measure between qgrams of the strings. 
        Hence, two input strings must share at least one qgram to appear in
        the join output; pairs of short strings whose qgrams are completely
        disjoint are therefore missed.
 
        This method implements a simplified version of the algorithm proposed in
        `Ed-Join: An Efficient Algorithm for Similarity Joins With Edit Distance
        Constraints (Chuan Xiao, Wei Wang and Xuemin Lin), VLDB 08
        <http://www.vldb.org/pvldb/1/1453957.pdf>`_. 
        
    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        threshold (float): edit distance threshold to be satisfied.        
                                                                                
        comp_op (string): comparison operator. Supported values are '<=', '<'   
            and '=' (defaults to '<=').                                         
                                                                                
        allow_missing (boolean): flag to indicate whether tuple pairs with      
            missing value in at least one of the join attributes should be      
            included in the output (defaults to False). If this flag is set to
            True, a tuple in ltable with missing value in the join attribute 
            will be matched with every tuple in rtable and vice versa. 
                                                                                
        l_out_attrs (list): list of attribute names from the left table to be   
            included in the output table (defaults to None).                    
                                                                                
        r_out_attrs (list): list of attribute names from the right table to be  
            included in the output table (defaults to None).                    
                                                                                
        l_out_prefix (string): prefix to be used for the attribute names coming 
            from the left table, in the output table (defaults to 'l\_').       
                                                                                
        r_out_prefix (string): prefix to be used for the attribute names coming 
            from the right table, in the output table (defaults to 'r\_').      
                                                                                
        out_sim_score (boolean): flag to indicate whether the edit distance 
            score should be included in the output table (defaults to True). 
            Setting this flag to True will add a column named '_sim_score' in 
            the output table. This column will contain the edit distance scores 
            for the tuple pairs in the output.                                          

        n_jobs (int): number of parallel jobs to use for the computation        
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,  
            no parallel computing code is used at all, which is useful for      
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used      
            (where n_cpus is the total number of CPUs in the machine). Thus for 
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)    
            becomes less than 1, then no parallel computing code will be used   
            (i.e., equivalent to the default).                                                                                 
                                                                                
        show_progress (boolean): flag to indicate whether task progress should  
            be displayed to the user (defaults to True).                        

        tokenizer (Tokenizer): tokenizer to be used to tokenize the join 
            attributes during filtering, when edit distance measure is          
            transformed into an overlap measure. This must be a q-gram tokenizer
            (defaults to 2-gram tokenizer).
                                                                                
    Returns:                                                                    
        An output table containing tuple pairs that satisfy the join            
        condition (DataFrame).  
    """

    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns, 'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns, 'join attribute', 'right table')

    # check if the join attributes are not of numeric type
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,
                       'join attribute', 'left table')
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,
                       'join attribute', 'right table')

    # check if the input tokenizer is valid for edit distance measure. Only
    # qgram tokenizer can be used for edit distance.
    validate_tokenizer_for_sim_measure(tokenizer, 'EDIT_DISTANCE')

    # check if the input threshold is valid
    validate_threshold(threshold, 'EDIT_DISTANCE')

    # check if the comparison operator is valid
    validate_comp_op_for_sim_measure(comp_op, 'EDIT_DISTANCE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs,
                          rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # convert the threshold to an integer (in case it is a float)
    threshold = int(floor(threshold))

    # set return_set flag of tokenizer to be False, in case it is set to True
    revert_tokenizer_return_set_flag = False
    if tokenizer.get_return_set():
        tokenizer.set_return_set(False)
        revert_tokenizer_return_set_flag = True

    # remove redundant attrs from output attrs.
    l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
    r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr, r_join_attr)

    # Do a projection on the input dataframes to keep only the required
    # attributes. Then, remove rows with missing value in join attribute from
    # the input dataframes. Then, convert the resulting dataframes into ndarray.
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # computes the actual number of jobs to launch.
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

    if n_jobs <= 1:
        # if n_jobs is 1, do not use any parallel code.
        output_table = _edit_distance_join_split(
            ltable_array, rtable_array, l_proj_attrs, r_proj_attrs, l_key_attr,
            r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold,
            comp_op, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix,
            out_sim_score, show_progress)
    else:
        # if n_jobs is above 1, split the right table into n_jobs splits and
        # join each right table split with the whole of left table in a separate
        # process.
        r_splits = split_table(rtable_array, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(_edit_distance_join_split)(
            ltable_array, r_splits[job_index], l_proj_attrs, r_proj_attrs,
            l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer,
            threshold, comp_op, l_out_attrs, r_out_attrs, l_out_prefix,
            r_out_prefix, out_sim_score, (
                show_progress and (job_index == n_jobs - 1)))
                                          for job_index in range(n_jobs))
        output_table = pd.concat(results)

    # If allow_missing flag is set, then compute all pairs with missing value in
    # at least one of the join attributes and then add it to the output
    # obtained from the join.
    if allow_missing:
        missing_pairs = get_pairs_with_missing_value(
            ltable, rtable, l_key_attr, r_key_attr, l_join_attr, r_join_attr,
            l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix,
            out_sim_score, show_progress)
        output_table = pd.concat([output_table, missing_pairs])

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    # revert the return_set flag of tokenizer, in case it was modified.
    if revert_tokenizer_return_set_flag:
        tokenizer.set_return_set(True)

    return output_table
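The Note in the docstring above has a practical consequence: a pair can satisfy the edit distance threshold yet be dropped because its strings share no q-grams. A minimal sketch of that corner case, assuming py_stringsimjoin and pandas are installed (data made up for illustration):

import pandas as pd
import py_stringsimjoin as ssj

A = pd.DataFrame([{'id': 1, 'name': 'ab'}])
B = pd.DataFrame([{'id': 1, 'name': 'cd'}])

# 'ab' and 'cd' are within edit distance 2, but they share no 2-grams, so the
# q-gram based filter prunes the pair and it is missing from the output.
pairs = ssj.edit_distance_join(A, B, 'id', 'id', 'name', 'name', 2)
print(len(pairs))  # expected: 0, per the Note above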
def disk_edit_distance_join(ltable, rtable,
                            l_key_attr, r_key_attr,
                            l_join_attr, r_join_attr,
                            threshold, data_limit=1000000,
                            comp_op='<=', allow_missing=False,
                            l_out_attrs=None, r_out_attrs=None,
                            l_out_prefix='l_', r_out_prefix='r_',
                            out_sim_score=True, n_jobs=-1,
                            show_progress=True, tokenizer=QgramTokenizer(qval=2),
                            temp_dir=os.getcwd(), output_file_path=default_output_file_path):

    """
    WARNING: THIS IS AN EXPERIMENTAL COMMAND. THIS COMMAND IS NOT TESTED. 
    USE AT YOUR OWN RISK.

    Join two tables using edit distance measure.

    This is the disk-based version of the edit_distance_join API above. When
    joining large datasets, the intermediate in-memory data structures can grow
    very large and cause the program to terminate with insufficient memory.
    disk_edit_distance_join addresses this by spilling intermediate results to
    disk, so it can be used when the machine has limited memory or the input
    tables are too large to process entirely in memory.


    It finds tuple pairs from the left and right tables such that the edit
    distance between the join attribute values satisfies the condition
    given by the input threshold and comparison operator. For example, if
    the comparison operator is '<=', it finds tuple pairs whose edit
    distance between the join attribute values is less than or equal to
    the input threshold.

    Note:
        Currently, this method only computes an approximate join result. This is
        because, to perform the join we transform an edit distance measure
        between strings into an overlap measure between qgrams of the strings.
        Hence, two input strings must share at least one qgram to appear in
        the join output; pairs of short strings whose qgrams are completely
        disjoint are therefore missed.

        This method implements a simplified version of the algorithm proposed in
        `Ed-Join: An Efficient Algorithm for Similarity Joins With Edit Distance
        Constraints (Chuan Xiao, Wei Wang and Xuemin Lin), VLDB 08
        <http://www.vldb.org/pvldb/1/1453957.pdf>`_.

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        threshold (float): edit distance threshold to be satisfied.

        data_limit (int): threshold value for number of rows that would be kept
            in memory before writing the output on the disk. This is the maximum sum
            total of all rows that can be present in memory across all processes at
            a time. This is a new argument compared to edit distance join.
            (defaults to 1M)

        comp_op (string): comparison operator. Supported values are '<=', '<'
            and '=' (defaults to '<=').

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set to
            True, a tuple in ltable with missing value in the join attribute
            will be matched with every tuple in rtable and vice versa.

        l_out_attrs (list): list of attribute names from the left table to be
            included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to be
            included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names coming
            from the left table, in the output table (defaults to 'l\_').

        r_out_prefix (string): prefix to be used for the attribute names coming
            from the right table, in the output table (defaults to 'r\_').

        out_sim_score (boolean): flag to indicate whether the edit distance
            score should be included in the output table (defaults to True).
            Setting this flag to True will add a column named '_sim_score' in
            the output table. This column will contain the edit distance scores
            for the tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to -1). If -1 is given, all CPUs are used. If 1 is given,
            no parallel computing code is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus is the total number of CPUs in the machine). Thus for
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)
            becomes less than 1, then no parallel computing code will be used
            (i.e., equivalent to the default).

        show_progress (boolean): flag to indicate whether task progress should
            be displayed to the user (defaults to True).

        tokenizer (Tokenizer): tokenizer to be used to tokenize the join
            attributes during filtering, when edit distance measure is
            transformed into an overlap measure. This must be a q-gram tokenizer
            (defaults to 2-gram tokenizer).

        temp_dir (string): absolute path where all the intermediate files will be generated.
            This is a new argument compared to edit distance join. (defaults to the current
            working directory).

        output_file_path (string): absolute path where the output file will be generated.
            Older file with same path and name will be removed. This is a new argument compared
            to edit distance join. (defaults to the current working directory/$default_output_file_name).

    Returns:
        Returns the status of the computation. True if successfully completed else False (boolean).
    """

    from py_stringsimjoin import __use_cython__ 
    if __use_cython__:
        from py_stringsimjoin.join.disk_edit_distance_join_cy import disk_edit_distance_join_cy
        return disk_edit_distance_join_cy(ltable, rtable,
                                          l_key_attr, r_key_attr,
                                          l_join_attr, r_join_attr,
                                          threshold, data_limit,
                                          comp_op,
                                          allow_missing,
                                          l_out_attrs, r_out_attrs,
                                          l_out_prefix, r_out_prefix,
                                          out_sim_score, n_jobs,
                                          show_progress,tokenizer,
                                          temp_dir, output_file_path)
    else:
        raise AssertionError('Cython not installed.')
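A minimal usage sketch for the disk-based variant defined above; it assumes the Cython build of py_stringsimjoin is available, and the file, column, and path names are made up for illustration.

import pandas as pd

A = pd.read_csv('table_A.csv')   # hypothetical input files
B = pd.read_csv('table_B.csv')

# results are spilled to temp_dir and written to output_file_path instead of
# being returned as a DataFrame; per the docstring, the call returns True on success.
ok = disk_edit_distance_join(A, B, 'A.ID', 'B.ID', 'A.name', 'B.name',
                             threshold=2, data_limit=500000,
                             temp_dir='/tmp', output_file_path='/tmp/ed_join_out.csv')
print(ok)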
    def test_apply_matcher_with_allow_missing(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(
            lambda row: sim_func(tok.tokenize(str(row[self.l_join_attr])),
                                 tok.tokenize(str(row[self.r_join_attr]))),
            axis=1)

        # compute expected output pairs
        comp_fn = COMP_OP_MAP[comp_op]
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join(
                    (str(row[self.l_key_attr]), str(row[self.r_key_attr]))))

        # find pairs that need to be included in output due to
        # the presence of missing value in one of the join attributes.
        missing_pairs = set()
        for l_idx, l_row in self.orig_ltable.iterrows():
            for r_idx, r_row in self.orig_rtable.iterrows():
                if (pd.isnull(l_row[self.l_join_attr])
                        or pd.isnull(r_row[self.r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                                str(r_row[self.r_key_attr]))))

        # add the pairs containing missing value to the set of expected pairs.
        expected_pairs = expected_pairs.union(missing_pairs)

        # use overlap filter to obtain a candset with allow_missing set to True.
        overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
        candset = overlap_filter.filter_tables(
            self.orig_ltable, self.orig_rtable, self.l_key_attr,
            self.r_key_attr, self.l_join_attr, self.r_join_attr)

        # apply a jaccard matcher to the candset with allow_missing set to True.
        output_candset = apply_matcher(candset,
                                       DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                                       DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                                       self.orig_ltable,
                                       self.orig_rtable,
                                       self.l_key_attr,
                                       self.r_key_attr,
                                       self.l_join_attr,
                                       self.r_join_attr,
                                       tok,
                                       sim_func,
                                       threshold,
                                       comp_op,
                                       True,
                                       out_sim_score=True)

        expected_output_attrs = [
            '_id', DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr, '_sim_score'
        ]

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join(
                (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                 str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
def test_edit_distance_join():
    # data to be tested.
    test_scenario_1 = [('data/table_A.csv', 'A.ID', 'A.name'),
                       ('data/table_B.csv', 'B.ID', 'B.name')]
    data = {'TEST_SCENARIO_1': test_scenario_1}

    # edit distance thresholds to be tested.
    thresholds = [1, 2, 3, 4, 8, 9]

    # tokenizers to be tested.
    tokenizers = {
        '2_GRAM': QgramTokenizer(qval=2),
        '3_GRAM': QgramTokenizer(qval=3)
    }

    # comparison operators to be tested.
    comp_ops = ['<=', '<', '=']

    sim_measure_type = 'EDIT_DISTANCE'
    # Test each combination of threshold and tokenizer
    # for different test scenarios.
    for label, scenario in iteritems(data):
        for threshold in thresholds:
            for tok_type, tok in iteritems(tokenizers):
                for comp_op in comp_ops:
                    test_function = partial(test_valid_join, scenario, tok,
                                            threshold, comp_op)
                    test_function.description = 'Test ' + sim_measure_type + \
                        ' with ' + str(threshold) + ' threshold and ' + \
                        tok_type + ' tokenizer for ' + label + '.'
                    yield test_function,

    # Test with allow_missing flag set to True.
    test_function = partial(
        test_valid_join, test_scenario_1, tokenizers['2_GRAM'], 9, '<=',
        (True, ['A.birth_year', 'A.zipcode'], ['B.name', 'B.zipcode']))
    test_function.description = 'Test ' + sim_measure_type + \
                                ' with allow_missing set to True.'
    yield test_function,

    # Test with output attributes added.
    test_function = partial(test_valid_join, test_scenario_1,
                            tokenizers['2_GRAM'], 9, '<=',
                            (False, ['A.ID', 'A.birth_year', 'A.zipcode'
                                     ], ['B.ID', 'B.name', 'B.zipcode']))
    test_function.description = 'Test ' + sim_measure_type + \
                                ' with output attributes.'
    yield test_function,

    # Test with a different output prefix.
    test_function = partial(test_valid_join, test_scenario_1,
                            tokenizers['2_GRAM'], 9, '<=',
                            (False, ['A.birth_year', 'A.zipcode'],
                             ['B.name', 'B.zipcode'], 'ltable.', 'rtable.'))
    test_function.description = 'Test ' + sim_measure_type + \
                                ' with output attributes and prefix.'
    yield test_function,

    # Test with output_sim_score disabled.
    test_function = partial(
        test_valid_join, test_scenario_1, tokenizers['2_GRAM'], 9, '<=',
        (False, ['A.birth_year', 'A.zipcode'
                 ], ['B.name', 'B.zipcode'], 'ltable.', 'rtable.', False))
    test_function.description = 'Test ' + sim_measure_type + \
                                ' with sim_score disabled.'
    yield test_function,

    # Test with n_jobs above 1.
    test_function = partial(
        test_valid_join, test_scenario_1, tokenizers['2_GRAM'], 9, '<=',
        (False, ['A.birth_year', 'A.zipcode'
                 ], ['B.name', 'B.zipcode'], 'ltable.', 'rtable.', False, 2))
    test_function.description = 'Test ' + sim_measure_type + \
                                ' with n_jobs above 1.'
    yield test_function,

    # scenario where join attributes are of type int
    test_scenario_2 = [(os.sep.join(['data',
                                     'table_A.csv']), 'A.ID', 'A.zipcode'),
                       (os.sep.join(['data',
                                     'table_B.csv']), 'B.ID', 'B.zipcode')]

    # Test with join attribute of type int.
    test_function = partial(test_valid_join, test_scenario_2,
                            tokenizers['2_GRAM'], 3, '<=', (), True)
    test_function.description = 'Test ' + sim_measure_type + \
                                ' with join attribute of type int.'
    yield test_function,

    # scenario where join attributes are of type float
    test_scenario_3 = [(os.sep.join(['data',
                                     'table_A.csv']), 'A.ID', 'A.hourly_wage'),
                       (os.sep.join(['data',
                                     'table_B.csv']), 'B.ID', 'B.hourly_wage')]

    # Test with join attribute of type float.
    test_function = partial(test_valid_join, test_scenario_3,
                            tokenizers['2_GRAM'], 3, '<=', (), True)
    test_function.description = 'Test ' + sim_measure_type + \
                                ' with join attribute of type float.'
    yield test_function,

    # Test with a tokenizer where return_set flag is set to True.
    tok = QgramTokenizer(2, return_set=True)
    test_function = partial(test_valid_join, test_scenario_1, tok, 9)
    test_function.description = 'Test ' + sim_measure_type + \
                        ' with a tokenizer where return_set flag is set to True'
    yield test_function,
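
# A minimal standalone call of the join exercised by the generator tests
# above (hedged: toy data; edit_distance_join keeps pairs whose Levenshtein
# distance satisfies comp_op against the threshold).
import pandas as pd
import py_stringsimjoin as ssj

A = pd.DataFrame({'A.ID': [1, 2], 'A.name': ['kevin smith', 'kevyn smith']})
B = pd.DataFrame({'B.ID': [1], 'B.name': ['kevin smyth']})

C = ssj.edit_distance_join(A, B, 'A.ID', 'B.ID', 'A.name', 'B.name', 2,
                           comp_op='<=', show_progress=False)
print(C)   # both A rows are within edit distance 2 of 'kevin smyth'
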
    def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                      rem_stop_words=False, q_val=None, word_level=True,
                      overlap_size=1, allow_missing=False,
                      verbose=False, show_progress=True, n_chunks=-1):

        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.
        
        Blocks an input candidate set of tuple pairs based on the overlap
        of token sets of attribute values. Finds tuple pairs from an input 
        candidate set of tuple pairs such that
        the overlap between (a) the set of tokens obtained by tokenizing the
        value of attribute l_overlap_attr of the left tuple in a tuple pair,
        and (b) the set of tokens obtained by tokenizing the value of
        attribute r_overlap_attr of the right tuple in the tuple pair,
        is above a certain threshold.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).

            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.

            verbose (boolean): A flag to indicate whether the debug information
                should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_chunks (int): The number of partitions to split the candidate set. If it 
                            is set to -1, the number of partitions will be set to the 
                            number of cores in the machine.  

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_overlap_attr` is not of type string.
            AssertionError: If `r_overlap_attr` is not of type string.
            AssertionError: If `q_val` is not of type int.
            AssertionError: If `word_level` is not of type boolean.
            AssertionError: If `overlap_size` is not of type int.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_chunks` is not of type
                int.
            AssertionError: If `l_overlap_attr` is not in the ltable
                columns.
            AssertionError: If `r_overlap_attr` is not in the rtable columns.
            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.
            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.
        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = DaskOverlapBlocker()
            >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])

            >>> D1 = ob.block_candset(C, 'name', 'name')
            # Include all possible tuple pairs with missing values
            >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ob.block_candset(C, 'name', 'name', n_chunks=-1)
            # Use q-gram tokenizer
            >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)


        """
        logger.warning(
            "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
            "RISK.")

        # Validate input parameters
        self.validate_types_params_candset(candset, verbose, show_progress, n_chunks)
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level, overlap_size)

        # get and validate metadata
        log_info(logger,
                 'Required metadata: cand.set key, fk ltable, fk rtable, '
                 'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # validate number of chunks
        validate_object_type(n_chunks, int, 'Parameter n_chunks')
        validate_chunks(n_chunks)


        # # do projection before merge
        l_df = ltable[[l_key, l_overlap_attr]]
        r_df = rtable[[r_key, r_overlap_attr]]

        # # set index for convenience
        l_df = l_df.set_index(l_key, drop=False)
        r_df = r_df.set_index(r_key, drop=False)

        # # cast the overlap attribute to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        if word_level:
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)


        n_chunks = get_num_partitions(n_chunks, len(candset))
        c_splits = np.array_split(candset, n_chunks)  # assumes numpy is imported as np
        valid_splits = []

        # Create DAG
        for i in range(n_chunks):
            result = delayed(self._block_candset_split)(c_splits[i], l_df, r_df, l_key,
                                                       r_key, l_overlap_attr,
                                                       r_overlap_attr, fk_ltable,
                                                       fk_rtable, allow_missing,
                                                       rem_stop_words, tokenizer, overlap_size)
            valid_splits.append(result)
        valid_splits = delayed(wrap)(valid_splits)

        # Execute the DAG
        if show_progress:
            with ProgressBar():
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())
        else:
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())

        valid = sum(valid_splits, [])

        # construct output table
        if len(candset) > 0:
            out_table = candset[valid]
        else:
            out_table = pd.DataFrame(columns=candset.columns)

        # update the catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                                  ltable, rtable)

        # return the output table
        return out_table
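
# A stripped-down sketch of the split -> delayed -> compute pattern used by
# block_candset above (hedged: wrap() in the real code simply collects the
# per-chunk results, so a plain list concatenation stands in for it, and the
# threaded scheduler is used so the snippet also runs interactively).
import numpy as np
import pandas as pd
from dask import delayed
from dask.diagnostics import ProgressBar

def process_chunk(chunk):
    # stand-in for _block_candset_split: keep every row of the chunk
    return [True] * len(chunk)

candset = pd.DataFrame({'_id': range(10)})
n_chunks = 4
c_splits = np.array_split(candset, n_chunks)

tasks = [delayed(process_chunk)(split) for split in c_splits]
combined = delayed(sum)(tasks, [])      # flatten the per-chunk boolean lists

with ProgressBar():
    valid = combined.compute(scheduler='threads')

out_table = candset[valid]              # rows that survived "blocking"
print(len(out_table))
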
def tuner_overlap_blocker(ltable,
                          rtable,
                          l_key,
                          r_key,
                          l_overlap_attr,
                          r_overlap_attr,
                          rem_stop_words,
                          q_val,
                          word_level,
                          overlap_size,
                          ob_obj,
                          n_bins=50,
                          sample_proportion=0.1,
                          seed=0,
                          repeat=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.
    
    Tunes the parameters for the Dask-based command that blocks two tables using
    the overlap blocker.

    Given the input tables and the parameters for the Dask-based overlap blocker
    command, this command returns a configuration consisting of whether the input
    tables need to be swapped, the number of left table chunks, and the number of
    right table chunks. It uses a "Staged Tuning" approach to select the
    configuration: the key idea is to select the value for one parameter at a time.

    Conceptually, this command performs the following steps. First, it samples the
    left and right tables using stratified sampling. Next, it uses the sampled
    tables to decide whether the tables need to be swapped (by running the blocker
    command on both orderings and comparing the runtimes). Next, it finds the
    number of right table partitions using the sampled tables (by trying a fixed
    set of partition counts and comparing the runtimes); the number of partitions
    is selected to be the value before which the runtime starts increasing. Then
    it finds the number of left table partitions in the same way, with the number
    of right table partitions fixed to the value found in the previous step.
    Finally, it returns the configuration back to the user as a triplet (x, y, z),
    where x indicates whether the tables need to be swapped, y indicates the number
    of left table partitions (if the tables need to be swapped, this is the number
    of left table partitions after swapping), and z indicates the number of right
    table partitions.
    
    Args:
        ltable (DataFrame): The left input table.

        rtable (DataFrame): The right input table.

        l_key (string): The key attribute in the left table.

        r_key (string): The key attribute in the right table.

        l_overlap_attr (string): The overlap attribute in left table.

        r_overlap_attr (string): The overlap attribute in right table.

        rem_stop_words (boolean): A flag to indicate whether stop words
             (e.g., a, an, the) should be removed from the token sets of the
             overlap attribute values.

        q_val (int): The value of q to use if the overlap attributes
             values are to be tokenized as qgrams.

        word_level (boolean): A flag to indicate whether the overlap
             attributes should be tokenized as words (i.e, using whitespace
             as delimiter).

        overlap_size (int): The minimum number of tokens that must overlap.

        ob_obj (OverlapBlocker): The object used to call commands to block two
            tables and a candidate set.

        n_bins (int): The number of bins to be used for stratified sampling
            (defaults to 50).

        sample_proportion (float): The proportion used to sample the tables.
            This value is expected to be greater than 0 and less than 1
            (defaults to 0.1).

        seed (int): The seed to be used for stratified sampling (defaults to 0).

        repeat (int): The number of times to execute the blocking command while
            selecting the values for the parameters (defaults to 1).
    
    Returns:
        A tuple containing 3 values. For example if the tuple is represented as (x, y, 
        z) then x indicates if the tables need to be swapped or not, y indicates the number of 
        left table partitions (if the tables need to be swapped, then this indicates the 
        number of left table partitions after swapping), and z indicates the number of 
        right table partitions. 
       
    Examples:
        >>> from py_entitymatching.tuner.tuner_overlap_blocker import tuner_overlap_blocker
        >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
        >>> obj = DaskOverlapBlocker()
        >>> (swap_or_not, n_ltable_chunks, n_sample_rtable_chunks) = tuner_overlap_blocker(ltable, rtable, 'id', 'id', "title", "title", rem_stop_words=True, q_val=None, word_level=True, overlap_size=1, ob_obj=obj)
        """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # Select the tokenizer
    if word_level:
        tokenizer = WhitespaceTokenizer()
    else:
        tokenizer = QgramTokenizer()

    # Sample the input tables, given in the original order
    sampled_tables_orig_order = get_sampled_tables(
        ltable, rtable, l_key, r_key, l_overlap_attr, r_overlap_attr,
        rem_stop_words, tokenizer, ob_obj, n_bins, sample_proportion, seed)

    # Sample the input tables, given in the swapped order
    sampled_tables_swap_order = get_sampled_tables(
        rtable, ltable, r_key, l_key, r_overlap_attr, l_overlap_attr,
        rem_stop_words, tokenizer, ob_obj, n_bins, sample_proportion, seed)

    #  Select if the tables need to be swapped
    swap_config = should_swap(ob_obj, sampled_tables_orig_order,
                              sampled_tables_swap_order, l_overlap_attr,
                              r_overlap_attr, rem_stop_words, q_val,
                              word_level, overlap_size, repeat)
    # Use the sampled tables
    s_ltable, s_rtable = sampled_tables_orig_order
    if swap_config:
        s_ltable, s_rtable = sampled_tables_swap_order

    # Find the number of right table partitions
    n_rtable_chunks = find_rtable_chunks(ob_obj, s_ltable, s_rtable,
                                         l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size)

    # Find the number of left table partitions
    n_ltable_chunks = find_ltable_chunks(ob_obj, s_ltable, s_rtable,
                                         l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size, n_rtable_chunks)

    # Return the configuration
    return (swap_config, n_ltable_chunks, n_rtable_chunks)
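
# A hedged sketch of consuming the tuner's output, continuing the docstring
# example above (assumes ltable, rtable and obj as defined there, and that
# the Dask blocker's block_tables accepts n_ltable_chunks / n_rtable_chunks;
# the attribute names are illustrative).
swap_or_not, n_ltable_chunks, n_rtable_chunks = tuner_overlap_blocker(
    ltable, rtable, 'id', 'id', 'title', 'title',
    rem_stop_words=True, q_val=None, word_level=True,
    overlap_size=1, ob_obj=obj)

if swap_or_not:
    # the tuner found the swapped order to be faster
    ltable, rtable = rtable, ltable

C = obj.block_tables(ltable, rtable, 'title', 'title',
                     rem_stop_words=True, overlap_size=1,
                     n_ltable_chunks=n_ltable_chunks,
                     n_rtable_chunks=n_rtable_chunks)
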
Example #28
    def _apply_filterable_rule(self, rule_name, ltable, rtable, l_key, r_key):
        candset = None
        conjunct_list = self.rule_str[rule_name]
        for conjunct in conjunct_list:
            is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, \
            th = self._parse_conjunct(
                conjunct, rule_name)

            if l_tok == 'dlm_dc0':
                tokenizer = WhitespaceTokenizer(return_set=True)
            elif l_tok == 'qgm_3':
                tokenizer = QgramTokenizer(qval=3, return_set=True)

            if sim_fn == 'jaccard':
                join_fn = ssj.jaccard_join
            elif sim_fn == 'cosine':
                join_fn = ssj.cosine_join
            elif sim_fn == 'dice':
                join_fn = ssj.dice_join
            elif sim_fn == 'overlap_coeff':
                join_fn = ssj.overlap_coefficient_join
            elif sim_fn == 'lev_dist':
                join_fn = ssj.edit_distance_join

            if join_fn == ssj.edit_distance_join:
                comp_op = '<='
                if op == '>=':
                    comp_op = '<'
            else:
                comp_op = '>='
                if op == '<=':
                    comp_op = '>'

            ssj.dataframe_column_to_str(ltable, l_attr, inplace=True)
            ssj.dataframe_column_to_str(rtable, r_attr, inplace=True)

            if join_fn == ssj.edit_distance_join:
                tokenizer = QgramTokenizer(qval=2, return_set=False)
                c_df = join_fn(
                    ltable,
                    rtable,
                    l_key,
                    r_key,
                    l_attr,
                    r_attr,
                    float(th),
                    comp_op,
                    allow_missing=True,
                    # need to revisit allow_missing
                    out_sim_score=False,
                    l_out_prefix='l_',
                    r_out_prefix='r_',
                    show_progress=False,
                    tokenizer=tokenizer)
            else:
                c_df = join_fn(ltable,
                               rtable,
                               l_key,
                               r_key,
                               l_attr,
                               r_attr,
                               tokenizer,
                               float(th),
                               comp_op,
                               allow_empty=True,
                               allow_missing=True,
                               l_out_prefix='l_',
                               r_out_prefix='r_',
                               out_sim_score=False)
                #c_df.drop('_id', axis=1)
            if candset is not None:
                # union the candset of this conjunct with the existing candset
                candset = pd.concat([candset, c_df]).drop_duplicates(
                    ['l_' + l_key,
                     'r_' + r_key]).reset_index(drop=True)
            else:
                # candset from the first conjunct of the rule
                candset = c_df
        return candset
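
# A minimal sketch of the operator flipping performed above (hedged: this
# reads a filterable conjunct such as "jaccard(name, name) <= 0.4" as a
# predicate that discards a pair, so the surviving pairs are obtained by
# running the join with the complementary comparison operator).
import pandas as pd
import py_stringsimjoin as ssj
from py_stringmatching import WhitespaceTokenizer

def complementary_comp_op(op, is_edit_distance):
    # Edit distance: smaller means more similar, so survivors of a ">= th"
    # conjunct are found with "< th"; set-based measures flip the other way.
    if is_edit_distance:
        return '<' if op == '>=' else '<='
    return '>' if op == '<=' else '>='

ltable = pd.DataFrame({'id': [1], 'name': ['hello world program']})
rtable = pd.DataFrame({'id': [1], 'name': ['hello world']})
tok = WhitespaceTokenizer(return_set=True)

# conjunct "jaccard(name, name) <= 0.4"  ->  keep pairs with jaccard > 0.4
c_df = ssj.jaccard_join(ltable, rtable, 'id', 'id', 'name', 'name', tok, 0.4,
                        comp_op=complementary_comp_op('<=', False),
                        l_out_prefix='l_', r_out_prefix='r_',
                        out_sim_score=False, show_progress=False)
print(c_df)   # the single pair survives (jaccard = 2/3 > 0.4)
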
Example #29
    def block_tables(self,
                     ltable,
                     rtable,
                     l_overlap_attr,
                     r_overlap_attr,
                     rem_stop_words=False,
                     q_val=None,
                     word_level=True,
                     overlap_size=1,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='ltable_',
                     r_output_prefix='rtable_',
                     allow_missing=False,
                     verbose=False,
                     show_progress=True,
                     n_jobs=1):
        """
        Blocks two tables based on the overlap of token sets of attribute
         values.

        Finds tuple pairs from left and right tables such that the overlap
        between (a) the set of tokens obtained by tokenizing the value of
        attribute l_overlap_attr of a tuple from the left table, and (b) the
        set of tokens obtained by tokenizing the value of attribute
        r_overlap_attr of a tuple from the right table, is above a certain
        threshold.

        Args:
            ltable (DataFrame): The left input table.

            rtable (DataFrame): The right input table.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
             (e.g., a, an, the) should be removed from the token sets of the
             overlap attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes
             values are to be tokenized as qgrams (defaults to None).

            word_level (boolean): A flag to indicate whether the overlap
             attributes should be tokenized as words (i.e, using whitespace
             as delimiter) (defaults to True).

            overlap_size (int): The minimum number of tokens that must
             overlap (defaults to 1).
            l_output_attrs (list): A list of attribute names from the left
                table to be included in the output candidate set (defaults
                to None).
            r_output_attrs (list): A list of attribute names from the right
                table to be included in the output candidate set  (defaults
                to None).

            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.

            verbose (boolean): A flag to indicate whether the debug
                information should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                be displayed to the user (defaults to True).

            n_jobs (int): The number of parallel jobs to be used for computation
                (defaults to 1). If -1 all CPUs are used. If 0 or 1,
                no parallel computation is used at all, which is useful for
                debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
                used (where n_cpus is the total number of CPUs in the
                machine). Thus, for n_jobs = -2, all CPUs but one are used.
                If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
                computation is used (i.e., equivalent to the default).


        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).
        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.

            AssertionError: If `rtable` is not of type pandas
                DataFrame.

            AssertionError: If `l_overlap_attr` is not of type string.

            AssertionError: If `r_overlap_attr` is not of type string.

            AssertionError: If `l_output_attrs` is not of type of
             list.

            AssertionError: If `r_output_attrs` is not of type of
             list.

            AssertionError: If the values in `l_output_attrs` are not of type
             string.

            AssertionError: If the values in `r_output_attrs` are not of type
             string.

            AssertionError: If `l_output_prefix` is not of type
             string.

            AssertionError: If `r_output_prefix` is not of type
             string.

            AssertionError: If `q_val` is not of type int.

            AssertionError: If `word_level` is not of type boolean.

            AssertionError: If `overlap_size` is not of type int.

            AssertionError: If `verbose` is not of type
             boolean.

            AssertionError: If `allow_missing` is not of type boolean.

            AssertionError: If `show_progress` is not of type
             boolean.

            AssertionError: If `n_jobs` is not of type
             int.

            AssertionError: If `l_overlap_attr` is not in the ltable
             columns.

            AssertionError: If `r_overlap_attr` is not in the rtable columns.

            AssertionError: If `l_output_attrs` are not in the ltable.

            AssertionError: If `r_output_attrs` are not in the rtable.

            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.

            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.

        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            # Use word-level tokenizer
            >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1)
            # Use q-gram tokenizer
            >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2)
            # Include all possible missing values
            >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
            # Use all the cores in the machine
            >>> C4 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], n_jobs=-1)


        """

        # validate data types of standard input parameters
        self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                          r_output_attrs, l_output_prefix,
                                          r_output_prefix, verbose, n_jobs)

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size)

        # validate data type of allow_missing
        self.validate_allow_missing(allow_missing)

        # validate data type of show_progress
        self.validate_show_progress(show_progress)

        # validate overlap attributes
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate output attributes
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # do blocking

        # # do projection before merge
        l_proj_attrs = self.get_attrs_to_project(l_key, l_overlap_attr,
                                                 l_output_attrs)
        l_df = ltable[l_proj_attrs]
        r_proj_attrs = self.get_attrs_to_project(r_key, r_overlap_attr,
                                                 r_output_attrs)
        r_df = rtable[r_proj_attrs]

        # # cast the column to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        # # cleanup the tables from non-ascii characters, punctuations, and stop words
        l_dummy_overlap_attr = '@#__xx__overlap_ltable__#@'
        r_dummy_overlap_attr = '@#__xx__overlap_rtable__#@'
        l_df[l_dummy_overlap_attr] = l_df[l_overlap_attr]
        r_df[r_dummy_overlap_attr] = r_df[r_overlap_attr]

        if not l_df.empty:
            self.cleanup_table(l_df, l_dummy_overlap_attr, rem_stop_words)
        if not r_df.empty:
            self.cleanup_table(r_df, r_dummy_overlap_attr, rem_stop_words)

        # # determine which tokenizer to use
        if word_level == True:
            # # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # # create a qgram tokenizer
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # perform overlap similarity join
        candset = overlap_join(l_df, r_df, l_key, r_key, l_dummy_overlap_attr,
                               r_dummy_overlap_attr, tokenizer, overlap_size,
                               '>=', allow_missing, l_output_attrs,
                               r_output_attrs, l_output_prefix,
                               r_output_prefix, False, n_jobs, show_progress)

        # # retain only the required attributes in the output candidate set
        retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs,
                                               r_output_attrs, l_output_prefix,
                                               r_output_prefix)
        candset = candset[retain_cols]

        # update metadata in the catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return the candidate set
        return candset
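
# The blocker above ultimately delegates to py_stringsimjoin's overlap_join;
# a minimal standalone sketch of that core call on toy data (hedged: the
# keyword names follow the positional arguments used in the call above).
import pandas as pd
from py_stringsimjoin import overlap_join
from py_stringmatching import WhitespaceTokenizer

A = pd.DataFrame({'ID': [1, 2], 'address': ['12 elm street', '47 oak avenue']})
B = pd.DataFrame({'ID': [1, 2], 'address': ['elm street 12', 'pine road 9']})

ws_tok = WhitespaceTokenizer(return_set=True)
C = overlap_join(A, B, 'ID', 'ID', 'address', 'address', ws_tok, 1,
                 comp_op='>=',
                 l_out_attrs=['address'], r_out_attrs=['address'],
                 l_out_prefix='ltable_', r_out_prefix='rtable_',
                 out_sim_score=False, show_progress=False)
print(C)   # one surviving pair: the two 'elm street' addresses share tokens
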
Example #30
    def block_candset(self,
                      candset,
                      l_overlap_attr,
                      r_overlap_attr,
                      rem_stop_words=False,
                      q_val=None,
                      word_level=True,
                      overlap_size=1,
                      allow_missing=False,
                      verbose=False,
                      show_progress=True,
                      n_jobs=1):
        """Blocks an input candidate set of tuple pairs based on the overlap
           of token sets of attribute values.

        Finds tuple pairs from an input candidate set of tuple pairs such that
        the overlap between (a) the set of tokens obtained by tokenizing the
        value of attribute l_overlap_attr of the left tuple in a tuple pair,
        and (b) the set of tokens obtained by tokenizing the value of
        attribute r_overlap_attr of the right tuple in the tuple pair,
        is above a certain threshold.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).
 
            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.

            verbose (boolean): A flag to indicate whether the debug information
                should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_jobs (int): The number of parallel jobs to be used for computation
                (defaults to 1). If -1 all CPUs are used. If 0 or 1,
                no parallel computation is used at all, which is useful for
                debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
                used (where n_cpus is the total number of CPUs in the
                machine). Thus, for n_jobs = -2, all CPUs but one are used.
                If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
                computation is used (i.e., equivalent to the default).

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_overlap_attr` is not of type string.
            AssertionError: If `r_overlap_attr` is not of type string.
            AssertionError: If `q_val` is not of type int.
            AssertionError: If `word_level` is not of type boolean.
            AssertionError: If `overlap_size` is not of type int.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_jobs` is not of type
                int.
            AssertionError: If `l_overlap_attr` is not in the ltable
                columns.
            AssertionError: If `r_overlap_attr` is not in the rtable columns.
            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.
            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.
        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])

            >>> D1 = ob.block_candset(C, 'name', 'name')
            # Include all possible tuple pairs with missing values
            >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
            # Use q-gram tokenizer
            >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)


        """

        # validate data types of standard input parameters
        self.validate_types_params_candset(candset, verbose, show_progress,
                                           n_jobs)

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size)

        # get and validate metadata
        log_info(
            logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
            'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key, logger,
                                          verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # do blocking

        # # do projection before merge
        l_df = ltable[[l_key, l_overlap_attr]]
        r_df = rtable[[r_key, r_overlap_attr]]

        # # cast the overlap attribute to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        # # cleanup the tables from non-ascii characters, punctuations, and stop words
        self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
        self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

        # # determine which tokenizer to use
        if word_level == True:
            # # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # # create a qgram tokenizer
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # create a filter for overlap similarity join
        overlap_filter = OverlapFilter(tokenizer,
                                       overlap_size,
                                       allow_missing=allow_missing)

        # # perform overlap similarity filtering of the candset
        out_table = overlap_filter.filter_candset(candset,
                                                  fk_ltable,
                                                  fk_rtable,
                                                  l_df,
                                                  r_df,
                                                  l_key,
                                                  r_key,
                                                  l_overlap_attr,
                                                  r_overlap_attr,
                                                  n_jobs,
                                                  show_progress=show_progress)
        # update catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return candidate set
        return out_table
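
# A hedged standalone sketch of the OverlapFilter.filter_candset call that
# block_candset delegates to above (toy data; the positional order mirrors
# the call above, and the candidate-set column names are illustrative).
import pandas as pd
from py_stringsimjoin.filter.overlap_filter import OverlapFilter
from py_stringmatching import WhitespaceTokenizer

A = pd.DataFrame({'ID': [1, 2], 'name': ['john smith', 'jane doe']})
B = pd.DataFrame({'ID': [1, 2], 'name': ['j smith', 'doe jane']})

# a candidate set as produced by an earlier blocking step
C = pd.DataFrame({'_id': [0, 1, 2],
                  'ltable_ID': [1, 1, 2],
                  'rtable_ID': [1, 2, 2]})

ws_tok = WhitespaceTokenizer(return_set=True)
overlap_filter = OverlapFilter(ws_tok, 1)
D = overlap_filter.filter_candset(C, 'ltable_ID', 'rtable_ID',
                                  A, B, 'ID', 'ID', 'name', 'name',
                                  show_progress=False)
print(D)   # pairs whose 'name' values share at least one token survive
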
Example #31
def test_set_sim_join():
    # data to be tested.
    test_scenario_1 = [(os.sep.join(['data',
                                     'table_A.csv']), 'A.ID', 'A.name'),
                       (os.sep.join(['data',
                                     'table_B.csv']), 'B.ID', 'B.name')]
    data = {'TEST_SCENARIO_1': test_scenario_1}

    # similarity measures to be tested.
    sim_measure_types = ['COSINE', 'DICE', 'JACCARD', 'OVERLAP_COEFFICIENT']

    # similarity thresholds to be tested.
    thresholds = {
        'JACCARD': [0.3, 0.5, 0.7, 0.85, 1],
        'COSINE': [0.3, 0.5, 0.7, 0.85, 1],
        'DICE': [0.3, 0.5, 0.7, 0.85, 1],
        'OVERLAP_COEFFICIENT': [0.3, 0.5, 0.7, 0.85, 1]
    }

    # tokenizers to be tested.
    tokenizers = {
        'SPACE_DELIMITER': DelimiterTokenizer(delim_set=[' '],
                                              return_set=True),
        '2_GRAM': QgramTokenizer(qval=2, return_set=True),
        '3_GRAM': QgramTokenizer(qval=3, return_set=True)
    }

    # Test each combination of similarity measure, threshold and tokenizer
    # for different test scenarios.
    for label, scenario in iteritems(data):
        for sim_measure_type in sim_measure_types:
            for threshold in thresholds.get(sim_measure_type):
                for tok_type, tok in iteritems(tokenizers):
                    test_function = partial(test_valid_join, scenario,
                                            sim_measure_type, (tok, threshold))
                    test_function.description = 'Test ' + sim_measure_type + \
                        ' with ' + str(threshold) + ' threshold and ' + \
                        tok_type + ' tokenizer for ' + label + '.'
                    yield test_function,

    # Test each similarity measure with different comparison operators.
    for sim_measure_type in sim_measure_types:
        for comp_op in ['>', '=']:
            test_function = partial(
                test_valid_join, test_scenario_1, sim_measure_type,
                (tokenizers['SPACE_DELIMITER'], 0.3, comp_op, False))
            test_function.description = 'Test ' + sim_measure_type + \
                                        ' with comp_op ' + comp_op + '.'
            yield test_function,

    # Test each similarity measure with allow_missing set to True.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.7, '>=', False, True))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with allow_missing set to True.'
        yield test_function,

    # Test each similarity measure with output attributes added.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.3, '>=', False, False, [
                'A.ID', 'A.birth_year', 'A.zipcode'
            ], ['B.ID', 'B.name', 'B.zipcode']))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with output attributes.'
        yield test_function,

    # Test each similarity measure with a different output prefix.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.7, '>=', False, False, [
                'A.birth_year', 'A.zipcode'
            ], ['B.name', 'B.zipcode'], 'ltable.', 'rtable.'))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with output attributes and prefix.'
        yield test_function,

    # Test each similarity measure with output_sim_score disabled.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.7, '>=', False, False, [
                'A.birth_year', 'A.zipcode'
            ], ['B.name', 'B.zipcode'], 'ltable.', 'rtable.', False))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with sim_score disabled.'
        yield test_function,

    # Test each similarity measure with n_jobs above 1.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.3, '>=', False, False, [
                'A.birth_year', 'A.zipcode'
            ], ['B.name', 'B.zipcode'], 'ltable.', 'rtable.', False, 2))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with n_jobs above 1.'
        yield test_function,

    # scenario where join attributes are of type int
    test_scenario_2 = [(os.sep.join(['data',
                                     'table_A.csv']), 'A.ID', 'A.zipcode'),
                       (os.sep.join(['data',
                                     'table_B.csv']), 'B.ID', 'B.zipcode')]

    # Test each similarity measure with join attribute of type int.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_2,
                                sim_measure_type, (tokenizers['2_GRAM'], 0.3),
                                True)
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with join attribute of type int.'
        yield test_function,

    # scenario where join attributes are of type float
    test_scenario_3 = [(os.sep.join(['data',
                                     'table_A.csv']), 'A.ID', 'A.hourly_wage'),
                       (os.sep.join(['data',
                                     'table_B.csv']), 'B.ID', 'B.hourly_wage')]

    # Test each similarity measure with join attribute of type float.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_3,
                                sim_measure_type, (tokenizers['2_GRAM'], 0.3),
                                True)
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with join attribute of type float.'
        yield test_function,

    # Test each similarity measure with a tokenizer with return_set flag set to False.
    for sim_measure_type in sim_measure_types:
        tok = QgramTokenizer(2)
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type, (tok, 0.3))
        test_function.description = 'Test ' + sim_measure_type + \
                    ' with a tokenizer with return_set flag set to False.'
        yield test_function,

    # Test each similarity measure with allow_empty set to True.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.7, '>=', True))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with allow_empty set to True.'
        yield test_function,

    # Test each similarity measure with allow_empty set to True and with output attributes.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type,
                                (tokenizers['SPACE_DELIMITER'], 0.7, '>=',
                                 True, False, ['A.name'], ['B.name']))
        test_function.description = 'Test ' + sim_measure_type + \
                    ' with allow_empty set to True and with output attributes.'
        yield test_function,
Example #32
    def block_tuples(self,
                     ltuple,
                     rtuple,
                     l_overlap_attr,
                     r_overlap_attr,
                     rem_stop_words=False,
                     q_val=None,
                     word_level=True,
                     overlap_size=1,
                     allow_missing=False):
        """Blocks a tuple pair based on the overlap of token sets of attribute
           values.
        
        Args:
            ltuple (Series): The input left tuple.

            rtuple (Series): The input right tuple.
            
            l_overlap_attr (string): The overlap attribute in left tuple.

            r_overlap_attr (string): The overlap attribute in right tuple.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): A value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).
 
            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether a tuple pair
                                     with missing value in at least one of the
                                     blocking attributes should be blocked
                                     (defaults to False). If this flag is set
                                     to True, the pair will be kept if either
                                     ltuple has missing value in l_block_attr
                                     or rtuple has missing value in r_block_attr
                                     or both.

        Returns:
            A status indicating if the tuple pair is blocked (boolean).

        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            >>> status = ob.block_tuples(A.loc[0], B.loc[0], 'address', 'address')

        """

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # determine which tokenizer to use
        if word_level == True:
            # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # create a qgram tokenizer
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # clean up the attribute values by removing non-ASCII characters,
        # # punctuation, and (optionally) stop words
        l_val = self.cleanup_tuple_val(ltuple[l_overlap_attr], rem_stop_words)
        r_val = self.cleanup_tuple_val(rtuple[r_overlap_attr], rem_stop_words)

        # create a filter for overlap similarity
        overlap_filter = OverlapFilter(tokenizer,
                                       overlap_size,
                                       allow_missing=allow_missing)

        return overlap_filter.filter_pair(l_val, r_val)
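A minimal usage sketch for block_tuples, assuming py_entitymatching is installed; the 'ID' and 'address' attributes and their values are made up for illustration. It shows the same tuple pair judged once on word-level token overlap and once on character 2-gram overlap; a return value of True means the pair is blocked (dropped).

import pandas as pd
import py_entitymatching as em

# hypothetical tuples; attribute names and values are assumptions
ltuple = pd.Series({'ID': 1, 'address': '108 Lake Street'})
rtuple = pd.Series({'ID': 7, 'address': '108 Lake St'})

ob = em.OverlapBlocker()

# word-level tokens: blocked (True) only if fewer than overlap_size
# whitespace-delimited tokens are shared between the two attribute values
blocked_by_words = ob.block_tuples(ltuple, rtuple, 'address', 'address',
                                   overlap_size=2)

# character 2-grams: the same pair judged on qgram overlap instead
blocked_by_qgrams = ob.block_tuples(ltuple, rtuple, 'address', 'address',
                                    word_level=False, q_val=2, overlap_size=3)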
    def test_apply_matcher_with_allow_missing(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
                tok.tokenize(str(row[self.l_join_attr])),
                tok.tokenize(str(row[self.r_join_attr]))),
            axis=1)

        # compute expected output pairs
        comp_fn = COMP_OP_MAP[comp_op]
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                             str(row[self.r_key_attr]))))

        # find pairs that need to be included in output due to
        # the presence of missing value in one of the join attributes.
        missing_pairs = set()
        for l_idx, l_row in self.orig_ltable.iterrows():
            for r_idx, r_row in self.orig_rtable.iterrows():
                if (pd.isnull(l_row[self.l_join_attr]) or
                    pd.isnull(r_row[self.r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                                str(r_row[self.r_key_attr]))))

        # add the pairs containing missing value to the set of expected pairs.
        expected_pairs = expected_pairs.union(missing_pairs)

        # use overlap filter to obtain a candset with allow_missing set to True. 
        overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
        candset = overlap_filter.filter_tables(self.orig_ltable, self.orig_rtable,
                              self.l_key_attr, self.r_key_attr,
                              self.l_join_attr, self.r_join_attr)

        # apply a jaccard matcher to the candset with allow_missing set to True.
        output_candset = apply_matcher(candset,
            DEFAULT_L_OUT_PREFIX+self.l_key_attr, DEFAULT_R_OUT_PREFIX+self.r_key_attr,
            self.orig_ltable, self.orig_rtable, self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr, tok, sim_func, threshold,
            comp_op, True, out_sim_score=True)

        expected_output_attrs=['_id',
                               DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                               DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                               '_sim_score']

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join((str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                                       str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
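A compact end-to-end sketch of the pipeline exercised by this test, run on two tiny in-memory tables. The table contents, the jaccard helper, and the import paths (assumed to follow the layout these tests use) are illustrative assumptions. A missing name on the right side is carried through both the overlap filter and apply_matcher because allow_missing is True in both steps.

import pandas as pd
from py_stringmatching import QgramTokenizer
from py_stringsimjoin.filter.overlap_filter import OverlapFilter
from py_stringsimjoin.matcher.apply_matcher import apply_matcher

def jaccard(l_tokens, r_tokens):
    # plain Jaccard over token sets; stands in for get_sim_function('JACCARD')
    l, r = set(l_tokens), set(r_tokens)
    return len(l & r) / float(len(l | r)) if (l or r) else 0.0

# hypothetical input tables; B has a missing join attribute in its second row
A = pd.DataFrame({'id': [1, 2], 'name': ['data science', 'machine learning']})
B = pd.DataFrame({'id': [1, 2], 'name': ['data sciences', None]})

tok = QgramTokenizer(qval=2, return_set=True)

# keep pairs sharing at least one 2-gram, plus pairs with a missing name
overlap_filter = OverlapFilter(tok, 1, '>=', allow_missing=True)
candset = overlap_filter.filter_tables(A, B, 'id', 'id', 'name', 'name')

# re-score surviving pairs with Jaccard at threshold 0.3; allow_missing=True
# again keeps the pairs that could not be scored
output = apply_matcher(candset, 'l_id', 'r_id', A, B, 'id', 'id',
                       'name', 'name', tok, jaccard, 0.3,
                       '>=', True, out_sim_score=True)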