    def test_filter_pair(self, lstring, rstring, tokenizer, sim_measure_type,
                         threshold, allow_empty, allow_missing,
                         expected_output):
        position_filter = PositionFilter(tokenizer, sim_measure_type,
                                         threshold, allow_empty, allow_missing)
        actual_output = position_filter.filter_pair(lstring, rstring)
        assert_equal(actual_output, expected_output)
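
A minimal sketch of exercising filter_pair outside the parameterized test above. The import paths, the QgramTokenizer settings, and the 'JACCARD' measure string are assumptions based on the py_stringsimjoin/py_stringmatching APIs these snippets appear to target.

from py_stringmatching import QgramTokenizer                        # assumed import
from py_stringsimjoin.filter.position_filter import PositionFilter  # assumed path

tok = QgramTokenizer(qval=2, return_set=True)  # position filter needs set-valued tokens
position_filter = PositionFilter(tok, 'JACCARD', 0.8,
                                 allow_empty=True, allow_missing=False)
# filter_pair returns a boolean flag; in py_stringsimjoin a True result means
# the pair is dropped (pruned) by the filter.
print(position_filter.filter_pair('data analysis', 'data analyses'))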
Example #2
class JaccardTestCase(unittest.TestCase):
    def setUp(self):
        self.threshold = 0.3
        self.matches_using_cart_prod = sim_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
            r_attr, get_jaccard_fn(), self.threshold, ['id'], ['id'])
        self.size_filter = SizeFilter(table_A, tokenized_table_A, l_attr, tok)
        self.size_filter.build_index()
        self.prefix_filter = PrefixFilter(table_A, tokenized_table_A, l_attr,
                                          tok, self.threshold, token_ordering)
        self.prefix_filter.build_index()
        self.position_filter = PositionFilter(table_A, tokenized_table_A,
                                              l_attr, tok, self.threshold,
                                              token_ordering)
        self.position_filter.build_index()
        self.suffix_filter = SuffixFilter(table_A, tokenized_table_A, l_attr,
                                          tok, self.threshold, token_ordering)

    def test_jaccard_match(self):
        # test jaccard with position filter, size filter, suffix filter
        matches = jaccard_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
            r_attr, self.threshold,
            [self.position_filter, self.size_filter, self.suffix_filter],
            ['id'], ['id'])
        self.assertTrue(compare_matches(self.matches_using_cart_prod, matches))

        # test jaccard with prefix filter, size filter, suffix filter
        matches = jaccard_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
            r_attr, self.threshold,
            [self.prefix_filter, self.size_filter, self.suffix_filter], ['id'],
            ['id'])
        self.assertTrue(compare_matches(self.matches_using_cart_prod, matches))
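
get_jaccard_fn() above presumably returns a plain Jaccard similarity over token collections; a minimal sketch of such a function:

def jaccard(l_tokens, r_tokens):
    # Jaccard similarity: |intersection| / |union| of the two token sets,
    # taken as 0.0 when both sets are empty.
    l_set, r_set = set(l_tokens), set(r_tokens)
    if not l_set and not r_set:
        return 0.0
    return len(l_set & r_set) / len(l_set | r_set)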
Example #3
def jaccard_join_auto(ltable,
                      rtable,
                      l_id_attr,
                      l_join_attr,
                      r_id_attr,
                      r_join_attr,
                      threshold,
                      ltable_output_attrs=None,
                      rtable_output_attrs=None):
    matches_list = []
    sim_function = get_jaccard_fn()
    token_ordering = gen_token_ordering(ltable, l_join_attr)
    position_filter = PositionFilter(ltable,
                                     l_id_attr,
                                     l_join_attr,
                                     threshold,
                                     token_ordering,
                                     adaptive_prefix=True)
    position_filter.build_index()

    prog_bar = pyprind.ProgBar(len(rtable.index))

    l_row_dict = {}
    for idx, l_row in ltable.iterrows():
        l_id = l_row[l_id_attr]
        l_row_dict[l_id] = l_row

    r_row_dict = {}
    for idx, r_row in rtable.iterrows():
        r_id = r_row[r_id_attr]
        r_row_dict[r_id] = r_row

    for r_id in r_row_dict.keys():
        r_row = r_row_dict[r_id]
        r_tokens = order_using_token_ordering(list(r_row[r_join_attr]),
                                              token_ordering)
        r_num_tokens = len(r_tokens)

        l_cand_ids = position_filter.find_candidates(r_tokens, r_num_tokens,
                                                     threshold)
        for l_id in l_cand_ids:
            l_row = l_row_dict[l_id]
            if sim_function(l_row[l_join_attr],
                            r_row[r_join_attr]) >= threshold:
                match_dict = get_output_attributes(l_row, r_row, l_id_attr,
                                                   l_id, r_id_attr, r_id,
                                                   ltable_output_attrs,
                                                   rtable_output_attrs)
                matches_list.append(match_dict)
            #  matches_list.append(str(l_id)+','+str(r_id))
        prog_bar.update()

    output_matches = pd.DataFrame(matches_list)
    return output_matches
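
A minimal invocation sketch, assuming jaccard_join_auto and its helpers are importable from the module above and that the join attribute already holds token collections (the function iterates r_row[r_join_attr] as tokens). Table and column names are illustrative only.

import pandas as pd

ltable = pd.DataFrame({'id': [1, 2],
                       'name': [['data', 'analysis'], ['string', 'match']]})
rtable = pd.DataFrame({'id': [10, 11],
                       'name': [['data', 'analysis'], ['str', 'match']]})

matches = jaccard_join_auto(ltable, rtable, 'id', 'name', 'id', 'name', 0.5,
                            ltable_output_attrs=['name'],
                            rtable_output_attrs=['name'])
print(matches)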
Example #4
    def setUp(self):
        self.threshold = 0.3
        self.matches_using_cart_prod = sim_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
            r_attr, get_jaccard_fn(), self.threshold, ['id'], ['id'])
        self.size_filter = SizeFilter(table_A, tokenized_table_A, l_attr, tok)
        self.size_filter.build_index()
        self.prefix_filter = PrefixFilter(table_A, tokenized_table_A, l_attr,
                                          tok, self.threshold, token_ordering)
        self.prefix_filter.build_index()
        self.position_filter = PositionFilter(table_A, tokenized_table_A,
                                              l_attr, tok, self.threshold,
                                              token_ordering)
        self.position_filter.build_index()
        self.suffix_filter = SuffixFilter(table_A, tokenized_table_A, l_attr,
                                          tok, self.threshold, token_ordering)
Example #5
    def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                           allow_empty, allow_missing, args, expected_pairs):
        position_filter = PositionFilter(tokenizer, sim_measure_type,
                                         threshold, allow_empty, allow_missing)
        actual_candset = position_filter.filter_tables(*args)

        expected_output_attrs = ['_id']
        l_out_prefix = self.default_l_out_prefix
        r_out_prefix = self.default_r_out_prefix

        # Check for l_out_prefix in args.
        if len(args) > 8:
            l_out_prefix = args[8]
        expected_output_attrs.append(l_out_prefix + args[2])

        # Check for r_out_prefix in args.
        if len(args) > 9:
            r_out_prefix = args[9]
        expected_output_attrs.append(r_out_prefix + args[3])

        # Check for l_out_attrs in args.
        if len(args) > 6:
            if args[6]:
                l_out_attrs = remove_redundant_attrs(args[6], args[2])
                for attr in l_out_attrs:
                    expected_output_attrs.append(l_out_prefix + attr)

        # Check for r_out_attrs in args.
        if len(args) > 7:
            if args[7]:
                r_out_attrs = remove_redundant_attrs(args[7], args[3])
                for attr in r_out_attrs:
                    expected_output_attrs.append(r_out_prefix + attr)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_candset.columns.values),
                          expected_output_attrs)

        actual_pairs = set()
        for idx, row in actual_candset.iterrows():
            actual_pairs.add(','.join((str(row[l_out_prefix + args[2]]),
                                       str(row[r_out_prefix + args[3]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
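
For reference, a hypothetical layout of the positional args consumed by filter_tables(*args), as implied by the index checks above; positions 4 and 5 are assumed to be the filter attributes.

args = (A, B,                    # 0, 1: ltable, rtable
        'A.id', 'B.id',          # 2, 3: key attributes
        'A.attr', 'B.attr',      # 4, 5: filter attributes (assumed)
        ['A.attr'], ['B.attr'],  # 6, 7: optional output attributes
        'l_', 'r_')              # 8, 9: optional output prefixes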
Example #6
    def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                           allow_empty, allow_missing, args, expected_pairs):
        position_filter = PositionFilter(tokenizer, sim_measure_type, threshold,
                                         allow_empty, allow_missing)
        actual_candset = position_filter.filter_tables(*args)

        expected_output_attrs = ['_id']
        l_out_prefix = self.default_l_out_prefix
        r_out_prefix = self.default_r_out_prefix

        # Check for l_out_prefix in args.
        if len(args) > 8:
            l_out_prefix = args[8]
        expected_output_attrs.append(l_out_prefix + args[2])

        # Check for r_out_prefix in args.
        if len(args) > 9:
            r_out_prefix = args[9]
        expected_output_attrs.append(r_out_prefix + args[3])

        # Check for l_out_attrs in args.
        if len(args) > 6:
            if args[6]:
                l_out_attrs = remove_redundant_attrs(args[6], args[2])
                for attr in l_out_attrs:
                    expected_output_attrs.append(l_out_prefix + attr)

        # Check for r_out_attrs in args.
        if len(args) > 7:
            if args[7]:
                r_out_attrs = remove_redundant_attrs(args[7], args[3])
                for attr in r_out_attrs:
                    expected_output_attrs.append(r_out_prefix + attr)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_candset.columns.values),
                          expected_output_attrs)

        actual_pairs = set()
        for idx, row in actual_candset.iterrows():
            actual_pairs.add(','.join((str(row[l_out_prefix + args[2]]),
                                       str(row[r_out_prefix + args[3]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
Example #7
    def test_filter_candset(self, tokenizer, sim_measure_type, threshold,
                            allow_empty, allow_missing, args, expected_pairs):
        position_filter = PositionFilter(tokenizer, sim_measure_type,
                                         threshold, allow_empty, allow_missing)
        actual_output_candset = position_filter.filter_candset(*args)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_output_candset.columns.values),
                          list(args[0].columns.values))

        actual_pairs = set()
        for idx, row in actual_output_candset.iterrows():
            actual_pairs.add(','.join((str(row[args[1]]), str(row[args[2]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
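
The assertions above imply that args[0] is the candidate-set DataFrame and that args[1] and args[2] name its foreign-key columns into ltable and rtable. A hypothetical candidate set of that shape:

import pandas as pd

candset = pd.DataFrame({'_id': [0, 1, 2],
                        'l_id': [1, 1, 2],      # hypothetical fkey into ltable
                        'r_id': [10, 11, 11]})  # hypothetical fkey into rtable
# filter_candset(candset, 'l_id', 'r_id', ...) returns a subset of these rows
# with the same columns, which is exactly what the test verifies.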
Example #8
    def test_filter_candset(self, tokenizer, sim_measure_type, threshold,
                            allow_empty, allow_missing, args, expected_pairs):
        position_filter = PositionFilter(tokenizer, sim_measure_type, threshold,
                                         allow_empty, allow_missing)
        actual_output_candset = position_filter.filter_candset(*args)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_output_candset.columns.values),
                          list(args[0].columns.values))

        actual_pairs = set()
        for idx, row in actual_output_candset.iterrows():
            actual_pairs.add(','.join((str(row[args[1]]), str(row[args[2]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
Example #9
    def test_invalid_r_out_attr(self):
        position_filter = PositionFilter(self.tokenizer, self.sim_measure_type,
                                         self.threshold)
        position_filter.filter_tables(self.A, self.B, 'A.id', 'B.id', 'A.attr',
                                      'B.attr', ['A.attr'], ['B.invalid_attr'])
Example #10
    def test_numeric_r_filter_attr(self):
        position_filter = PositionFilter(self.tokenizer, self.sim_measure_type,
                                         self.threshold)
        position_filter.filter_tables(self.A, self.B, 'A.id', 'B.id', 'A.attr',
                                      'B.int_attr')
Example #11
    def test_invalid_rtable(self):
        position_filter = PositionFilter(self.tokenizer, self.sim_measure_type,
                                         self.threshold)
        position_filter.filter_tables(self.A, [], 'A.id', 'B.id', 'A.attr',
                                      'B.attr')
Example #12
    def test_invalid_threshold(self):
        position_filter = PositionFilter(self.tokenizer, self.sim_measure_type,
                                         1.2)
Example #13
    def setUp(self):
        self.position_filter = PositionFilter(A, A_tokenized, 'str', tok, 0.8,
                                              token_ordering)
        self.position_filter.build_index()
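
token_ordering and order_using_token_ordering follow the standard prefix-filter convention: tokens are ranked by ascending global frequency so that rare tokens sort first. A minimal sketch of how such helpers are typically implemented, not the library's exact code:

from collections import Counter

def gen_token_ordering_sketch(token_lists):
    # Count token occurrences across all records and rank tokens so that
    # rarer tokens come first (ties broken lexicographically).
    freq = Counter(token for tokens in token_lists for token in tokens)
    ranked = sorted(freq, key=lambda token: (freq[token], token))
    return {token: rank for rank, token in enumerate(ranked)}

def order_using_token_ordering_sketch(tokens, token_ordering):
    # Sort a record's tokens by their global rank; unseen tokens go last.
    return sorted(tokens, key=lambda t: token_ordering.get(t, len(token_ordering)))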
Example #14
def set_sim_join(ltable, rtable, l_columns, r_columns, l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr, tokenizer, sim_measure_type,
                 threshold, comp_op, allow_empty, l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix, out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable"""

    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable, rtable], [l_join_attr_index, r_join_attr_index], tokenizer,
        sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index, tokenizer,
                                   sim_measure_type, threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in
    # l_join_attr multiple times when we need to compute the similarity measure.
    # Further we cache the empty record ids to handle the allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))
    k = 0
    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining
        # the current rtable record with those records in ltable with empty set
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the position
        # index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable[l_id], r_row, l_key_attr_index,
                        r_key_attr_index, l_out_attrs_indices,
                        r_out_attrs_indices)
                else:
                    output_row = [
                        ltable[l_id][l_key_attr_index], r_row[r_key_attr_index]
                    ]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                k += 1
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable[cand], r_row, l_key_attr_index,
                            r_key_attr_index, l_out_attrs_indices,
                            r_out_attrs_indices)
                    else:
                        output_row = [
                            ltable[cand][l_key_attr_index],
                            r_row[r_key_attr_index]
                        ]

                    # if out_sim_score flag is set, append the similarity score
                    # to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()
    print('k : ', k)
    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
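
COMP_OP_MAP above presumably maps a comparison-operator string to a two-argument predicate, so that comp_fn(sim_score, threshold) evaluates the configured comparison. A minimal sketch using the operator module:

import operator

COMP_OP_MAP = {
    '>': operator.gt,
    '>=': operator.ge,
    '<': operator.lt,
    '<=': operator.le,
    '=': operator.eq,
    '!=': operator.ne,
}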
Example #15
def set_sim_join(ltable, rtable,
                 l_columns, r_columns,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op,
                 allow_empty,
                 l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable"""

    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable, rtable],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # We cache the tokens so that we need not tokenize each string in 
    # l_join_attr multiple times when we need to compute the similarity measure.
    # Further we cache the empty record ids to handle the allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))
    k = 0
    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
                tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining 
        # the current rtable record with those records in ltable with empty set 
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the position
        # index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.            
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                k += 1
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable[cand], r_row,
                                         l_key_attr_index, r_key_attr_index,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity score    
                    # to the output record.  
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()
    print('k : ', k)
    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
Example #16
    def test_invalid_r_out_attr(self):
        position_filter = PositionFilter(self.tokenizer, self.sim_measure_type,
                                         self.threshold)
        position_filter.filter_tables(self.A, self.B, 'A.id', 'B.id',
                                      'A.attr', 'B.attr',
                                      ['A.attr'], ['B.invalid_attr'])
Example #17
    def test_numeric_r_filter_attr(self):
        position_filter = PositionFilter(self.tokenizer, self.sim_measure_type,
                                         self.threshold)
        position_filter.filter_tables(self.A, self.B, 'A.id', 'B.id',
                                      'A.attr', 'B.int_attr')
Example #18
    def test_invalid_rtable(self):
        position_filter = PositionFilter(self.tokenizer, self.sim_measure_type,
                                         self.threshold)
        position_filter.filter_tables(self.A, [], 'A.id', 'B.id',
                                      'A.attr', 'B.attr')
Example #19
    def test_invalid_tokenizer_for_edit_distance(self):
        position_filter = PositionFilter(self.tokenizer, 'EDIT_DISTANCE', 2)
Example #20
    def test_invalid_sim_measure_type(self):
        position_filter = PositionFilter(self.tokenizer, 'INVALID_TYPE',
                                         self.threshold)
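
The test_invalid_* snippets above carry no assertions, which suggests the original suite marked them with an expected-exception decorator (for example nose's @raises) that was lost during extraction. A self-contained unittest equivalent; the import paths and the AssertionError type are assumptions:

import unittest

from py_stringmatching import QgramTokenizer                        # assumed import
from py_stringsimjoin.filter.position_filter import PositionFilter  # assumed path

class InvalidThresholdTest(unittest.TestCase):
    def test_invalid_threshold(self):
        # A similarity threshold above 1 should fail input validation
        # (assumed to surface as an AssertionError).
        with self.assertRaises(AssertionError):
            PositionFilter(QgramTokenizer(qval=2, return_set=True),
                           'JACCARD', 1.2)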
Example #21
def _set_sim_join_split(ltable, rtable,
                        l_key_attr, r_key_attr,
                        l_join_attr, r_join_attr,
                        tokenizer,
                        sim_measure_type,
                        threshold,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix,
                        out_sim_score):
    """Perform set similarity join for a split of ltable and rtable"""

    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(),
                          rtable_dict.values()],
                         [l_join_attr_index,
                          r_join_attr_index],
                         tokenizer, sim_measure_type)

    # build a dictionary of tokenized l_join_attr
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer, sim_measure_type),
            token_ordering)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            sim_measure_type,
                                            threshold, tokenizer)     

        candidate_overlap = find_candidates_position_filter(
                                r_ordered_tokens, r_num_tokens, r_prefix_length,
                                pos_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(
                                      l_num_tokens,
                                      sim_measure_type,
                                      threshold, tokenizer)
                if not suffix_filter._filter_suffix(
                           l_ordered_tokens[l_prefix_length:],
                           r_ordered_tokens[r_prefix_length:],
                           l_prefix_length,
                           r_prefix_length,
                           l_num_tokens, r_num_tokens):
                    sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                    if sim_score >= threshold:
                        if has_output_attributes:
                            output_row = get_output_row_from_tables(
                                             ltable_dict[cand], r_row,
                                             cand, r_id,
                                             l_out_attrs_indices,
                                             r_out_attrs_indices)
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
                        else:
                            output_row = [cand, r_id]
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
        prog_bar.update()

    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
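
get_prefix_length above computes the prefix-filter bound for the given similarity measure. For Jaccard at threshold t over a record with n tokens, the classic bound is n - ceil(t * n) + 1: if two sets share no token within their prefixes of that length (under the global token ordering), their Jaccard similarity cannot reach t. A sketch for the Jaccard case only:

import math

def jaccard_prefix_length(num_tokens, threshold):
    # Prefix-filter bound for Jaccard: n - ceil(t * n) + 1.
    if num_tokens == 0:
        return 0
    return num_tokens - int(math.ceil(threshold * num_tokens)) + 1

print(jaccard_prefix_length(5, 0.8))  # -> 2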
Example #22
class PositionFilterTestCase(unittest.TestCase):
    def setUp(self):
        self.position_filter = PositionFilter(A, A_tokenized, 'str', tok, 0.8,
                                              token_ordering)
        self.position_filter.build_index()

    def test_apply_filter(self):
        # position filter satisfies
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering(['fg', 'cd', 'aa', 'ef'],
                                              token_ordering)
        self.assertTrue(
            self.position_filter.apply_filter(l_tokens,
                                              r_tokens, len(l_tokens),
                                              len(r_tokens), 0.8))

        # position filter doesn't satisfy
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering(['fg'], token_ordering)
        self.assertFalse(
            self.position_filter.apply_filter(l_tokens,
                                              r_tokens, len(l_tokens),
                                              len(r_tokens), 0.8))

        # prefix filter satisfies but position filter doesn't satisfy
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering(['aa'], token_ordering)
        self.assertFalse(
            self.position_filter.apply_filter(l_tokens,
                                              r_tokens, len(l_tokens),
                                              len(r_tokens), 0.8))

        # test empty list of tokens
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering([], token_ordering)
        self.assertFalse(
            self.position_filter.apply_filter(l_tokens,
                                              r_tokens, len(l_tokens),
                                              len(r_tokens), 0.8))
        self.assertFalse(
            self.position_filter.apply_filter(r_tokens,
                                              l_tokens, len(r_tokens),
                                              len(l_tokens), 0.8))

    def test_find_candidates(self):
        # test default case (presence of candidates)
        tokens = order_using_token_ordering(['aa', 'ef', 'ab', 'cd'],
                                            token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set([0, 3]))

        # test empty set of candidates
        tokens = order_using_token_ordering(['op', 'lp', 'mp'], token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set())

        # prefix index returns 2 candidates where as position index prunes them
        tokens = order_using_token_ordering(['aa', 'ef', 'lp'], token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set())

        # test empty list of probe tokens
        tokens = order_using_token_ordering([], token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set())
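
The behavior tested above follows from the overlap lower bound that position filtering relies on: two sets x and y can reach Jaccard threshold t only if they share at least ceil(t / (1 + t) * (|x| + |y|)) tokens. A small check mirroring the apply_filter cases; Fraction avoids float rounding at the boundary:

import math
from fractions import Fraction

def jaccard_overlap_bound(l_size, r_size, threshold):
    # Minimum number of shared tokens needed for Jaccard(x, y) >= t.
    t = Fraction(str(threshold))
    return math.ceil(t / (1 + t) * (l_size + r_size))

# Sizes 5 and 4 at t = 0.8 need at least 4 shared tokens (achievable),
# while sizes 5 and 1 would need 3 -- more than the smaller set contains,
# so that pair is pruned.
assert jaccard_overlap_bound(5, 4, 0.8) == 4
assert jaccard_overlap_bound(5, 1, 0.8) == 3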
Example #23
    def test_filter_pair(self, lstring, rstring, tokenizer, sim_measure_type,
                         threshold, allow_empty, allow_missing, expected_output):
        position_filter = PositionFilter(tokenizer, sim_measure_type, threshold,
                                         allow_empty, allow_missing)
        actual_output = position_filter.filter_pair(lstring, rstring)
        assert_equal(actual_output, expected_output)