def setUp(self):
    """Build the small left/right test tables and default output prefixes."""
    self.dlm = create_delimiter_tokenizer(' ')
    # Same rows/columns as before, built column-wise for readability.
    self.A = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'attr': ['ab cd ef aa bb', '', 'ab', 'll oo he', 'xy xx zz fg'],
    })
    self.B = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'attr': ['zz fg xx', 'he ll', 'xz pl ou', 'aa', 'fg cd aa ef ab'],
    })
    self.empty_table = pd.DataFrame(columns=['id', 'attr'])
    self.default_l_out_prefix = 'l_'
    self.default_r_out_prefix = 'r_'
# Example 2 (scraped snippet separator)
    def setUp(self):
        """Build left/right tables, their cartesian product candset, and
        empty variants used by the filter tests.
        """
        self.dlm = create_delimiter_tokenizer(' ')
        self.A = pd.DataFrame([{
            'l_id': 1,
            'l_attr': 'ab cd ef aa bb'
        }, {
            'l_id': 2,
            'l_attr': ''
        }, {
            'l_id': 3,
            'l_attr': 'ab'
        }, {
            'l_id': 4,
            'l_attr': 'll oo pp'
        }, {
            'l_id': 5,
            'l_attr': 'xy xx zz fg'
        }])
        self.B = pd.DataFrame([{
            'r_id': 1,
            'r_attr': 'mn'
        }, {
            'r_id': 2,
            'r_attr': 'he ll'
        }, {
            'r_id': 3,
            'r_attr': 'xy pl ou'
        }, {
            'r_id': 4,
            'r_attr': 'aa'
        }, {
            'r_id': 5,
            'r_attr': 'fg cd aa ef'
        }])

        # Generate cartesian product A x B to be used as candset by joining
        # on a constant key.  NOTE(review): the tmp_join_key column is left
        # behind on self.A / self.B, as in the original code — kept for
        # compatibility with tests that may see those frames.
        self.A['tmp_join_key'] = 1
        self.B['tmp_join_key'] = 1
        # drop(columns=...) instead of the positional axis argument, which
        # was deprecated in pandas 1.x and removed in pandas 2.0.
        self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
                          self.B[['r_id', 'tmp_join_key']],
                          on='tmp_join_key').drop(columns='tmp_join_key')

        self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
        self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
        self.empty_candset = pd.DataFrame(columns=['l_id', 'r_id'])
# Example 3 (scraped snippet separator)
 def setUp(self):
     """Build single-row left/right tables and default join parameters."""
     # Column-wise construction; identical columns and values as before.
     self.A = pd.DataFrame({'A.id': [1], 'A.attr': ['hello']})
     self.B = pd.DataFrame({'B.id': [1], 'B.attr': ['world']})
     self.tokenizer = create_delimiter_tokenizer()
     self.sim_measure_type = 'JACCARD'
     self.threshold = 0.8
# Example 4 (scraped snippet separator)
 def setUp(self):
     # Shared whitespace-delimiter tokenizer used by the tests in this class.
     self.dlm = create_delimiter_tokenizer(' ')
# Example 5 (scraped snippet separator)
 def time_overlap_delim_1(self):
     """Benchmark overlap join at threshold 1 with a delimiter tokenizer."""
     tokenizer = create_delimiter_tokenizer()
     overlap_join(self.ltable, self.rtable,
                  'id', 'id', 'attr', 'attr',
                  tokenizer, 1)
# Example 6 (scraped snippet separator)
 def time_cosine_delim_07(self):
     """Benchmark cosine join at threshold 0.7 with a delimiter tokenizer."""
     tokenizer = create_delimiter_tokenizer()
     cosine_join(self.ltable, self.rtable,
                 'id', 'id', 'attr', 'attr',
                 tokenizer, 0.7)
# Example 7 (scraped snippet separator)
 def setUp(self):
     """Build single-row left/right tables, a tokenizer and a threshold."""
     # Column-wise construction; identical columns and values as before.
     self.A = pd.DataFrame({'A.id': [1], 'A.attr': ['hello']})
     self.B = pd.DataFrame({'B.id': [1], 'B.attr': ['world']})
     self.tokenizer = create_delimiter_tokenizer()
     self.threshold = 1
# Example 8 (scraped snippet separator)
def test_set_sim_join():
    """Generate nose test cases for set-similarity joins.

    Yields one test per (scenario, measure, threshold, tokenizer)
    combination, then per-measure variants exercising output attributes,
    a custom output prefix, and disabled sim-score output.
    """
    # data to be tested.
    test_scenario_1 = [('data/table_A.csv', 'A.ID', 'A.name'),
                       ('data/table_B.csv', 'B.ID', 'B.name')]
    data = {'TEST_SCENARIO_1': test_scenario_1}

    # similarity measures to be tested.
    sim_measure_types = ['COSINE', 'DICE', 'JACCARD', 'OVERLAP']

    # similarity thresholds to be tested.
    thresholds = {
        'JACCARD': [0.3, 0.5, 0.7, 0.85, 1],
        'COSINE': [0.3, 0.5, 0.7, 0.85, 1],
        'DICE': [0.3, 0.5, 0.7, 0.85, 1],
        'OVERLAP': [1, 2, 3]
    }

    # tokenizers to be tested.
    tokenizers = {
        'SPACE_DELIMITER': create_delimiter_tokenizer(),
        '2_GRAM': create_qgram_tokenizer(),
        '3_GRAM': create_qgram_tokenizer(3)
    }

    # Test each combination of similarity measure, threshold and tokenizer
    # for different test scenarios.
    for label, scenario in iteritems(data):
        for sim_measure_type in sim_measure_types:
            # Direct indexing instead of .get(): a measure missing from
            # `thresholds` should raise a clear KeyError here rather than
            # a TypeError from iterating over None.
            for threshold in thresholds[sim_measure_type]:
                for tok_type, tok in iteritems(tokenizers):
                    test_function = partial(test_valid_join, scenario,
                                            sim_measure_type, (tok, threshold))
                    test_function.description = 'Test ' + sim_measure_type + \
                        ' with ' + str(threshold) + ' threshold and ' + \
                        tok_type + ' tokenizer for ' + label + '.'
                    yield test_function,

    # Output-option variants, one pass per (extra join args, description
    # suffix) pair — replaces three copy-pasted loops.  Order of yields is
    # unchanged: all measures for one variant before the next variant.
    out_attrs_l = ['A.birth_year', 'A.zipcode']
    out_attrs_r = ['B.name', 'B.zipcode']
    variants = [
        # output attributes added
        ((out_attrs_l, out_attrs_r), ' with output attributes.'),
        # output attributes with a different output prefix
        ((out_attrs_l, out_attrs_r, 'ltable.', 'rtable.'),
         ' with output attributes and prefix.'),
        # output_sim_score disabled
        ((out_attrs_l, out_attrs_r, 'ltable.', 'rtable.', False),
         ' with sim_score disabled.'),
    ]
    for extra_args, suffix in variants:
        for sim_measure_type in sim_measure_types:
            test_function = partial(
                test_valid_join, test_scenario_1, sim_measure_type,
                (tokenizers['SPACE_DELIMITER'], 1) + extra_args)
            test_function.description = 'Test ' + sim_measure_type + suffix
            yield test_function,