Example No. 1
def _assemble_topk_table(topk_heap,
                         ltable,
                         rtable,
                         ret_key='_id',
                         l_output_prefix='ltable_',
                         r_output_prefix='rtable_'):
    topk_heap.sort(key=lambda tup: tup[0], reverse=True)
    ret_data_col_name_list = ['_id', 'similarity']
    ltable_col_names = list(ltable.columns)
    rtable_col_names = list(rtable.columns)
    lkey = em.get_key(ltable)
    rkey = em.get_key(rtable)
    lkey_index = 0
    rkey_index = 0
    for i in range(len(ltable_col_names)):
        if ltable_col_names[i] == lkey:
            lkey_index = i

    for i in range(len(rtable_col_names)):
        if rtable_col_names[i] == rkey:
            rkey_index = i

    ret_data_col_name_list.append(l_output_prefix + lkey)
    ret_data_col_name_list.append(r_output_prefix + rkey)
    ltable_col_names.remove(lkey)
    rtable_col_names.remove(rkey)

    for i in range(len(ltable_col_names)):
        ret_data_col_name_list.append(l_output_prefix + ltable_col_names[i])
    for i in range(len(rtable_col_names)):
        ret_data_col_name_list.append(r_output_prefix + rtable_col_names[i])

    ret_tuple_list = []
    for i in range(len(topk_heap)):
        tup = topk_heap[i]
        # .ix was removed in pandas 1.0; use label-based .loc instead
        lrecord = list(ltable.loc[tup[1]])
        rrecord = list(rtable.loc[tup[2]])
        ret_tuple = [i, tup[0]]
        ret_tuple.append(lrecord[lkey_index])
        ret_tuple.append(rrecord[rkey_index])
        for j in range(len(lrecord)):
            if j != lkey_index:
                ret_tuple.append(lrecord[j])
        for j in range(len(rrecord)):
            if j != rkey_index:
                ret_tuple.append(rrecord[j])
        ret_tuple_list.append(ret_tuple)

    data_frame = pd.DataFrame(ret_tuple_list)
    # When the ret data frame is empty, we cannot assign column names.
    if len(data_frame) == 0:
        return data_frame

    data_frame.columns = ret_data_col_name_list
    lkey = em.get_key(ltable)
    rkey = em.get_key(rtable)
    cm.set_candset_properties(data_frame, ret_key, l_output_prefix + lkey,
                              r_output_prefix + rkey, ltable, rtable)

    return data_frame
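
For orientation, here is a minimal standalone sketch of the column layout this function produces (plain pandas on hypothetical two-row tables; the py_entitymatching metadata handling via em/cm is omitted):

import pandas as pd

# Hypothetical miniature inputs; 'ID' stands in for the metadata key.
ltable = pd.DataFrame({'ID': ['a1', 'a2'],
                       'name': ['Kevin Smith', 'Michael Franklin']})
rtable = pd.DataFrame({'ID': ['b1', 'b2'],
                       'name': ['Mark Levene', 'Bill Bridge']})

# Each heap entry is (similarity, ltable row label, rtable row label).
topk_heap = [(0.27, 1, 0), (0.23, 0, 1)]

rows = []
for i, (sim, l_idx, r_idx) in enumerate(
        sorted(topk_heap, key=lambda t: t[0], reverse=True)):
    lrec, rrec = ltable.loc[l_idx], rtable.loc[r_idx]
    rows.append([i, sim, lrec['ID'], rrec['ID'], lrec['name'], rrec['name']])

out = pd.DataFrame(rows, columns=['_id', 'similarity', 'ltable_ID',
                                  'rtable_ID', 'ltable_name', 'rtable_name'])
print(out)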
    def test_assemble_topk_table_2(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        A_key = em.get_key(A)
        B_key = em.get_key(B)
        topk_heap = [(0.2727272727272727, 1, 0), (0.23076923076923078, 0, 4),
                     (0.16666666666666666, 0, 3)]
        ret_dataframe = db._assemble_topk_table(topk_heap, A, B, A_key, B_key)
        expected_columns = ['_id', 'ltable_ID', 'rtable_ID',
                            'ltable_name', 'ltable_birth_year',
                            'ltable_hourly_wage',
                            'ltable_address', 'ltable_zipcode', 'rtable_name',
                            'rtable_birth_year', 'rtable_hourly_wage',
                            'rtable_address', 'rtable_zipcode']
        self.assertEqual(len(ret_dataframe), 3)
        self.assertEqual(list(ret_dataframe.columns), expected_columns)

        expected_recs = [[0, 'a2', 'b1', 'Michael Franklin',
                          1988, 27.5, '1652 Stockton St, San Francisco',
                          94122, 'Mark Levene', 1987, 29.5,
                          '108 Clement St, San Francisco', 94107],
                         [1, 'a1', 'b5', 'Kevin Smith',
                          1989, 30.0, '607 From St, San Francisco', 94107,
                          'Alfons Kemper', 1984, 35.0,
                          '170 Post St, Apt 4,  San Francisco', 94122],
                         [2, 'a1', 'b4', 'Kevin Smith',
                          1989, 30.0, '607 From St, San Francisco', 94107,
                          'Joseph Kuan', 1982, 26.0,
                          '108 South Park, San Francisco', 94122]]
        self.assertEqual(list(ret_dataframe.loc[0]), expected_recs[0])
        self.assertEqual(list(ret_dataframe.loc[1]), expected_recs[1])
        self.assertEqual(list(ret_dataframe.loc[2]), expected_recs[2])
Example No. 3
def validate_metadata_two_candsets(C, D):
    assert_equal(sorted(C.columns), sorted(D.columns))
    assert_equal(em.get_key(D), em.get_key(C))
    assert_equal(em.get_property(D, 'fk_ltable'),
                 em.get_property(C, 'fk_ltable'))
    assert_equal(em.get_property(D, 'fk_rtable'),
                 em.get_property(C, 'fk_rtable'))
 def test_select_features_1(self):
     A = read_csv_metadata(path_a, key='ID')
     B = read_csv_metadata(path_b, key='ID')
     A_key = em.get_key(A)
     B_key = em.get_key(B)
     actual_selected_features = db._select_features(A, B, A_key, B_key)
     expected_selected_features = [1, 3, 4, 2, 5]
     self.assertEqual(actual_selected_features, expected_selected_features)
 def test_validate_types_1(self):
     A = read_csv_metadata(path_a, key='ID')
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable_ID',
             fk_rtable='rtable_ID', key='_id')
     A_key = em.get_key(A)
     B_key = em.get_key(B)
     attr_corres = None
     db._validate_types(A, B, C, 100, attr_corres, False)
 def test_assemble_topk_table_1(self):
     A = read_csv_metadata(path_a, key='ID')
     B = read_csv_metadata(path_b, key='ID')
     A_key = em.get_key(A)
     B_key = em.get_key(B)
     topk_heap = []
     ret_dataframe = db._assemble_topk_table(topk_heap, A, B, A_key, B_key)
     self.assertEqual(len(ret_dataframe), 0)
     self.assertEqual(list(ret_dataframe.columns), [])
 def test_select_features_3(self):
     A = read_csv_metadata(path_a, key='ID')
     B = read_csv_metadata(path_b, key='ID')
     A_key = em.get_key(A)
     B_key = em.get_key(B)
     corres_list = [(0, 0)]
     A_filtered, B_filtered = db._get_filtered_table(A, B, A_key, B_key, corres_list)
     actual_selected_features = db._select_features(
         A_filtered, B_filtered, A_key)
     expected_selected_features = []
     self.assertEqual(actual_selected_features, expected_selected_features)
Example No. 13
 def test_get_field_correspondence_list_2(self):
     A = read_csv_metadata(path_a, key='ID')
     B = read_csv_metadata(path_b, key='ID')
     A_key = em.get_key(A)
     B_key = em.get_key(B)
     expected_list = [('ID', 'ID'), ('name', 'name'),
                      ('address', 'address'), ('zipcode', 'zipcode')]
     attr_corres = [('ID', 'ID'), ('name', 'name'), ('address', 'address'),
                    ('zipcode', 'zipcode')]
     corres_list = db._get_field_correspondence_list(
         A, B, A_key, B_key, attr_corres)
     self.assertEqual(corres_list, expected_list)
 def test_validate_types_2(self):
     A = read_csv_metadata(path_a, key='ID')
     B = read_csv_metadata(path_b, key='ID')
     A_key = em.get_key(A)
     B_key = em.get_key(B)
     C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable_' +
             A_key, fk_rtable='rtable_' + B_key, key='_id')
     attr_corres = [('ID', 'ID'), ('name', 'name'),
                      ('birth_year', 'birth_year'),
                      ('hourly_wage', 'hourly_wage'),
                      ('address', 'address'),
                      ('zipcode', 'zipcode')]
     db._validate_types(A, B, C, 100, attr_corres, False)
 def test_select_features_2(self):
     A = read_csv_metadata(path_a, key='ID')
     B = read_csv_metadata(path_b, key='ID')
     A_key = em.get_key(A)
     B_key = em.get_key(B)
     cols_A = list(A.columns)
     cols_B = list(B.columns)
     corres_list = [(cols_A[0], cols_B[0]), (cols_A[1], cols_B[1]),
                    (cols_A[4], cols_B[4])]
     A_filtered, B_filtered = db._get_filtered_table(A, B, corres_list)
     actual_selected_features = db._select_features(
         A_filtered, B_filtered, A_key, B_key)
     expected_selected_features = [1, 2]
     self.assertEqual(actual_selected_features, expected_selected_features)
    def test_select_features_3(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        A_key = em.get_key(A)
        B_key = em.get_key(B)
        cols_A = list(A.columns)
        cols_B = list(B.columns)

        corres_list = [(cols_A[0], cols_B[0])]
        A_filtered, B_filtered = db._get_filtered_table(A, B, corres_list)
        actual_selected_features = db._select_features(
            A_filtered, B_filtered, A_key, B_key)
        expected_selected_features = []
        self.assertEqual(actual_selected_features, expected_selected_features)
    def test_get_feature_weight_2(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        A_key = em.get_key(A)
        B_key = em.get_key(B)
        corres_list = [(0, 0), (1, 1), (4, 4), (5, 5)]
        A_filtered, B_filtered = db._get_filtered_table(
            A, B, A_key, B_key, corres_list)
        A_wlist = db._get_feature_weight(A_filtered)
        expected_A_wlist = [2.0, 2.0, 2.0, 1.4]
        self.assertEqual(A_wlist, expected_A_wlist)

        B_wlist = db._get_feature_weight(B_filtered)
        expected_B_wlist = [2.0, 2.0, 2.0, 1.3333333333333333]
        self.assertEqual(B_wlist, expected_B_wlist)
Example No. 20
    def test_get_field_correspondence_list_5(self):
        A = pd.DataFrame([[0, 'A', 0.11, 'ASDF']])
        A.columns = ['ID', 'name', 'price', 'desc']
        em.set_key(A, 'ID')
        A_key = em.get_key(A)
        B = pd.DataFrame([['B', 'B001', 'ASDF', 0.111]])
        B.columns = ['item_name', 'item_id', 'item_desc', 'item_price']
        em.set_key(B, 'item_id')
        B_key = em.get_key(B)
        attr_corres = [('name', 'item_name'), ('price', 'item_price')]
        actual_attr_corres = db._get_field_correspondence_list(
            A, B, A_key, B_key, attr_corres)

        expected_attr_corres = [('name', 'item_name'), ('price', 'item_price'),
                                ('ID', 'item_id')]
        self.assertEqual(expected_attr_corres, actual_attr_corres)
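
The expected output above shows the key pair ('ID', 'item_id') being appended when the user-supplied correspondence list omits it. A minimal sketch of that behavior (hypothetical helper, not the library's actual implementation):

def append_key_pair(attr_corres, l_key, r_key):
    # Mirror what test_get_field_correspondence_list_5 expects: if the key
    # correspondence is missing from the user-supplied list, append it.
    if (l_key, r_key) not in attr_corres:
        attr_corres.append((l_key, r_key))
    return attr_corres

print(append_key_pair([('name', 'item_name'), ('price', 'item_price')],
                      'ID', 'item_id'))
# [('name', 'item_name'), ('price', 'item_price'), ('ID', 'item_id')]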
 def test_filter_corres_list_2(self):
     A = read_csv_metadata(path_a, key='ID')
     B = read_csv_metadata(path_b, key='ID')
     A_key = em.get_key(A)
     B_key = em.get_key(B)
     ltable_col_dict = db._build_col_name_index_dict(A)
     rtable_col_dict = db._build_col_name_index_dict(B)
     attr_corres = [('ID', 'ID'), ('name', 'name'),
                      ('birth_year', 'birth_year'),
                      ('hourly_wage', 'hourly_wage'),
                      ('address', 'address'),
                      ('zipcode', 'zipcode')]
     expected_filtered_attr = [('ID', 'ID'), ('name', 'name'),
                      ('address', 'address')]
     db._filter_corres_list(A, B, A_key, B_key, ltable_col_dict,
             rtable_col_dict, attr_corres)
     self.assertEqual(expected_filtered_attr, attr_corres)
    def test_select_features_4(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        A_key = em.get_key(A)
        B_key = em.get_key(B)
        cols_A = list(A.columns)
        cols_B = list(B.columns)

        A_field_set = [0, 1, 2]
        B_field_set = [0, 1, 2, 3]

        A_field_set = list(itemgetter(*A_field_set)(cols_A))
        B_field_set = list(itemgetter(*B_field_set)(cols_B))

        A_filtered = A[A_field_set]
        B_filtered = B[B_field_set]
        db._select_features(
            A_filtered, B_filtered, A_key, B_key)
Ejemplo n.º 24
0
    def test_get_feature_weight_2(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        A_key = em.get_key(A)
        B_key = em.get_key(B)
        cols_A = list(A.columns)
        cols_B = list(B.columns)
        corres_list = [(cols_A[0], cols_B[0]), (cols_A[1], cols_B[1]),
                       (cols_A[4], cols_B[4]), (cols_A[5], cols_B[5])]
        A_filtered, B_filtered = db._get_filtered_table(
            A, B, A_key, B_key, corres_list)
        A_wlist = db._get_feature_weight(A_filtered)
        expected_A_wlist = [2.0, 2.0, 2.0, 1.4]
        self.assertEqual(A_wlist, expected_A_wlist)

        B_wlist = db._get_feature_weight(B_filtered)
        expected_B_wlist = [2.0, 2.0, 2.0, 1.3333333333333333]
        self.assertEqual(B_wlist, expected_B_wlist)
    def test_select_features_5(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        A_key = em.get_key(A)
        A_field_set = [0, 1, 2, 3]
        B_field_set = [0, 1, 2]

        # Map positional indexes to column names before slicing,
        # as in test_select_features_4 above.
        A_field_set = list(itemgetter(*A_field_set)(list(A.columns)))
        B_field_set = list(itemgetter(*B_field_set)(list(B.columns)))

        A_filtered = A[A_field_set]
        B_filtered = B[B_field_set]
        db._select_features(
            A_filtered, B_filtered, A_key)
 def test_get_tokenized_table_2(self):
     B = read_csv_metadata(path_b, key='ID')
     B_key = em.get_key(B)
     feature_list = [0, 1, 3]
     actual_record_list = db._get_tokenized_table(B, B_key, feature_list)
     expected_record_list = [[('b1', 0), ('mark', 1), ('levene', 1), ('30', 2)], 
                             [('b2', 0), ('bill', 1), ('bridge', 1), ('32', 2)], 
                             [('b3', 0), ('mike', 1), ('franklin', 1), ('28', 2)], 
                             [('b4', 0), ('joseph', 1), ('kuan', 1), ('26', 2)], 
                             [('b5', 0), ('alfons', 1), ('kemper', 1), ('35', 2)], 
                             [('b6', 0), ('michael',1), ('brodie', 1), ('32', 2)]]
     self.assertEqual(actual_record_list, expected_record_list)
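
The (token, field_index) layout in the expected records can be reproduced with a small standalone sketch (a hypothetical tokenizer that mirrors the format the test expects; the real _get_tokenized_table also handles column projection via feature_list and other details):

def tokenize_record(record, key_index=0):
    # Emit (token, field_index) pairs: the key value is kept whole,
    # other fields are lowercased and split on whitespace.
    tokens = []
    for field_index, value in enumerate(record):
        if field_index == key_index:
            tokens.append((str(value).lower(), field_index))
        else:
            tokens.extend((tok, field_index)
                          for tok in str(value).lower().split())
    return tokens

print(tokenize_record(['b1', 'Mark Levene', 30]))
# [('b1', 0), ('mark', 1), ('levene', 1), ('30', 2)]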
    def test_get_tokenized_table_2(self):
        B = read_csv_metadata(path_b, key='ID')
        B_key = em.get_key(B)
        feature_list = [0, 1, 3]
        actual_record_list = db._get_tokenized_table(B, B_key, feature_list)

        expected_record_list = []
        test_file_path = os.sep.join([debugblocker_datasets_path, 'test_get_tokenized_table_2.txt'])
        with open(test_file_path, 'r') as f:
            for line in f:
                expected_record_list.append(line.strip().split(' '))

        self.assertEqual(actual_record_list, expected_record_list)
    def test_get_tokenized_table_1(self):
        A = read_csv_metadata(path_a, key='ID')
        A_key = em.get_key(A)
        feature_list = range(len(A.columns))
        actual_record_list = db._get_tokenized_table(A, A_key, feature_list)

        expected_record_list = []
        test_file_path = os.sep.join([debugblocker_datasets_path, 'test_get_tokenized_table_1.txt'])
        with open(test_file_path, 'r') as f:
            for line in f:
                expected_record_list.append(line.strip().split(' '))

        self.assertEqual(actual_record_list, expected_record_list)
def validate_metadata(C, l_output_attrs=None, r_output_attrs=None,
                      l_output_prefix='ltable_', r_output_prefix='rtable_',
                      l_key='ID', r_key='ID'):
    s1 = ['_id', l_output_prefix + l_key, r_output_prefix + r_key]
    if l_output_attrs:
        s1 += [l_output_prefix + x for x in l_output_attrs if x != l_key]
    if r_output_attrs:
        s1 += [r_output_prefix + x for x in r_output_attrs if x != r_key]
    s1 = sorted(s1)
    assert_equal(s1, sorted(C.columns))
    assert_equal(em.get_key(C), '_id')
    assert_equal(em.get_property(C, 'fk_ltable'), l_output_prefix + l_key)
    assert_equal(em.get_property(C, 'fk_rtable'), r_output_prefix + r_key)
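
A quick illustration of how s1 is assembled under the default prefixes (plain Python with hypothetical attribute lists):

l_output_attrs = ['name', 'zipcode']
r_output_attrs = ['name']
l_key = r_key = 'ID'
l_output_prefix, r_output_prefix = 'ltable_', 'rtable_'

# Same construction as validate_metadata above.
s1 = ['_id', l_output_prefix + l_key, r_output_prefix + r_key]
s1 += [l_output_prefix + x for x in l_output_attrs if x != l_key]
s1 += [r_output_prefix + x for x in r_output_attrs if x != r_key]
print(sorted(s1))
# ['_id', 'ltable_ID', 'ltable_name', 'ltable_zipcode', 'rtable_ID', 'rtable_name']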
    def test_get_tokenized_table_1(self):
        A = read_csv_metadata(path_a, key='ID')
        A_key = em.get_key(A)
        feature_list = range(len(A.columns))
        actual_record_list = db._get_tokenized_table(A, A_key, feature_list)

        expected_record_list = [
            [('a1', 0), ('kevin', 1), ('smith', 1), ('1989', 2), ('30', 3),
             ('607', 4), ('from', 4), ('st,', 4), ('san', 4),
             ('francisco', 4), ('94107', 5)],
            [('a2', 0), ('michael', 1), ('franklin', 1), ('1988', 2),
             ('28', 3), ('1652', 4), ('stockton', 4), ('st,', 4), ('san', 4),
             ('francisco', 4), ('94122', 5)],
            [('a3', 0), ('william', 1), ('bridge', 1), ('1986', 2), ('32', 3),
             ('3131', 4), ('webster', 4), ('st,', 4), ('san', 4),
             ('francisco', 4), ('94107', 5)],
            [('a4', 0), ('binto', 1), ('george', 1), ('1987', 2), ('32', 3),
             ('423', 4), ('powell', 4), ('st,', 4), ('san', 4),
             ('francisco', 4), ('94122', 5)],
            [('a5', 0), ('alphonse', 1), ('kemper', 1), ('1984', 2),
             ('35', 3), ('1702', 4), ('post', 4), ('street,', 4), ('san', 4),
             ('francisco', 4), ('94122', 5)]]

        self.assertEqual(actual_record_list, expected_record_list)
Example No. 38
def main():
    # WELCOME TO MY MAGELLAN RUN SCRIPT
    print("\n-------------WELCOME TO MY MAGELLAN RUN SCRIPT-------------\n")

    # Get the datasets directory
    datasets_dir = 'B:\\McMaster\\CAS 764 - Advance Topics in Data Management\\Project\\Data\\'
    print("- Dataset directory: " + datasets_dir)
    print("- List of folders/files: ")
    print(os.listdir(datasets_dir))
    print("- Please enter new dataset folder name:")
    datasets_dir += input()
    print("- Dataset directory set to: " + datasets_dir)

    dataset_dir_files = os.listdir(datasets_dir)
    print("- List of files in dataset folder: ")
    print(dataset_dir_files)

    # Get the path of the input table A
    print("- Enter an index for Table A file (0-x):")
    file_index_A = input()
    filename_A = dataset_dir_files[int(file_index_A)]
    print("Table A file set to: " + filename_A)

    # Get the path of the input table
    path_A = datasets_dir + os.sep + filename_A

    # Get the path of the input table B
    print("- Enter an index for Table B file (0-x):")
    file_index_B = input()
    filename_B = dataset_dir_files[int(file_index_B)]
    print("Table B file set to: " + filename_B)

    # Get the path of the input table
    path_B = datasets_dir + os.sep + filename_B

    # Print Table A column names
    A = em.read_csv_metadata(path_A)
    print("- List of columns of Table A: ")
    print(list(A.columns))
    # Get the Table A id/primary key column name
    print('- Enter Table A primary key column index (ex. 0):')
    pk_A_index = input()
    pk_A = A.columns[int(pk_A_index)]

    # Print Table B column names
    B = em.read_csv_metadata(path_B)
    print("- List of columns of Table B: ")
    print(list(B.columns))
    # Get the Table B id/primary key column name
    print('- Enter Table B primary key column index (ex. 0):')
    pk_B_index = input()
    pk_B = B.columns[int(pk_B_index)]

    # READING TABLES AND SETTING METADATA
    print("\n-------------READING TABLES AND SETTING METADATA-------------\n")

    # Both read csv and set metadata id as ID column
    #A = em.read_csv_metadata(path_A, key=pk_A)
    #B = em.read_csv_metadata(path_B, key=pk_B)
    em.set_key(A, pk_A)
    em.set_key(B, pk_B)

    # Number of tuples
    print('- Number of tuples in A: ' + str(len(A)))
    print('- Number of tuples in B: ' + str(len(B)))
    print('- Number of tuples in A X B (i.e., the Cartesian product): ' +
          str(len(A) * len(B)))

    # Print first 5 tuples of tables
    print(A.head())
    print(B.head())

    # Display the keys of the input tables
    print("- Table A primary key: " + em.get_key(A))
    print("- Table B primary key: " + em.get_key(B))

    # DOWNSAMPLING
    print("\n-------------DOWNSAMPING-------------\n")

    print("- Do you want to use downsampling? (y or n):")
    print("- Table A: " + str(len(A)) + ", Table B: " + str(len(B)))
    print("- NOTE: Recommended if both tables have 100K+ tuples.")
    is_downsample = input()
    if (is_downsample == 'y'):
        print("- Size of the downsampled tables (ex. 200):")
        downsample_size = int(input())
        # If the tables are large we can downsample the tables like this
        A1, B1 = em.down_sample(A, B, downsample_size, 1, show_progress=False)
        print("- Length of Table A1: " + str(len(A1)))
        print("- Length of Table B1: " + str(len(B1)))

    # BLOCKING
    print("\n-------------BLOCKING-------------\n")

    print("- Do you want to use blocking? (y or n):")
    is_blocking = input()
    if (is_blocking == 'y'):

        # Check if the 2 tables column names are the same
        if (list(A.columns) == list(B.columns)):
            C_attr_eq = []  # Attr Equ blocker result list
            C_overlap = []  # Overlap blocker result list
            C_blackbox = []  # BlackBox blocker result list

            # Left and right table attribute prefixes
            l_prefix = "ltable_"
            r_prefix = "rtable_"

            print("\n- List of columns: ")
            print(list(A.columns))
            # Labeling output table column selection
            print(
                "\n- Enter the indexes of columns that you want to see in the labeling table (0-"
                + str(len(A.columns) - 1) + "):")
            out_attr = []
            for i in range(1, len(A.columns)):
                print("- Finish with empty character(enter+enter) " + str(i))
                add_to_attr = input()
                if (add_to_attr == ''):
                    break
                # Get indexes from user and add columns into out_attr list
                out_attr.append(A.columns[int(add_to_attr)])

            # Print output attributes
            print(out_attr)

            # Loop for adding/combining new blockers
            while (True):
                # Blocker selection
                print(
                    "\n- Do you want to use Attribute Equivalence[ab] (same), Overlap[ob] (similar) or Blackbox[bb] blocker:"
                )
                blocker_selection = input()

                # ----- Attribute Equivalence Blocker -----
                if (blocker_selection == 'ab'):
                    # Create attribute equivalence blocker
                    ab = em.AttrEquivalenceBlocker()
                    # Counter for indexes
                    attr_eq_counter = 0
                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_overlap[-1])  # Add last output of ob
                            attr_eq_counter += 1  # Skip block_tables on the first iteration

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_blackbox[-1])  # Add last output of bb
                            attr_eq_counter += 1  # Skip block_tables on the first iteration

                    # Loop for adding more columns/attributes into Attr Equ blocker
                    while (True):
                        # List column names
                        print("\n- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "\n- Which column (w/ index) to use for equivalence blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        # First time using Attr Equ blocker, use A and B
                        if (attr_eq_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_attr_eq.append(
                                ab.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_attr_eq.append(
                                ab.block_candset(C_attr_eq[-1],
                                                 l_block_attr=blocking_col,
                                                 r_block_attr=blocking_col,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print(
                            "\n- Attribute Equivalence Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_attr_eq[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        attr_eq_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_attr_eq[-1])))
                        print(
                            "- Add another column into Attribute Equivalence Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ab_next_operation = input().lower()
                        # Continue using Attribute Equivalence Blocker
                        if (ab_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ab_next_operation == 'r'):
                            del C_attr_eq[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Attribute Equivalence Blocker (y or n):"
                            )
                            ab_next_operation = input()
                            if (ab_next_operation == 'n'):
                                break
                        # Finish Attribute Equivalence Blocker
                        else:
                            break

                # ----- Overlap Blocker -----
                elif (blocker_selection == 'ob'):
                    # Create overlap blocker
                    ob = em.OverlapBlocker()
                    # Counter for indexes
                    overlap_counter = 0
                    # Check if Attribute Equivalence Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_attr_eq[-1])  # Add last output of ab
                            overlap_counter += 1  # Skip block_tables on the first iteration

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_blackbox[-1])  # Add last output of bb
                            overlap_counter += 1  # Skip block_tables on the first iteration

                    # Loop for adding more columns/attributes into Overlap blocker
                    while (True):
                        # List column names
                        print("- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "- Which column (w/ index) to use for overlap blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        print("\n- Use words as a token? (y or n):")
                        use_world_level = input()
                        if (use_world_level == 'y'):
                            use_world_level = True
                            q_gram_value = None
                        else:
                            use_world_level = False
                            print(
                                "\n- Q-gram q value (ex. 2 --> JO HN SM IT H):"
                            )
                            q_gram_value = input()
                            q_gram_value = int(q_gram_value)

                        print(
                            "\n- Enter the overlap size (# of tokens that overlap):"
                        )
                        overlap_size = input()
                        overlap_size = int(overlap_size)

                        print(
                            "\n- Do you want to remove (a, an, the) from token set? (y or n):"
                        )
                        use_stop_words = input()
                        if (use_stop_words == 'y'):
                            use_stop_words = True
                        else:
                            use_stop_words = False

                        # First time using Overlap blocker, use A and B
                        if (overlap_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_overlap.append(
                                ob.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                rem_stop_words=use_stop_words,
                                                q_val=q_gram_value,
                                                word_level=use_word_level,
                                                overlap_size=overlap_size,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_overlap.append(
                                ob.block_candset(C_overlap[-1],
                                                 l_overlap_attr=blocking_col,
                                                 r_overlap_attr=blocking_col,
                                                 rem_stop_words=use_stop_words,
                                                 q_val=q_gram_value,
                                                 word_level=use_word_level,
                                                 overlap_size=overlap_size,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- Overlap Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_overlap[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        overlap_counter += 1  # Increase the counter

                        # Continue to use Overlap Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_overlap[-1])))
                        print(
                            "- Add another column into Overlap Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ob_next_operation = input().lower()
                        # Continue using Overlap Blocker
                        if (ob_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ob_next_operation == 'r'):
                            del C_overlap[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Overlap Blocker (y or n):")
                            ob_next_operation = input()
                            if (ob_next_operation == 'n'):
                                break
                        # Finish Overlap Blocker
                        else:
                            break

                # ----- BlackBox Blocker -----
                elif (blocker_selection == 'bb'):
                    # Create black box blocker
                    bb = em.BlackBoxBlocker()
                    # Counter for indexes
                    blackbox_counter = 0
                    # Check if Attribute Equivalence Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_attr_eq[-1])  # Add last output of ab
                            blackbox_counter += 1  # Skip block_tables on the first iteration

                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_overlap[-1])  # Add last output of ob
                            blackbox_counter += 1  # Skip block_tables on the first iteration

                    # Loop for adding more columns/attributes into BlackBox blocker
                    while (True):
                        # Set the user-defined blocking predicate
                        # (number_10_percent_comparision is assumed to be
                        # defined elsewhere in this script)
                        bb.set_black_box_function(
                            number_10_percent_comparision)

                        # First time using BlackBox blocker, use A and B
                        if (blackbox_counter == 0):
                            # Block on A and B
                            C_blackbox.append(
                                bb.block_tables(A,
                                                B,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                n_jobs=-1,
                                                show_progress=False))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block on previous (last=-1) candidate set
                            C_blackbox.append(
                                bb.block_candset(C_blackbox[-1],
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- BlackBox Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_blackbox[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        blackbox_counter += 1  # Increase the counter

                        # Continue to use BlackBox Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_blackbox[-1])))
                        print(
                            "- Add another column into BlackBox Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        bb_next_operation = input().lower()
                        # Continue using BlackBox Blocker
                        if (bb_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (bb_next_operation == 'r'):
                            del C_blackbox[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use BlackBox Blocker (y or n):")
                            bb_next_operation = input()
                            if (bb_next_operation == 'n'):
                                break
                        # Finish BlackBox Blocker
                        else:
                            break

                print("\n- Do you want to add/use another blocker? (y or n):")
                blocker_decision = input()
                if (blocker_decision == 'n'):
                    break

            print(
                "\n- Which blocker output do you want to use? (Attr Equ-ab, Overlap-ob, BlackBox-bb, Union-un)"
            )
            blocker_output_selection = input()
            # Attribute Equ
            if (blocker_output_selection == "ab"):
                C = C_attr_eq[-1]
            # Overlap
            elif (blocker_output_selection == "ob"):
                C = C_overlap[-1]
            # BlackBox
            elif (blocker_output_selection == "bb"):
                C = C_blackbox[-1]
            # Union of blockers
            elif (blocker_output_selection == "un"):
                # Combine/union blockers candidate sets
                print("\n- TODO: Unions Attr Equ and Overlap only!")
                if (C_attr_eq and C_overlap and not C_attr_eq[-1].empty and
                        not C_overlap[-1].empty):  # Both blocker types used
                    C = em.combine_blocker_outputs_via_union(
                        [C_attr_eq[-1], C_overlap[-1]])
                    print(
                        "\n- Blockers candidate set outputs combined via union."
                    )
                else:  # Error
                    C = []
                    print(
                        "\n- ERROR: Candidate set C is empty! Check blockers' results."
                    )
            # Error
            else:
                C = []
                print(
                    "\n- ERROR: Candidate set C is empty! Check blockers' results."
                )
            print("\n- Length of C: " + str(len(C)))

        else:
            print(
                "\n- The two tables' column names are different; they must match."
            )
            print(list(A.columns))
            print(list(B.columns))

    # SAMPLING&LABELING
    print("\n-------------SAMPLING&LABELING-------------\n")

    print("- Choose sampling size (eg. 450):")
    sampling_size = input()
    while (int(sampling_size) > len(C)):
        print("- Sampling size cannot be bigger than " + str(len(C)))
        sampling_size = input()

    # Sample  candidate set
    S = em.sample_table(C, int(sampling_size))

    print("- New window will pop-up for " + sampling_size + " sized table.")
    print("- If there is a match, change tuple's label value to 1")

    # Label S
    G = em.label_table(S, 'label')

    #DEVELOPMENT AND EVALUATION
    print("\n-------------DEVELOPMENT AND EVALUATION-------------\n")

    # Split S into development set (I) and evaluation set (J)
    IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
    I = IJ['train']
    J = IJ['test']

    #SELECTING THE BEST MATCHER
    print("\n-------------SELECTING THE BEST MATCHER-------------\n")

    # Create a set of ML-matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    print(
        "\n- 6 different ML-matchers created: DT, SVM, RF, LogReg, LinReg, NB")

    print("\n- Creating features...")
    # Generate features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
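    # get_features_for_matching auto-generates similarity features (Jaccard,
    # edit distance, etc.) from the attribute correspondences between A and B;
    # validate_inferred_attr_types=False skips the manual type-confirmation step.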

    print("\n- Features list:")
    # List the names of the features generated
    print(feature_table['feature_name'])

    print("\n- Converting the development set to feature vectors...")
    # Convert the I into a set of feature vectors using feature_table
    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)
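    # Each row of H is one labeled pair from I rendered as numeric feature
    # values; attrs_after='label' keeps the manual label as the last column.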

    print("\n- Feature table first rows:")
    # Display first few rows
    print(H.head())

    # Primary key of tables = prefix + pk = l_id, r_id
    ltable_pk = l_prefix + pk_A
    rtable_pk = r_prefix + pk_B

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = pd.isnull(H).values.any()
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean',
            val_all_nans=0.0)
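        # val_all_nans=0.0 handles columns that are entirely NaN, where a
        # column mean is undefined and the 'mean' strategy alone would fail.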
        #print("\n- Feature table first rows:")
        # Display first few rows
        #print(H.head())
        print("- Impute table function used for missing values.")

    print("\n- Selecting the best matcher using cross-validation...")
    # Select the best ML matcher using CV
    result = em.select_matcher(
        matchers=[dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)
    print("\n- Results:")
    print(result['cv_stats'])
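    # A sketch, assuming the standard select_matcher return dictionary: besides
    # 'cv_stats', the result exposes the winning matcher object directly, e.g.
    # print(result['selected_matcher'])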

    #DEBUGGING THE MATCHER
    print("\n-------------DEBUGGING THE MATCHER-------------\n")

    #  Split feature vectors into train and test
    UV = em.split_train_test(H, train_proportion=0.5)
    U = UV['train']
    V = UV['test']

    # Debug the random forest matcher using the GUI
    em.vis_debug_rf(rf,
                    U,
                    V,
                    exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                    target_attr='label')
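    # vis_debug_rf opens a GUI contrasting the random forest's predictions on V
    # with the labels, so false positives/negatives can be inspected before
    # deciding whether extra features are needed (see the prompt below).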

    print("\n- Do you want to add another feature?")

    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = pd.isnull(H).values.any()
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(H.head())

    # Select the best ML matcher using CV
    result = em.select_matcher(
        [dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)

    print("\n- Results:")
    print(result['cv_stats'])

    #EVALUATING THE MATCHING OUTPUT
    print("\n-------------EVALUATING THE MATCHING OUTPUT-------------\n")

    print("\n- Converting the evaluation set to feature vectors...")
    # Convert J into a set of feature vectors using feature table
    L = em.extract_feature_vecs(J,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = pd.isnull(L).values.any()
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        L = em.impute_table(
            L,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(L.head())

    print("\n- Training the selected matcher...")
    # Train using feature vectors from I
    rf.fit(table=H,
           exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
           target_attr='label')

    print("\n- Predicting the matches...")
    # Predict on L
    predictions = rf.predict(
        table=L,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        append=True,
        target_attr='predicted',
        inplace=False)
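    # With append=True and inplace=False, predict returns a copy of L with the
    # new 'predicted' column appended instead of modifying L in place.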

    print("\n- Evaluating the prediction...")
    # Evaluate the predictions
    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    em.print_eval_summary(eval_result)  # prints the summary itself; returns None
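    # eval_matches compares 'label' against 'predicted' and returns a dictionary
    # holding precision, recall, F1, and the misclassified pair lists.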

    print("\n- Time elapsed:")
    print(datetime.now() - startTime)

    print("\n-------------END-------------\n")
Example #39
0
path_A = '../data/A.csv'
path_B = '../data/B.csv'

# Load csv files as dataframes and set the key attribute in the dataframe
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')

print('Number of tuples in A: ' + str(len(A)))
print('Number of tuples in B: ' + str(len(B)))
print('Number of tuples in A X B (i.e. the Cartesian product): ' +
      str(len(A) * len(B)))
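# Blocking exists to avoid scoring this quadratic number of pairs: a blocker
# keeps only the candidate pairs that satisfy a cheap predicate.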

################################## Blocker Portion ##################################
print('Begin blocking stage')
# Display the key attributes of table A and B.
em.get_key(A), em.get_key(B)

# Create attribute equivalence blocker
ab = em.AttrEquivalenceBlocker()
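# AttrEquivalenceBlocker keeps a pair only when the chosen left and right
# attributes are exactly equal, so it is fast but unforgiving of typos.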
# Block tables on the 'Release Date' attribute: pairs with the same release date enter the candidate set
C1 = ab.block_tables(A,
                     B,
                     'Release Date',
                     'Release Date',
                     l_output_attrs=[
                         'Title', 'Genre', 'Score', 'Release Date', 'Rating',
                         'Directed By', 'Written By', 'Studio'
                     ],
                     r_output_attrs=[
                         'Title', 'Genre', 'Score', 'Release Date', 'Rating',
                         'Directed By', 'Written By', 'Studio'
                     ])

    def test_build_id_to_index_map_1(self):
        A = read_csv_metadata(path_a, key='ID')
        key = em.get_key(A)
        actual_rec_id_to_idx = db._build_id_to_index_map(A, key)
        expected_rec_id_to_idx = {'a1': 0, 'a3': 2, 'a2': 1, 'a5': 4, 'a4': 3}
        self.assertEqual(actual_rec_id_to_idx, expected_rec_id_to_idx)