def test_ml_matcher_return_probs_true_predict_diff_colname(self):
        """Predict with probabilities stored under custom column names.

        Fits a decision-tree matcher on the training split, then predicts
        on the test split with ``inplace=False``, ``append=True`` and
        ``return_probs=True``, using 'predicted' and 'probas' as the
        output column names.  Checks that the result is a different
        object of the same length, keeps every test column, ends with
        those two columns in that order, and that every value in the
        probability column lies in [0, 1].
        """
        left = read_csv_metadata(fpath_a, key='id')
        right = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
        split = mu.split_train_test(fvs)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')

        # Strip the foreign-key columns from both splits, one at a time.
        for frame in (train, test):
            for fk_col in ('ltable.id', 'rtable.id'):
                frame.drop(fk_col, axis=1, inplace=True)
        test.drop('gold', axis=1, inplace=True)

        matcher.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = matcher.predict(
            table=test,
            exclude_attrs='_id',
            target_attr='predicted',
            probs_attr='probas',
            inplace=False,
            append=True,
            return_probs=True,
        )

        self.assertIsNot(predictions, test)
        self.assertEqual(len(predictions), len(test))
        self.assertTrue(set(test.columns).issubset(predictions.columns))

        # The two appended columns must be the last two, in this order.
        label_col = predictions.columns[-2]
        self.assertEqual(label_col, 'predicted')

        prob_col = predictions.columns[-1]
        self.assertEqual(prob_col, 'probas')

        probs = predictions[prob_col]
        within_unit_interval = (probs >= 0.0) & (probs <= 1.0)
        self.assertEqual(within_unit_interval.sum(), len(predictions))
Ejemplo n.º 2
0
    def test_ml_matcher_inplace_false_predict(self):
        """Predict with ``inplace=False`` and ``append=True``.

        Checks that the returned table is a different object from the
        test split, has the same number of rows, keeps every test
        column, and has 'predicted' as its last column.
        """
        left = read_csv_metadata(fpath_a, key='id')
        right = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
        split = mu.split_train_test(fvs)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')

        # Strip the foreign-key columns from both splits, one at a time.
        for frame in (train, test):
            for fk_col in ('ltable.id', 'rtable.id'):
                frame.drop(fk_col, axis=1, inplace=True)
        test.drop('gold', axis=1, inplace=True)

        matcher.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = matcher.predict(table=test, exclude_attrs='_id',
                                      target_attr='predicted',
                                      inplace=False, append=True)

        self.assertIsNot(predictions, test)
        self.assertEqual(len(predictions), len(test))
        self.assertTrue(set(test.columns).issubset(predictions.columns))
        last_col = predictions.columns[-1]
        self.assertEqual(last_col, 'predicted')
    def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
        """Run the random-forest visual debugger with '_id' missing from test.

        '_id' is dropped from the test split but is still listed in
        ``exclude_attrs``.
        NOTE(review): there is no assertion in this body; presumably the
        expected outcome is enforced outside this view -- confirm.
        """
        left = read_csv_metadata(path_a)
        right = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=left, rtable=right)
        # 7 non-matches followed by 8 matches.
        candset['labels'] = [0] * 7 + [1] * 8

        feats = get_features_for_matching(left, right)
        fvs = extract_feature_vecs(candset, feature_table=feats,
                                   attrs_after='labels')

        matcher = RFMatcher()
        split = mu.split_train_test(fvs)
        train, test = split['train'], split['test']

        test.drop('_id', inplace=True, axis=1)
        excluded = ['_id', 'ltable_ID', 'rtable_ID', 'labels']
        _vis_debug_rf(matcher, train, test,
                      exclude_attrs=excluded,
                      target_attr='labels',
                      show_window=False)
    def test_vis_debug_matcher_dt_valid_1(self):
        """Run the decision-tree visual debugger on valid train/test splits.

        Labels the candidate set, extracts feature vectors, splits them
        and calls ``_vis_debug_dt`` with ``show_window=False``.  The body
        makes no assertion; it only exercises the call.
        """
        left = read_csv_metadata(path_a)
        right = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=left, rtable=right)
        # 7 non-matches followed by 8 matches.
        candset['labels'] = [0] * 7 + [1] * 8

        feats = get_features_for_matching(left, right)
        fvs = extract_feature_vecs(candset, feature_table=feats,
                                   attrs_after='labels')

        matcher = DTMatcher()
        split = mu.split_train_test(fvs)
        train, test = split['train'], split['test']

        excluded = ['_id', 'ltable_ID', 'rtable_ID', 'labels']
        _vis_debug_dt(matcher, train, test,
                      exclude_attrs=excluded,
                      target_attr='labels',
                      show_window=False)
 def test_ml_matcher_invalid_input_combn_fit(self):
     """Call ``fit`` with both ``x=`` and ``table=`` supplied.

     NOTE(review): no assertion is made here; presumably this argument
     combination is expected to raise and the check lives outside this
     view (e.g. a decorator) -- confirm.
     """
     left = read_csv_metadata(fpath_a, key='id')
     right = read_csv_metadata(fpath_b, key='id')
     fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
     split = mu.split_train_test(fvs)
     train = split['train']
     test = split['test']
     matcher = DTMatcher(name='DecisionTree')
     matcher.fit(x=train, table=train)
Ejemplo n.º 6
0
 def test_ml_matcher_invalid_input_combn_fit(self):
     """Pass the training split to ``fit`` as both ``x`` and ``table``.

     NOTE(review): the body asserts nothing; presumably the call is
     expected to raise and that is verified outside this view -- confirm.
     """
     ltbl = read_csv_metadata(fpath_a, key='id')
     rtbl = read_csv_metadata(fpath_b, key='id')
     feature_vecs = read_csv_metadata(fpath_f, ltable=ltbl, rtable=rtbl)
     parts = mu.split_train_test(feature_vecs)
     train = parts['train']
     test = parts['test']
     tree = DTMatcher(name='DecisionTree')
     tree.fit(x=train, table=train)
 def test_ml_matcher_invalid_df_predict(self):
     """Call ``predict`` with an empty string in place of a table.

     The matcher is first fitted normally on the training split.
     NOTE(review): no assertion is made; presumably ``predict`` is
     expected to raise for a non-table argument and the check lives
     outside this view -- confirm.
     """
     left = read_csv_metadata(fpath_a, key='id')
     right = read_csv_metadata(fpath_b, key='id')
     fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
     split = mu.split_train_test(fvs)
     train = split['train']
     test = split['test']
     matcher = DTMatcher(name='DecisionTree')
     matcher.fit(
         table=train,
         exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
         target_attr='gold',
     )
     _ = matcher.predict(
         table="",
         exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
         target_attr='predicted',
         append=True,
     )
Ejemplo n.º 8
0
 def test_ml_matcher_target_attr_not_present_fit(self):
     """Call ``fit`` with target attribute 'gold1' instead of 'gold'.

     Every other fit in this file targets 'gold'; 'gold1' is never
     added to the table here.
     NOTE(review): no assertion is made; presumably the call is expected
     to raise and the check lives outside this view -- confirm.
     """
     left = read_csv_metadata(fpath_a, key='id')
     right = read_csv_metadata(fpath_b, key='id')
     fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
     split = mu.split_train_test(fvs)
     train = split['train']
     test = split['test']
     matcher = DTMatcher(name='DecisionTree')

     # Strip the foreign-key columns from both splits, one at a time.
     for frame in (train, test):
         for fk_col in ('ltable.id', 'rtable.id'):
             frame.drop(fk_col, axis=1, inplace=True)

     matcher.fit(table=train, exclude_attrs='_id', target_attr='gold1')
 def test_ml_matcher_target_attr_not_present_fit(self):
     """Fit a decision-tree matcher against the target name 'gold1'.

     NOTE(review): 'gold1' is not created anywhere in this body and the
     body asserts nothing; presumably an error is expected and checked
     outside this view -- confirm.
     """
     ltbl = read_csv_metadata(fpath_a, key='id')
     rtbl = read_csv_metadata(fpath_b, key='id')
     feature_vecs = read_csv_metadata(fpath_f, ltable=ltbl, rtable=rtbl)
     parts = mu.split_train_test(feature_vecs)
     train, test = parts['train'], parts['test']
     tree = DTMatcher(name='DecisionTree')

     fk_columns = ('ltable.id', 'rtable.id')
     for col in fk_columns:
         train.drop(col, axis=1, inplace=True)
     for col in fk_columns:
         test.drop(col, axis=1, inplace=True)

     tree.fit(table=train, exclude_attrs='_id', target_attr='gold1')
Ejemplo n.º 10
0
 def test_ml_matcher_invalid_df_predict(self):
     """Fit normally, then hand ``predict`` a string as its table.

     NOTE(review): the body asserts nothing; presumably ``predict`` is
     expected to raise on ``table=""`` and that is verified outside this
     view -- confirm.
     """
     ltbl = read_csv_metadata(fpath_a, key='id')
     rtbl = read_csv_metadata(fpath_b, key='id')
     feature_vecs = read_csv_metadata(fpath_f, ltable=ltbl, rtable=rtbl)
     parts = mu.split_train_test(feature_vecs)
     train, test = parts['train'], parts['test']
     tree = DTMatcher(name='DecisionTree')
     tree.fit(table=train,
              exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
              target_attr='gold')
     _ = tree.predict(table="",
                      exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                      target_attr='predicted',
                      append=True)
    def test_ml_matcher_valid_1(self):
        """Fit and predict through the table/exclude_attrs interface.

        Checks that the predictions have one row per test row, that all
        prediction columns are also columns of the test split after the
        call, and that the last column is 'predicted'.
        """
        left = read_csv_metadata(fpath_a, key='id')
        right = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
        split = mu.split_train_test(fvs)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')
        matcher.fit(
            table=train,
            exclude_attrs=['ltable.id', 'rtable.id', '_id'],
            target_attr='gold',
        )
        predictions = matcher.predict(
            table=test,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            target_attr='predicted',
            append=True,
        )

        self.assertEqual(len(predictions), len(test))
        self.assertTrue(set(predictions.columns).issubset(test.columns))
        last_col = predictions.columns[-1]
        self.assertEqual(last_col, 'predicted')
    def test_ml_matcher_valid_2(self):
        """Fit and predict through the x/y interface.

        The feature columns are all columns of the feature-vector table
        except its key, its two foreign keys and 'gold'.  Checks that
        ``predict`` returns one prediction per test row.
        """
        left = read_csv_metadata(fpath_a, key='id')
        right = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
        split = mu.split_train_test(fvs)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')

        all_cols = list(fvs.columns)
        non_feature_cols = [
            cm.get_key(fvs),
            cm.get_fk_ltable(fvs),
            cm.get_fk_rtable(fvs),
            'gold',
        ]
        feature_cols = list_diff(all_cols, non_feature_cols)
        features = train[feature_cols]
        target = train['gold']

        matcher.fit(x=features, y=target)
        predictions = matcher.predict(test[feature_cols])
        self.assertEqual(len(predictions), len(test))
    def test_ml_matcher_append_false_predict(self):
        """Predict with ``append=False``.

        Checks only that the number of predictions equals the number of
        rows in the test split.
        """
        left = read_csv_metadata(fpath_a, key='id')
        right = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
        split = mu.split_train_test(fvs)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')

        # Strip the foreign-key columns from both splits, one at a time.
        for frame in (train, test):
            for fk_col in ('ltable.id', 'rtable.id'):
                frame.drop(fk_col, axis=1, inplace=True)
        test.drop('gold', axis=1, inplace=True)

        matcher.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = matcher.predict(
            table=test,
            exclude_attrs='_id',
            target_attr='predicted',
            append=False,
        )

        self.assertEqual(len(predictions), len(test))
Ejemplo n.º 14
0
    def test_ml_matcher_valid_with_id_in_y(self):
        """Fit through the x/y interface when ``y`` also carries '_id'.

        The feature columns are all columns except the two foreign keys
        and 'gold'; ``y`` is the two-column frame ['_id', 'gold'].
        Checks that ``predict`` returns one prediction per test row.
        """
        left = read_csv_metadata(fpath_a, key='id')
        right = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
        split = mu.split_train_test(fvs)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')

        all_cols = list(fvs.columns)
        dropped_cols = [cm.get_fk_ltable(fvs), cm.get_fk_rtable(fvs), 'gold']
        feature_cols = list_diff(all_cols, dropped_cols)
        features = train[feature_cols]
        target = train[['_id', 'gold']]

        matcher.fit(x=features, y=target)
        predictions = matcher.predict(test[feature_cols])
        self.assertEqual(len(predictions), len(test))
 def test_train_test_split_valid_1(self):
     """Split a candidate set and compare sizes and catalog properties.

     Checks that the train and test parts together have as many rows as
     the input, and that ``cm.get_all_properties`` returns equal results
     for the input and for each part.
     """
     left = read_csv_metadata(path_a)
     right = read_csv_metadata(path_b, key='ID')
     candset = read_csv_metadata(path_c, ltable=left, rtable=right)
     split = mu.split_train_test(candset)
     train, test = split['train'], split['test']
     self.assertEqual(len(train) + len(test), len(candset))

     # Compare against the parent's own properties rather than a
     # hand-built expected dictionary.
     parent_props = cm.get_all_properties(candset)
     train_props = cm.get_all_properties(train)
     test_props = cm.get_all_properties(test)
     self.assertTrue(parent_props == train_props)
     self.assertTrue(parent_props == test_props)
Ejemplo n.º 16
0
 def test_train_test_split_valid_1(self):
     """Check that splitting keeps the row count and the properties.

     The row counts of the two parts must add up to the input's, and
     ``cm.get_all_properties`` must report the same value for the input,
     the train part and the test part.
     """
     ltbl = read_csv_metadata(path_a)
     rtbl = read_csv_metadata(path_b, key='ID')
     pairs = read_csv_metadata(path_c, ltable=ltbl, rtable=rtbl)
     parts = mu.split_train_test(pairs)
     train = parts['train']
     test = parts['test']
     total_rows = len(train) + len(test)
     self.assertEqual(total_rows, len(pairs))

     # The input's properties are the reference for both parts.
     expected = cm.get_all_properties(pairs)
     got_train = cm.get_all_properties(train)
     got_test = cm.get_all_properties(test)
     self.assertTrue(expected == got_train)
     self.assertTrue(expected == got_test)
    def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
        """Run the random-forest debugger with the label column '_predicted'.

        The label column is stored as '_predicted' and passed as the
        target attribute.  The body makes no assertion; it only
        exercises the call with ``show_window=False``.
        """
        left = read_csv_metadata(path_a)
        right = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=left, rtable=right)
        # 7 non-matches followed by 8 matches.
        candset['_predicted'] = [0] * 7 + [1] * 8

        feats = get_features_for_matching(left, right)
        fvs = extract_feature_vecs(
            candset,
            feature_table=feats,
            attrs_after='_predicted',
        )

        matcher = RFMatcher()
        split = mu.split_train_test(fvs)
        train, test = split['train'], split['test']

        excluded = ['_id', 'ltable_ID', 'rtable_ID']
        _vis_debug_rf(
            matcher,
            train,
            test,
            exclude_attrs=excluded,
            target_attr='_predicted',
            show_window=False,
        )
    def test_vis_debug_matcher_dt_tar_attr_notin_train(self):
        """Run the decision-tree debugger with target attribute 'labels1'.

        The label column added to the data is 'labels'; 'labels1' is
        never created in this body.
        NOTE(review): no assertion is made; presumably the call is
        expected to raise and the check lives outside this view --
        confirm.
        """
        left = read_csv_metadata(path_a)
        right = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=left, rtable=right)
        # 7 non-matches followed by 8 matches.
        candset['labels'] = [0] * 7 + [1] * 8

        feats = get_features_for_matching(left, right)
        fvs = extract_feature_vecs(
            candset,
            feature_table=feats,
            attrs_after='labels',
        )

        matcher = DTMatcher()
        split = mu.split_train_test(fvs)
        train, test = split['train'], split['test']

        excluded = ['_id', 'ltable_ID', 'rtable_ID', 'labels']
        _vis_debug_dt(
            matcher,
            train,
            test,
            exclude_attrs=excluded,
            target_attr='labels1',
            show_window=False,
        )
    def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
        """Debug a random forest when an excluded column is absent from test.

        '_id' is removed from the test split before the call, yet it
        remains in ``exclude_attrs``.
        NOTE(review): the body asserts nothing; presumably the expected
        outcome is checked outside this view -- confirm.
        """
        ltbl = read_csv_metadata(path_a)
        rtbl = read_csv_metadata(path_b, key='ID')
        pairs = read_csv_metadata(path_c, ltable=ltbl, rtable=rtbl)
        # 7 non-matches followed by 8 matches.
        pairs['labels'] = [0] * 7 + [1] * 8

        feature_defs = get_features_for_matching(ltbl, rtbl)
        feature_vecs = extract_feature_vecs(
            pairs,
            feature_table=feature_defs,
            attrs_after='labels',
        )

        forest = RFMatcher()
        parts = mu.split_train_test(feature_vecs)
        train = parts['train']
        test = parts['test']

        test.drop('_id', inplace=True, axis=1)
        _vis_debug_rf(
            forest,
            train,
            test,
            exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
            target_attr='labels',
            show_window=False,
        )
Ejemplo n.º 20
0
    def test_ml_matcher_target_attr_present_in_ex_attrs(self):
        """Fit with the target attribute also listed in ``exclude_attrs``.

        'gold' is both the target and one of the excluded attributes.
        Checks that the predictions have one row per test row, add no
        column that the test split lacks after the call, and end with
        the 'predicted' column.
        """
        left = read_csv_metadata(fpath_a, key='id')
        right = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
        split = mu.split_train_test(fvs)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')
        matcher.fit(table=train,
                    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                    target_attr='gold')
        predictions = matcher.predict(
            table=test,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            target_attr='predicted',
            append=True)

        self.assertEqual(len(predictions), len(test))
        extra_cols = set(predictions.columns) - set(test.columns)
        self.assertEqual(len(extra_cols), 0)
        last_col = predictions.columns[-1]
        self.assertEqual(last_col, 'predicted')
    def test_ml_matcher_ex_attrs_not_list(self):
        """Fit with ``exclude_attrs`` given as a single string.

        ``fit`` receives the plain string '_id'; ``predict`` receives
        the list ['_id', 'gold'].  Checks that the predictions have one
        row per test row, add no column that the test split lacks after
        the call, and end with the 'predicted' column.
        """
        left = read_csv_metadata(fpath_a, key='id')
        right = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=left, rtable=right)
        split = mu.split_train_test(fvs)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')

        # Strip the foreign-key columns from both splits, one at a time.
        for frame in (train, test):
            for fk_col in ('ltable.id', 'rtable.id'):
                frame.drop(fk_col, axis=1, inplace=True)

        matcher.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = matcher.predict(
            table=test,
            exclude_attrs=['_id', 'gold'],
            target_attr='predicted',
            append=True,
        )

        self.assertEqual(len(predictions), len(test))
        extra_cols = set(predictions.columns) - set(test.columns)
        self.assertEqual(len(extra_cols), 0)
        last_col = predictions.columns[-1]
        self.assertEqual(last_col, 'predicted')
    def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
        """Debug a random forest whose label column is named '_predicted'.

        '_predicted' is both the column the labels are stored in and the
        target attribute given to ``_vis_debug_rf``.  No assertion is
        made; the call is only exercised with ``show_window=False``.
        """
        ltbl = read_csv_metadata(path_a)
        rtbl = read_csv_metadata(path_b, key='ID')
        pairs = read_csv_metadata(path_c, ltable=ltbl, rtable=rtbl)
        # 7 non-matches followed by 8 matches.
        pairs['_predicted'] = [0] * 7 + [1] * 8

        feature_defs = get_features_for_matching(ltbl, rtbl)
        feature_vecs = extract_feature_vecs(pairs, feature_table=feature_defs,
                                            attrs_after='_predicted')

        forest = RFMatcher()
        parts = mu.split_train_test(feature_vecs)
        train = parts['train']
        test = parts['test']

        _vis_debug_rf(forest, train, test,
                      exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
                      target_attr='_predicted', show_window=False)
 def test_train_test_split_invalid_df(self):
     """Call ``mu.split_train_test`` with ``None`` instead of a table.

     NOTE(review): no assertion is made; presumably the call is expected
     to raise and the check lives outside this view -- confirm.
     """
     not_a_table = None
     mu.split_train_test(not_a_table)
Ejemplo n.º 24
0
 def test_train_test_split_invalid_df(self):
     """Hand ``None`` to the train/test splitter.

     NOTE(review): the body asserts nothing; presumably an error is
     expected and verified outside this view -- confirm.
     """
     missing_table = None
     mu.split_train_test(missing_table)