Python DTMatcherの例、py_entitymatching.matcher.dtmatcher.DTMatcher Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

    def test_ml_matcher_return_probs_true_predict_diff_colname(self):
        """Predict with ``return_probs=True`` and custom output column names.

        Fits a decision tree on the training split, predicts on the test
        split with ``inplace=False`` / ``append=True``, and checks that the
        result is a distinct table whose last two columns are ``'predicted'``
        and ``'probas'``, with every probability inside ``[0, 1]``.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
        splits = mu.split_train_test(fvs)
        train, test = splits['train'], splits['test']
        dt = DTMatcher(name='DecisionTree')
        # Drop the id columns from both splits, then the label from the
        # test split, before fitting and predicting.
        for frame, columns in ((train, ('ltable.id', 'rtable.id')),
                               (test, ('ltable.id', 'rtable.id', 'gold'))):
            for column in columns:
                frame.drop(column, axis=1, inplace=True)
        dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = dt.predict(
            table=test,
            exclude_attrs='_id',
            target_attr='predicted',
            probs_attr='probas',
            inplace=False,
            append=True,
            return_probs=True)

        # inplace=False must hand back a different object of the same length
        # that still carries every column of the input.
        self.assertIsNot(predictions, test)
        self.assertEqual(len(predictions), len(test))
        self.assertTrue(
            set(test.columns).issubset(list(predictions.columns)))

        n_cols = len(predictions.columns)
        p_col = predictions.columns[n_cols - 2]
        self.assertEqual(p_col, 'predicted')

        r_col = predictions.columns[n_cols - 1]
        self.assertEqual(r_col, 'probas')

        probs = predictions[r_col]
        within_unit_interval = (probs >= 0.0) & (probs <= 1.0)
        self.assertEqual(sum(within_unit_interval), len(predictions))

コード例 #2

0

ファイルを表示

    def test_ml_matcher_inplace_false_predict(self):
        """Predict with ``inplace=False`` and check a new table is returned.

        The returned table must be a different object from the input, have
        the same number of rows, keep all input columns, and end with the
        ``'predicted'`` column.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
        splits = mu.split_train_test(fvs)
        train, test = splits['train'], splits['test']
        dt = DTMatcher(name='DecisionTree')
        # Drop the id columns from both splits, then the label from the
        # test split.
        for frame, columns in ((train, ('ltable.id', 'rtable.id')),
                               (test, ('ltable.id', 'rtable.id', 'gold'))):
            for column in columns:
                frame.drop(column, axis=1, inplace=True)
        dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = dt.predict(table=test, exclude_attrs='_id',
                                 target_attr='predicted',
                                 inplace=False, append=True)

        self.assertIsNot(predictions, test)
        self.assertEqual(len(predictions), len(test))
        input_cols = set(test.columns)
        self.assertTrue(input_cols.issubset(list(predictions.columns)))
        last_col = predictions.columns[len(predictions.columns) - 1]
        self.assertEqual(last_col, 'predicted')

コード例 #3

0

ファイルを表示

 def test_ml_matcher_invalid_input_combn_fit(self):
     """Call ``fit`` with both ``x`` and ``table`` supplied at once.

     NOTE(review): no assertion is made here; presumably the expected
     failure is declared outside this excerpt (e.g. by a decorator) -
     confirm against the full test module.
     """
     ltable = read_csv_metadata(fpath_a, key='id')
     rtable = read_csv_metadata(fpath_b, key='id')
     fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
     splits = mu.split_train_test(fvs)
     train, _ = splits['train'], splits['test']
     matcher = DTMatcher(name='DecisionTree')
     matcher.fit(x=train, table=train)

コード例 #4

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

 def test_ml_matcher_invalid_input_combn_fit(self):
     """Exercise ``fit`` with the ``x`` and ``table`` arguments combined.

     NOTE(review): the method asserts nothing itself; the expected error
     is presumably declared outside this excerpt - confirm.
     """
     left = read_csv_metadata(fpath_a, key='id')
     right = read_csv_metadata(fpath_b, key='id')
     vectors = read_csv_metadata(fpath_f, ltable=left, rtable=right)
     parts = mu.split_train_test(vectors)
     train = parts['train']
     _ = parts['test']  # looked up as in the original, but never used
     DTMatcher(name='DecisionTree').fit(x=train, table=train)

コード例 #5

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

 def test_ml_matcher_invalid_df_predict(self):
     """Fit normally, then call ``predict`` with a string as ``table``.

     NOTE(review): no assertion is made here; presumably the expected
     failure is declared outside this excerpt - confirm.
     """
     ltable = read_csv_metadata(fpath_a, key='id')
     rtable = read_csv_metadata(fpath_b, key='id')
     fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
     splits = mu.split_train_test(fvs)
     train, _ = splits['train'], splits['test']
     matcher = DTMatcher(name='DecisionTree')
     matcher.fit(
         table=train,
         exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
         target_attr='gold')
     # An empty string stands in for the table argument.
     matcher.predict(
         table="",
         exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
         target_attr='predicted',
         append=True)

コード例 #6

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

 def test_ml_matcher_target_attr_not_present_fit(self):
     """Call ``fit`` with ``target_attr='gold1'``.

     NOTE(review): presumably 'gold1' is absent from the training table
     and the expected failure is declared outside this excerpt - confirm.
     """
     ltable = read_csv_metadata(fpath_a, key='id')
     rtable = read_csv_metadata(fpath_b, key='id')
     fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
     splits = mu.split_train_test(fvs)
     train, test = splits['train'], splits['test']
     matcher = DTMatcher(name='DecisionTree')
     # Remove the id columns from both splits (train first).
     for frame in (train, test):
         for column in ('ltable.id', 'rtable.id'):
             frame.drop(column, axis=1, inplace=True)
     matcher.fit(table=train, exclude_attrs='_id', target_attr='gold1')

コード例 #7

0

ファイルを表示

 def test_ml_matcher_target_attr_not_present_fit(self):
     """Fit a decision tree while naming ``'gold1'`` as the target column.

     NOTE(review): the method asserts nothing itself; the expected error
     is presumably declared outside this excerpt - confirm.
     """
     left = read_csv_metadata(fpath_a, key='id')
     right = read_csv_metadata(fpath_b, key='id')
     vectors = read_csv_metadata(fpath_f, ltable=left, rtable=right)
     parts = mu.split_train_test(vectors)
     train = parts['train']
     test = parts['test']
     tree = DTMatcher(name='DecisionTree')
     id_columns = ('ltable.id', 'rtable.id')
     for column in id_columns:
         train.drop(column, axis=1, inplace=True)
     for column in id_columns:
         test.drop(column, axis=1, inplace=True)
     tree.fit(table=train, exclude_attrs='_id', target_attr='gold1')

コード例 #8

0

ファイルを表示

 def test_ml_matcher_invalid_df_predict(self):
     """Pass a non-DataFrame (an empty string) as ``table`` to ``predict``.

     NOTE(review): the method asserts nothing itself; the expected error
     is presumably declared outside this excerpt - confirm.
     """
     left = read_csv_metadata(fpath_a, key='id')
     right = read_csv_metadata(fpath_b, key='id')
     vectors = read_csv_metadata(fpath_f, ltable=left, rtable=right)
     parts = mu.split_train_test(vectors)
     train = parts['train']
     _ = parts['test']  # looked up as in the original, but never used
     tree = DTMatcher(name='DecisionTree')
     tree.fit(table=train,
              exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
              target_attr='gold')
     tree.predict(table="",
                  exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                  target_attr='predicted',
                  append=True)

コード例 #9

0

ファイルを表示

    def test_select_matcher_invalid_no_display_drill_down(self):
        """Look up drill-down stats for a metric that was not displayed.

        ``select_matcher`` is asked to display only ``'precision'``; the
        test then indexes the drill-down stats with ``'recall'``.

        NOTE(review): no assertion is made here; presumably the expected
        failure is declared outside this excerpt - confirm.
        """
        ltable = read_csv_metadata(path_a, key='id')
        rtable = read_csv_metadata(path_b, key='id')
        fvs = read_csv_metadata(path_f, ltable=ltable, rtable=rtable)
        candidates = [
            DTMatcher(),
            NBMatcher(),
            RFMatcher(),
            SVMMatcher(),
            LinRegMatcher(),
            LogRegMatcher(),
        ]

        result = select_matcher(
            candidates, x=None, y=None, table=fvs,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            metrics_to_display=['precision'],
            target_attr='gold', k=7)
        _ = result['drill_down_cv_stats']['recall']

コード例 #10

0

ファイルを表示

    def test_select_matcher_valid_cv_stats_3(self):
        """Select a matcher by recall and check the ``cv_stats`` summary.

        Verifies that the first two ``cv_stats`` columns are ``'Matcher'``
        and ``'Average recall'`` and that the selected matcher's average
        recall equals the largest mean score in the recall drill-down.
        """
        ltable = read_csv_metadata(path_a, key='id')
        rtable = read_csv_metadata(path_b, key='id')
        fvs = read_csv_metadata(path_f, ltable=ltable, rtable=rtable)
        candidates = [
            DTMatcher(),
            NBMatcher(),
            RFMatcher(),
            SVMMatcher(),
            LinRegMatcher(),
            LogRegMatcher(),
        ]

        result = select_matcher(
            candidates, x=None, y=None, table=fvs,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            metric_to_select_matcher='recall',
            metrics_to_display='recall',
            target_attr='gold', k=7)
        expected_header = ['Matcher', 'Average recall']
        summary = result['cv_stats']
        recall_stats = result['drill_down_cv_stats']['recall']
        leading_cols = set(summary.columns[[0, 1]])
        self.assertTrue(set(expected_header) == leading_cols)
        by_matcher = summary.set_index('Matcher')
        chosen_name = result['selected_matcher'].name
        p_max = by_matcher.loc[chosen_name, 'Average recall']
        a_max = np.max(recall_stats['Mean score'])
        self.assertEqual(p_max, a_max)

コード例 #11

0

ファイルを表示

    def test_select_matcher_valid_1(self):
        """Select a matcher with default metrics and check precision stats.

        Verifies the leading header columns and the trailing 'Mean score'
        column of the precision drill-down table, and that the selected
        matcher has the highest mean precision.
        """
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                    LinRegMatcher(), LogRegMatcher()]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                                target_attr='gold', k=7)
        header = ['Name', 'Matcher', 'Num folds']
        result_df = result['drill_down_cv_stats']['precision']
        self.assertEqual(set(header), set(result_df.columns[[0, 1, 2]]))
        self.assertEqual('Mean score', result_df.columns[-1])
        d = result_df.set_index('Name')
        p_max = d.loc[result['selected_matcher'].name, 'Mean score']
        # Series.max() replaces pd.np.max(): the pandas.np alias was
        # deprecated in pandas 1.0 and no longer exists in pandas 2.x.
        a_max = d['Mean score'].max()
        self.assertEqual(p_max, a_max)

コード例 #12

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

    def test_ml_matcher_valid_1(self):
        """Fit and predict through the table-based API with ``append=True``.

        Checks that the prediction table has one row per test row, that
        its columns are all columns of ``test``, and that the last column
        is ``'predicted'``.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
        splits = mu.split_train_test(fvs)
        train, test = splits['train'], splits['test']
        dt = DTMatcher(name='DecisionTree')
        dt.fit(table=train,
               exclude_attrs=['ltable.id', 'rtable.id', '_id'],
               target_attr='gold')
        predictions = dt.predict(
            table=test,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            target_attr='predicted',
            append=True)

        self.assertEqual(len(predictions), len(test))
        predicted_cols = set(predictions.columns)
        self.assertTrue(predicted_cols.issubset(list(test.columns)))
        last_col = predictions.columns[len(predictions.columns) - 1]
        self.assertEqual(last_col, 'predicted')

コード例 #13

0

ファイルを表示

    def test_select_matcher_valid_multiple_metrics(self):
        """Check the drill-down tables for precision, f1 and recall.

        Each table must start with the 'Name', 'Matcher' and 'Num folds'
        columns and end with 'Mean score'. The selected matcher must have
        the highest mean precision.
        """
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                    LinRegMatcher(), LogRegMatcher()]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                                target_attr='gold', k=7)
        header = ['Name', 'Matcher', 'Num folds']
        drill_down = result['drill_down_cv_stats']
        # Looping checks each metric's own frame. The original recall check
        # inspected the precision frame's last column by mistake.
        for metric in ('precision', 'f1', 'recall'):
            stats = drill_down[metric]
            self.assertEqual(set(header), set(stats.columns[[0, 1, 2]]))
            self.assertEqual('Mean score', stats.columns[-1])
        d = drill_down['precision'].set_index('Name')
        p_max = d.loc[result['selected_matcher'].name, 'Mean score']
        # Series.max() replaces pd.np.max(): the pandas.np alias was
        # deprecated in pandas 1.0 and no longer exists in pandas 2.x.
        a_max = d['Mean score'].max()
        self.assertEqual(p_max, a_max)

コード例 #14

0

ファイルを表示

 def test_select_matcher_valid_2(self):
     """Select a matcher through the ``x``/``y`` API and check precision stats.

     The feature matrix is every column except the key, the two foreign
     keys and 'gold'; the labels are the 'gold' column.
     """
     A = read_csv_metadata(path_a, key='id')
     B = read_csv_metadata(path_b, key='id')
     feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
     matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                 LinRegMatcher(), LogRegMatcher()]
     col_list = list(feature_vectors.columns)
     feature_cols = list_diff(col_list, [cm.get_key(feature_vectors),
                                         cm.get_fk_ltable(feature_vectors),
                                         cm.get_fk_rtable(feature_vectors),
                                         'gold'])
     X = feature_vectors[feature_cols]
     Y = feature_vectors['gold']
     result = select_matcher(matchers, x=X, y=Y)
     header = ['Name', 'Matcher', 'Num folds']
     result_df = result['drill_down_cv_stats']['precision']
     self.assertEqual(set(header), set(result_df.columns[[0, 1, 2]]))
     self.assertEqual('Mean score', result_df.columns[-1])
     d = result_df.set_index('Name')
     p_max = d.loc[result['selected_matcher'].name, 'Mean score']
     # Series.max() replaces pd.np.max(): the pandas.np alias was
     # deprecated in pandas 1.0 and no longer exists in pandas 2.x.
     a_max = d['Mean score'].max()
     self.assertEqual(p_max, a_max)

コード例 #15

0

ファイルを表示

ファイル: _test_debug_matcher_dt.py プロジェクト: stevemandala/py_entitymatching

    def test_vis_debug_matcher_dt_valid_1(self):
        """Run ``_vis_debug_dt`` on a train/test split with no window shown.

        Feature vectors are extracted from candidate set C, which is
        labelled with seven 0s followed by eight 1s.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        candset['labels'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        fvs = extract_feature_vecs(candset, feature_table=features,
                                   attrs_after='labels')

        matcher = DTMatcher()
        splits = mu.split_train_test(fvs)
        train_part = splits['train']
        test_part = splits['test']

        _vis_debug_dt(matcher, train_part, test_part,
                      exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                      target_attr='labels',
                      show_window=False)

コード例 #16

0

ファイルを表示

ファイル: _test_debug_matcher_dt.py プロジェクト: anhaidgroup/py_entitymatching

    def test_visualize_tree_invalid_df(self):
        """Call ``visualize_tree`` with the fitted classifier and a column index.

        NOTE(review): no assertion is made here; judging by the name, the
        call is presumably expected to fail, with the expectation declared
        outside this excerpt - confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        candset['labels'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        fvs = extract_feature_vecs(candset, feature_table=features,
                                   attrs_after='labels')
        matcher = DTMatcher()
        matcher.fit(
            table=fvs,
            exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
            target_attr='labels')
        visualize_tree(
            matcher.clf,
            fvs.columns,
            exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])

コード例 #17

0

ファイルを表示

 def test_select_matcher_target_attr_not_series(self):
     """Call ``select_matcher`` with ``y`` given as a one-column DataFrame.

     ``Y`` is built with a list selector (``[['gold']]``), so it is a
     DataFrame rather than a Series.

     NOTE(review): no assertion is made here; presumably the expected
     failure is declared outside this excerpt - confirm.
     """
     ltable = read_csv_metadata(path_a, key='id')
     rtable = read_csv_metadata(path_b, key='id')
     # The feature vectors are loaded from a precomputed CSV.
     fvs = read_csv_metadata(path_f, ltable=ltable, rtable=rtable)
     candidates = [
         DTMatcher(),
         NBMatcher(),
         RFMatcher(),
         SVMMatcher(),
         LinRegMatcher(),
         LogRegMatcher(),
     ]
     all_cols = list(fvs.columns)
     excluded = [cm.get_fk_ltable(fvs), cm.get_fk_rtable(fvs), 'gold']
     feature_cols = list_diff(all_cols, excluded)
     X = fvs[feature_cols]
     Y = fvs[['gold']]
     _ = select_matcher(candidates, x=X, y=Y)

コード例 #18

0

ファイルを表示

ファイル: _test_debug_matcher_dt.py プロジェクト: anhaidgroup/py_entitymatching

    def test_debug_dt_matcher_valid(self):
        """Run ``debug_decisiontree_matcher`` on one tuple from each table.

        A decision tree is fitted on feature vectors extracted from the
        labelled candidate set C, then debugged with row 1 of A and row 2
        of B.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        labels = [0] * 7
        labels.extend([1] * 8)
        C['labels'] = labels

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='labels')
        dt = DTMatcher()
        dt.fit(table=feature_vectors, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        # Assumes A and B keep read_csv's default integer index, where .ix
        # was label-based - confirm.
        debug_decisiontree_matcher(dt, A.loc[1], B.loc[2], feature_table=feature_table,
                                   table_columns=feature_vectors.columns,
                                   exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])

コード例 #19

0

ファイルを表示

ファイル: _test_debug_matcher_dt.py プロジェクト: stevemandala/py_entitymatching

 def test_vis_debug_matcher_dt_invalid_tar_attr(self):
     """Call ``_vis_debug_dt`` with empty frames and ``target_attr=None``.

     NOTE(review): no assertion is made here; presumably the expected
     failure is declared outside this excerpt - confirm.
     """
     matcher = DTMatcher()
     empty_train = pd.DataFrame()
     empty_test = pd.DataFrame()
     _vis_debug_dt(matcher, empty_train, empty_test,
                   exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                   target_attr=None,
                   show_window=False)

コード例 #20

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

    def test_ml_matcher_valid_2(self):
        """Fit and predict through the ``x``/``y`` API.

        Features are every column except the key, the two foreign keys
        and ``'gold'``; one prediction is expected per test row.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
        splits = mu.split_train_test(fvs)
        train, test = splits['train'], splits['test']
        dt = DTMatcher(name='DecisionTree')

        all_cols = list(fvs.columns)
        excluded = [cm.get_key(fvs), cm.get_fk_ltable(fvs),
                    cm.get_fk_rtable(fvs), 'gold']
        feature_cols = list_diff(all_cols, excluded)
        X = train[feature_cols]
        Y = train['gold']

        dt.fit(x=X, y=Y)
        predictions = dt.predict(test[feature_cols])
        self.assertEqual(len(predictions), len(test))

コード例 #21

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

    def test_ml_matcher_append_false_predict(self):
        """Predict with ``append=False`` and check the prediction count.

        The result must contain one prediction per row of the test split.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
        splits = mu.split_train_test(fvs)
        train, test = splits['train'], splits['test']
        dt = DTMatcher(name='DecisionTree')
        # Drop the id columns from both splits, then the label from the
        # test split.
        for frame, columns in ((train, ('ltable.id', 'rtable.id')),
                               (test, ('ltable.id', 'rtable.id', 'gold'))):
            for column in columns:
                frame.drop(column, axis=1, inplace=True)
        dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = dt.predict(
            table=test,
            exclude_attrs='_id',
            target_attr='predicted',
            append=False)

        self.assertEqual(len(predictions), len(test))

コード例 #22

0

ファイルを表示

    def test_ml_matcher_valid_with_id_in_y(self):
        """Fit via ``x``/``y`` where ``y`` carries ``'_id'`` next to ``'gold'``.

        Only the foreign keys and ``'gold'`` are removed from the feature
        columns, so the key column stays in ``X``; one prediction is
        expected per test row.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
        splits = mu.split_train_test(fvs)
        train, test = splits['train'], splits['test']
        dt = DTMatcher(name='DecisionTree')

        all_cols = list(fvs.columns)
        excluded = [cm.get_fk_ltable(fvs), cm.get_fk_rtable(fvs), 'gold']
        feature_cols = list_diff(all_cols, excluded)
        X = train[feature_cols]
        Y = train[['_id', 'gold']]

        dt.fit(x=X, y=Y)
        predictions = dt.predict(test[feature_cols])
        self.assertEqual(len(predictions), len(test))

コード例 #23

0

ファイルを表示

ファイル: _test_debug_matcher_dt.py プロジェクト: anhaidgroup/py_entitymatching

    def test_vis_tuple_debug_dt_matcher_valid_2(self):
        """Run ``vis_tuple_debug_dt_matcher`` with the raw classifier.

        The first feature-vector row is turned into a one-row DataFrame
        and passed together with ``dt.clf``.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        labels = [0] * 7
        labels.extend([1] * 8)
        C['labels'] = labels

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='labels')

        dt = DTMatcher()
        dt.fit(table=feature_vectors, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        # Assumes feature_vectors has an integer index containing label 0,
        # where .ix was label-based - confirm.
        s = pd.DataFrame(feature_vectors.loc[0])
        # Transpose the single-column frame into a single-row frame.
        s1 = s.T
        vis_tuple_debug_dt_matcher(dt.clf, s1,
                                   exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])

コード例 #24

0

ファイルを表示

ファイル: _test_debug_matcher_dt.py プロジェクト: stevemandala/py_entitymatching

    def test_visualize_tree_invalid_df(self):
        """Pass a column index, not a table, to ``visualize_tree``.

        NOTE(review): the method asserts nothing itself; the expected error
        is presumably declared outside this excerpt - confirm.
        """
        left = read_csv_metadata(path_a)
        right = read_csv_metadata(path_b, key='ID')
        pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
        pair_labels = [0] * 7
        pair_labels += [1] * 8
        pairs['labels'] = pair_labels

        feats = get_features_for_matching(left, right)
        vectors = extract_feature_vecs(pairs, feature_table=feats,
                                       attrs_after='labels')
        tree = DTMatcher()
        tree.fit(table=vectors,
                 exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                 target_attr='labels')
        visualize_tree(tree.clf, vectors.columns,
                       exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])

コード例 #25

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

    def test_ml_matcher_ex_attrs_not_list(self):
        """Fit with ``exclude_attrs`` given as a plain string.

        ``fit`` receives ``'_id'`` rather than a list; ``predict`` is then
        called with a list. The predictions must add no column that is
        missing from ``test`` and must end with ``'predicted'``.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
        splits = mu.split_train_test(fvs)
        train, test = splits['train'], splits['test']
        dt = DTMatcher(name='DecisionTree')
        # Remove the id columns from both splits (train first).
        for frame in (train, test):
            for column in ('ltable.id', 'rtable.id'):
                frame.drop(column, axis=1, inplace=True)
        dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = dt.predict(
            table=test,
            exclude_attrs=['_id', 'gold'],
            target_attr='predicted',
            append=True)

        self.assertEqual(len(predictions), len(test))
        extra_cols = set(predictions.columns).difference(list(test.columns))
        self.assertEqual(len(extra_cols), 0)
        last_col = predictions.columns[len(predictions.columns) - 1]
        self.assertEqual(last_col, 'predicted')

コード例 #26

0

ファイルを表示

    def test_ml_matcher_target_attr_present_in_ex_attrs(self):
        """Fit with the target attribute also listed in ``exclude_attrs``.

        ``'gold'`` is passed both as ``target_attr`` and inside
        ``exclude_attrs``. The predictions must add no column that is
        missing from ``test`` and must end with ``'predicted'``.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        fvs = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
        splits = mu.split_train_test(fvs)
        train, test = splits['train'], splits['test']
        dt = DTMatcher(name='DecisionTree')
        dt.fit(table=train,
               exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
               target_attr='gold')
        predictions = dt.predict(
            table=test,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            target_attr='predicted',
            append=True)

        self.assertEqual(len(predictions), len(test))
        extra_cols = set(predictions.columns).difference(list(test.columns))
        self.assertEqual(len(extra_cols), 0)
        last_col = predictions.columns[len(predictions.columns) - 1]
        self.assertEqual(last_col, 'predicted')

コード例 #27

0

ファイルを表示

ファイル: _test_debug_matcher_dt.py プロジェクト: stevemandala/py_entitymatching

    def test_vis_tuple_debug_dt_matcher_valid_1(self):
        """Run ``vis_tuple_debug_dt_matcher`` with the matcher object.

        The first feature-vector row is turned into a one-row DataFrame
        and passed together with ``dt`` (the matcher, not ``dt.clf``).
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        labels = [0] * 7
        labels.extend([1] * 8)
        C['labels'] = labels

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C,
                                               feature_table=feature_table,
                                               attrs_after='labels')

        dt = DTMatcher()
        dt.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        # Assumes feature_vectors has an integer index containing label 0,
        # where .ix was label-based - confirm.
        s = pd.DataFrame(feature_vectors.loc[0])
        # Transpose the single-column frame into a single-row frame.
        s1 = s.T
        vis_tuple_debug_dt_matcher(
            dt, s1, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])

コード例 #28

0

ファイルを表示

ファイル: _test_debug_matcher_dt.py プロジェクト: stevemandala/py_entitymatching

    def test_debug_dt_matcher_valid(self):
        """Run ``debug_decisiontree_matcher`` on one tuple from each table.

        A decision tree is fitted on feature vectors extracted from the
        labelled candidate set C, then debugged with row 1 of A and row 2
        of B.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        labels = [0] * 7
        labels.extend([1] * 8)
        C['labels'] = labels

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C,
                                               feature_table=feature_table,
                                               attrs_after='labels')
        dt = DTMatcher()
        dt.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        # Assumes A and B keep read_csv's default integer index, where .ix
        # was label-based - confirm.
        debug_decisiontree_matcher(
            dt,
            A.loc[1],
            B.loc[2],
            feature_table=feature_table,
            table_columns=feature_vectors.columns,
            exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])

コード例 #29

0

ファイルを表示

    def test_valid_names_for_matchers(self):
        """Check matcher names with and without an explicit ``name``.

        Matchers built with no arguments must expose a string ``name``;
        matchers built with ``name='temp'`` must report exactly that.
        """
        default_named = {
            "DT": DTMatcher(),
            "LinReg": LinRegMatcher(),
            "LogReg": LogRegMatcher(),
            "NB": NBMatcher(),
            "RF": RFMatcher(),
            "SVM": SVMMatcher(),
        }
        explicitly_named = {
            "DT": DTMatcher(name='temp'),
            "LinReg": LinRegMatcher(name='temp'),
            "LogReg": LogRegMatcher(name='temp'),
            "NB": NBMatcher(name='temp'),
            "RF": RFMatcher(name='temp'),
            "SVM": SVMMatcher(name='temp'),
        }

        for _, matcher in six.iteritems(default_named):
            self.assertTrue(isinstance(matcher.name, six.string_types))

        for _, matcher in six.iteritems(explicitly_named):
            self.assertEqual(matcher.name, 'temp')

コード例 #30

0

ファイルを表示

    def test_select_matcher_invalid_metric_to_select_matcher(self):
        """Call ``select_matcher`` with ``metric_to_select_matcher='test'``.

        NOTE(review): no assertion is made here; presumably 'test' is not
        a supported metric and the expected failure is declared outside
        this excerpt - confirm.
        """
        ltable = read_csv_metadata(path_a, key='id')
        rtable = read_csv_metadata(path_b, key='id')
        fvs = read_csv_metadata(path_f, ltable=ltable, rtable=rtable)
        candidates = [
            DTMatcher(),
            NBMatcher(),
            RFMatcher(),
            SVMMatcher(),
            LinRegMatcher(),
            LogRegMatcher(),
        ]

        _ = select_matcher(
            candidates,
            x=None,
            y=None,
            table=fvs,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            metric_to_select_matcher='test',
            target_attr='gold',
            k=7)

コード例 #31

0

ファイルを表示

 def test_ml_matcher_set_name(self):
     """``set_name`` followed by ``get_name`` returns the assigned name."""
     matcher = DTMatcher()
     matcher.set_name('Decision Tree')
     reported = matcher.get_name()
     self.assertEqual(reported, 'Decision Tree')

コード例 #32

0

ファイルを表示

 def test_ml_matcher_invalid_df_1(self):
     """Call ``fit`` with empty strings for both ``x`` and ``y``.

     NOTE(review): no assertion is made here; presumably the expected
     failure is declared outside this excerpt - confirm.
     """
     matcher = DTMatcher(name='DecisionTree')
     matcher.fit(x="", y="")

コード例 #33

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

 def test_ml_matcher_invalid_df(self):
     """Call ``fit`` with an empty string as ``table``.

     NOTE(review): no assertion is made here; presumably the expected
     failure is declared outside this excerpt - confirm.
     """
     matcher = DTMatcher(name='DecisionTree')
     matcher.fit(
         table="",
         exclude_attrs=['ltable.id', 'rtable.id', '_id'],
         target_attr='gold')

コード例 #34

0

ファイルを表示

 def test_ml_matcher_invalid_df(self):
     """Hand ``fit`` a string where a table is expected.

     NOTE(review): the method asserts nothing itself; the expected error
     is presumably declared outside this excerpt - confirm.
     """
     excluded = ['ltable.id', 'rtable.id', '_id']
     tree = DTMatcher(name='DecisionTree')
     tree.fit(table="", exclude_attrs=excluded, target_attr='gold')

コード例 #35

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

 def test_ml_matcher_set_name(self):
     """The name given to ``set_name`` is what ``get_name`` reports."""
     expected_name = 'Decision Tree'
     tree = DTMatcher()
     tree.set_name(expected_name)
     self.assertEqual(tree.get_name(), expected_name)

コード例 #36

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

 def test_ml_matcher_invalid_df_1(self):
     """Hand ``fit`` empty strings in place of the ``x`` and ``y`` inputs.

     NOTE(review): the method asserts nothing itself; the expected error
     is presumably declared outside this excerpt - confirm.
     """
     tree = DTMatcher(name='DecisionTree')
     tree.fit(x="", y="")

コード例 #37

0

ファイルを表示

ファイル: test_matcher_ml_matcher.py プロジェクト: anhaidgroup/py_entitymatching

 def test_ml_invalid_predict_sign(self):
     """Call ``predict`` with no arguments on an unfitted matcher.

     NOTE(review): no assertion is made here; presumably the expected
     failure is declared outside this excerpt - confirm.
     """
     matcher = DTMatcher(name='DecisionTree')
     matcher.predict()

コード例 #38

0

ファイルを表示

 def test_ml_invalid_predict_sign(self):
     """Invoke ``predict`` without any inputs.

     NOTE(review): the method asserts nothing itself; the expected error
     is presumably declared outside this excerpt - confirm.
     """
     tree = DTMatcher(name='DecisionTree')
     tree.predict()