def test_vis_debug_matcher_dt_valid_1(self):
        """Run _vis_debug_dt on a train/test split of labeled feature vectors.

        The method makes no assertions; it passes when the call completes
        without raising.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        # 15 labels: seven 0s followed by eight 1s.
        candset['labels'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        vectors = extract_feature_vecs(candset, feature_table=features,
                                       attrs_after='labels')

        matcher = DTMatcher()
        split = mu.split_train_test(vectors)
        train_part = split['train']
        test_part = split['test']

        # Key columns and the label column are excluded from the features.
        excluded = ['_id', 'ltable_ID', 'rtable_ID', 'labels']
        _vis_debug_dt(matcher, train_part, test_part,
                      exclude_attrs=excluded,
                      target_attr='labels',
                      show_window=False)
    def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
        """Call _vis_debug_rf when an excluded attribute is absent from test.

        '_id' is dropped from the test split yet is still listed in
        exclude_attrs.  NOTE(review): no assertion is visible in this
        method; the expected outcome is presumably checked outside it
        (e.g. by a decorator) -- confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        # 15 labels: seven 0s followed by eight 1s.
        candset['labels'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        vectors = extract_feature_vecs(candset, feature_table=features,
                                       attrs_after='labels')

        matcher = RFMatcher()
        split = mu.split_train_test(vectors)
        train_part = split['train']
        test_part = split['test']
        # Remove '_id' from the test split only; train is left untouched.
        test_part.drop('_id', axis=1, inplace=True)

        _vis_debug_rf(matcher, train_part, test_part,
                      exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                      target_attr='labels',
                      show_window=False)
 def test_extract_feature_vecs_invalid_feature_table(self):
     """Call extract_feature_vecs with feature_table=None.

     NOTE(review): nothing in this method checks for an error; given the
     test name, the expected failure is presumably asserted outside this
     block (e.g. by a decorator) -- confirm.
     """
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
     # Append an all-zero 'label' column as the last column.
     candset.insert(len(candset.columns), 'label', [0] * len(candset))
     # The generated feature table is not passed on; the call under test
     # receives None instead.
     unused_features = get_features_for_matching(
         ltable, rtable, validate_inferred_attr_types=False)
     extract_feature_vecs(candset,
                          attrs_before='ltable_name',
                          feature_table=None,
                          attrs_after=['label', '_id'])
 def test_extract_feature_vecs_invalid_feature_table(self):
     """Call extract_feature_vecs with feature_table=None.

     NOTE(review): no assertion is visible here; given the test name, the
     expected failure is presumably checked outside this block (e.g. by a
     decorator) -- confirm.
     """
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
     # Append an all-zero 'label' column as the last column.
     candset.insert(len(candset.columns), 'label', [0] * len(candset))
     # The generated feature table is not passed on; None is supplied to
     # the call under test instead.
     unused_features = get_features_for_matching(ltable, rtable)
     extract_feature_vecs(candset, attrs_before='ltable_name',
                          feature_table=None,
                          attrs_after=['label', '_id'])
Beispiel #5
0
 def test_extract_feature_vecs_invalid_attrs_after(self):
     """Call extract_feature_vecs with an attrs_after name never added to C.

     Only a 'label' column is inserted, while attrs_after asks for
     'label1'.  NOTE(review): no assertion is visible here; the expected
     failure is presumably checked outside this block (e.g. by a
     decorator) -- confirm.
     """
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
     # Append an all-zero 'label' column as the last column.
     candset.insert(len(candset.columns), 'label', [0] * len(candset))
     features = get_features_for_matching(ltable, rtable,
                                          validate_inferred_attr_types=False)
     # Same columns as the generated feature table, but with no rows.
     empty_features = pd.DataFrame(columns=features.columns)
     extract_feature_vecs(candset,
                          attrs_before='ltable_name',
                          feature_table=empty_features,
                          attrs_after=['label1', '_id'])
 def test_extract_feature_vecs_with_parralel_job_count_less_than_zero(self):
     """Extract feature vectors with n_jobs=-1 and check the output layout.

     Verifies that the result is a DataFrame, that its leading columns are
     '_id' and the two foreign keys, that 'rtable_name' sits at position 4,
     that 'label' is not the last column, and that the catalog properties
     of the result equal those of the input candidate set.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     # Append an all-zero 'label' column as the last column of C.
     C.insert(len(C.columns), 'label', [0] * len(C))
     feature_table = get_features_for_matching(
         A, B, validate_inferred_attr_types=False)
     F = extract_feature_vecs(C,
                              attrs_before=['ltable_name', 'rtable_name'],
                              feature_table=feature_table,
                              n_jobs=-1)
     # Dedicated assert methods report the offending values on failure,
     # unlike assertEqual(<bool expression>, True/False).
     self.assertIsInstance(F, pd.DataFrame)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     self.assertEqual(F.columns[4], 'rtable_name')
     # No attrs_after was requested, so 'label' must not be the last column.
     self.assertNotEqual(F.columns[-1], 'label')
     self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
    def test_visualize_tree_invalid_df(self):
        """Call visualize_tree with feature_vectors.columns as second argument.

        NOTE(review): no assertion is visible in this method; given the
        test name, passing the column index is presumably the invalid input
        and the failure is checked outside this block -- confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        # 15 labels: seven 0s followed by eight 1s.
        candset['labels'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        vectors = extract_feature_vecs(candset,
                                       feature_table=features,
                                       attrs_after='labels')
        matcher = DTMatcher()
        matcher.fit(table=vectors,
                    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                    target_attr='labels')
        # The fitted classifier is passed together with the column index
        # of the feature vectors.
        visualize_tree(
            matcher.clf,
            vectors.columns,
            exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
 def test_extract_feature_vecs_valid_8(self):
     """Extract feature vectors using an empty feature table.

     The feature table passed in has the generated table's columns but no
     rows.  Verifies that the result is a DataFrame, that its leading
     columns are '_id' and the two foreign keys, that 'label' is the last
     column, and that the catalog properties of the result equal those of
     the input candidate set.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     # Append an all-zero 'label' column as the last column of C.
     C.insert(len(C.columns), 'label', [0] * len(C))
     feature_table = get_features_for_matching(
         A, B, validate_inferred_attr_types=False)
     F = extract_feature_vecs(
         C,
         feature_table=pd.DataFrame(columns=feature_table.columns),
         attrs_after=['label', '_id'])
     # Dedicated assert methods report the offending values on failure,
     # unlike assertEqual(<bool expression>, True).
     self.assertIsInstance(F, pd.DataFrame)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     # No attrs_before is requested, so position 3 is not checked.
     self.assertEqual(F.columns[-1], 'label')
     self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
    def test_debug_dt_matcher_valid(self):
        """Fit a DTMatcher and run debug_decisiontree_matcher on one tuple pair.

        The method makes no assertions; it passes when the call completes
        without raising.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        # 15 labels: seven 0s followed by eight 1s.
        C['labels'] = [0] * 7 + [1] * 8

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='labels')
        dt = DTMatcher()
        dt.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        # DataFrame.ix was removed in pandas 1.0; .loc is its label-based
        # replacement (assumes A and B carry an integer index -- TODO
        # confirm against read_csv_metadata).
        debug_decisiontree_matcher(
            dt, A.loc[1], B.loc[2],
            feature_table=feature_table,
            table_columns=feature_vectors.columns,
            exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
    def test_debug_rf_matcher_invalid_feat_table(self):
        """Call debug_randomforest_matcher with feature_table=None.

        NOTE(review): no assertion is visible in this method; given the
        test name, the expected failure is presumably checked outside this
        block (e.g. by a decorator) -- confirm.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        # 15 labels: seven 0s followed by eight 1s.
        C['labels'] = [0] * 7 + [1] * 8

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='labels')
        rf = RFMatcher()
        rf.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')

        # DataFrame.ix was removed in pandas 1.0; .loc is its label-based
        # replacement (assumes A and B carry an integer index -- TODO
        # confirm against read_csv_metadata).
        debug_randomforest_matcher(
            rf, A.loc[1], B.loc[2],
            feature_table=None,
            table_columns=feature_vectors.columns,
            exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
 def test_extract_feature_vecs_valid_2(self):
     """Extract feature vectors with two attrs_before columns.

     Verifies that the result is a DataFrame, that its first five columns
     are '_id', the two foreign keys, 'ltable_name' and 'rtable_name', that
     'label' is not the last column, and that the catalog properties of
     the result equal those of the input candidate set.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     # Append an all-zero 'label' column as the last column of C.
     C.insert(len(C.columns), 'label', [0] * len(C))
     feature_table = get_features_for_matching(A, B)
     F = extract_feature_vecs(C,
                              attrs_before=['ltable_name', 'rtable_name'],
                              feature_table=feature_table)
     # Dedicated assert methods report the offending values on failure,
     # unlike assertEqual(<bool expression>, True/False).
     self.assertIsInstance(F, pd.DataFrame)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     self.assertEqual(F.columns[3], 'ltable_name')
     self.assertEqual(F.columns[4], 'rtable_name')
     # No attrs_after was requested, so 'label' must not be the last column.
     self.assertNotEqual(F.columns[-1], 'label')
     self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
    def test_vis_tuple_debug_rf_matcher_valid_1(self):
        """Fit an RFMatcher and run vis_tuple_debug_rf_matcher on one row.

        The method makes no assertions; it passes when the call completes
        without raising.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        # 15 labels: seven 0s followed by eight 1s.
        C['labels'] = [0] * 7 + [1] * 8

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='labels')

        rf = RFMatcher()
        rf.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        # DataFrame.ix was removed in pandas 1.0; .loc is its label-based
        # replacement (assumes feature_vectors carries an integer index
        # containing 0 -- TODO confirm).
        # Turn the row labeled 0 into a one-row DataFrame.
        s1 = pd.DataFrame(feature_vectors.loc[0]).T
        vis_tuple_debug_rf_matcher(
            rf, s1,
            exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
Beispiel #13
0
 def test_extract_feature_vecs_valid_8(self):
     """Extract feature vectors using an empty feature table.

     The feature table passed in has the generated table's columns but no
     rows.  Verifies that the result is a DataFrame, that its leading
     columns are '_id' and the two foreign keys, that 'label' is the last
     column, and that the catalog properties of the result equal those of
     the input candidate set.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     # Append an all-zero 'label' column as the last column of C.
     C.insert(len(C.columns), 'label', [0] * len(C))
     feature_table = get_features_for_matching(
         A, B, validate_inferred_attr_types=False)
     F = extract_feature_vecs(
         C,
         feature_table=pd.DataFrame(columns=feature_table.columns),
         attrs_after=['label', '_id'])
     # Dedicated assert methods report the offending values on failure,
     # unlike assertEqual(<bool expression>, True).
     self.assertIsInstance(F, pd.DataFrame)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     # No attrs_before is requested, so position 3 is not checked.
     self.assertEqual(F.columns[-1], 'label')
     self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
    def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
        """Run _vis_debug_rf with the label column named '_predicted'.

        '_predicted' is used as target_attr and is not listed in
        exclude_attrs.  The method makes no assertions; it passes when the
        call completes without raising.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        # 15 labels: seven 0s followed by eight 1s.
        candset['_predicted'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        vectors = extract_feature_vecs(candset,
                                       feature_table=features,
                                       attrs_after='_predicted')

        matcher = RFMatcher()
        split = mu.split_train_test(vectors)
        train_part = split['train']
        test_part = split['test']

        key_columns = ['_id', 'ltable_ID', 'rtable_ID']
        _vis_debug_rf(matcher,
                      train_part,
                      test_part,
                      exclude_attrs=key_columns,
                      target_attr='_predicted',
                      show_window=False)
Beispiel #15
0
 def test_extract_feature_vecs_with_parralel_job_count_more_than_one(self):
     """Extract feature vectors with n_jobs=2 and check the output layout.

     Verifies that the result is a DataFrame, that its leading columns are
     '_id' and the two foreign keys, that 'rtable_name' sits at position 4,
     that 'label' is not the last column, and that the catalog properties
     of the result equal those of the input candidate set.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     # Append an all-zero 'label' column as the last column of C.
     C.insert(len(C.columns), 'label', [0] * len(C))
     feature_table = get_features_for_matching(
         A, B, validate_inferred_attr_types=False)
     F = extract_feature_vecs(C,
                              attrs_before=['ltable_name', 'rtable_name'],
                              feature_table=feature_table,
                              n_jobs=2)
     # Dedicated assert methods report the offending values on failure,
     # unlike assertEqual(<bool expression>, True/False).
     self.assertIsInstance(F, pd.DataFrame)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     self.assertEqual(F.columns[4], 'rtable_name')
     # No attrs_after was requested, so 'label' must not be the last column.
     self.assertNotEqual(F.columns[-1], 'label')
     self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
    def test_vis_debug_matcher_dt_tar_attr_notin_train(self):
        """Call _vis_debug_dt with a target_attr that was never added.

        The label column is created as 'labels' while target_attr is
        'labels1'.  NOTE(review): no assertion is visible in this method;
        the expected failure is presumably checked outside this block
        (e.g. by a decorator) -- confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        # 15 labels: seven 0s followed by eight 1s.
        candset['labels'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        vectors = extract_feature_vecs(candset,
                                       feature_table=features,
                                       attrs_after='labels')

        matcher = DTMatcher()
        split = mu.split_train_test(vectors)
        train_part = split['train']
        test_part = split['test']

        excluded = ['_id', 'ltable_ID', 'rtable_ID', 'labels']
        _vis_debug_dt(matcher,
                      train_part,
                      test_part,
                      exclude_attrs=excluded,
                      target_attr='labels1',
                      show_window=False)
    def test_visualize_tree_invalid_df(self):
        """Call visualize_tree with feature_vectors.columns as second argument.

        NOTE(review): no assertion is visible in this method; given the
        test name, passing the column index is presumably the invalid input
        and the failure is checked outside this block -- confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        # 15 labels: seven 0s followed by eight 1s.
        candset['labels'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        vectors = extract_feature_vecs(candset, feature_table=features,
                                       attrs_after='labels')
        matcher = DTMatcher()
        matcher.fit(table=vectors,
                    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                    target_attr='labels')
        # The fitted classifier is passed together with the column index
        # of the feature vectors.
        visualize_tree(matcher.clf, vectors.columns,
                       exclude_attrs=['_id', 'ltable_ID', 'rtable_ID',
                                      'labels'])
    def test_vis_tuple_debug_dt_matcher_valid_3(self):
        """Run vis_tuple_debug_dt_matcher on dt.clf with exclude_attrs=None.

        The key and label columns are dropped from the feature vectors
        beforehand, so nothing has to be excluded in the call.  The method
        makes no assertions; it passes when the call completes without
        raising.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        # 15 labels: seven 0s followed by eight 1s.
        C['labels'] = [0] * 7 + [1] * 8

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='labels')

        dt = DTMatcher()
        dt.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        feature_vectors.drop(['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                             axis=1, inplace=True)
        # DataFrame.ix was removed in pandas 1.0; .loc is its label-based
        # replacement (assumes feature_vectors carries an integer index
        # containing 0 -- TODO confirm).
        # Turn the row labeled 0 into a one-row DataFrame.
        s1 = pd.DataFrame(feature_vectors.loc[0]).T

        vis_tuple_debug_dt_matcher(dt.clf, s1, exclude_attrs=None)
    def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
        """Call _vis_debug_rf when an excluded attribute is absent from test.

        '_id' is dropped from the test split yet is still listed in
        exclude_attrs.  NOTE(review): no assertion is visible in this
        method; the expected outcome is presumably checked outside it
        (e.g. by a decorator) -- confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        # 15 labels: seven 0s followed by eight 1s.
        candset['labels'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        vectors = extract_feature_vecs(candset,
                                       feature_table=features,
                                       attrs_after='labels')

        matcher = RFMatcher()
        split = mu.split_train_test(vectors)
        train_part = split['train']
        test_part = split['test']
        # Remove '_id' from the test split only; train is left untouched.
        test_part.drop('_id', axis=1, inplace=True)

        excluded = ['_id', 'ltable_ID', 'rtable_ID', 'labels']
        _vis_debug_rf(matcher,
                      train_part,
                      test_part,
                      exclude_attrs=excluded,
                      target_attr='labels',
                      show_window=False)
    def test_vis_tuple_debug_dt_matcher_valid_1(self):
        """Fit a DTMatcher and run vis_tuple_debug_dt_matcher on one row.

        The matcher object itself (not dt.clf) is passed to the call.  The
        method makes no assertions; it passes when the call completes
        without raising.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        # 15 labels: seven 0s followed by eight 1s.
        C['labels'] = [0] * 7 + [1] * 8

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C,
                                               feature_table=feature_table,
                                               attrs_after='labels')

        dt = DTMatcher()
        dt.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        # DataFrame.ix was removed in pandas 1.0; .loc is its label-based
        # replacement (assumes feature_vectors carries an integer index
        # containing 0 -- TODO confirm).
        # Turn the row labeled 0 into a one-row DataFrame.
        s1 = pd.DataFrame(feature_vectors.loc[0]).T
        vis_tuple_debug_dt_matcher(
            dt, s1, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
    def test_debug_dt_matcher_valid(self):
        """Fit a DTMatcher and run debug_decisiontree_matcher on one tuple pair.

        The method makes no assertions; it passes when the call completes
        without raising.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        # 15 labels: seven 0s followed by eight 1s.
        C['labels'] = [0] * 7 + [1] * 8

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C,
                                               feature_table=feature_table,
                                               attrs_after='labels')
        dt = DTMatcher()
        dt.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        # DataFrame.ix was removed in pandas 1.0; .loc is its label-based
        # replacement (assumes A and B carry an integer index -- TODO
        # confirm against read_csv_metadata).
        debug_decisiontree_matcher(
            dt,
            A.loc[1],
            B.loc[2],
            feature_table=feature_table,
            table_columns=feature_vectors.columns,
            exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
    def test_vis_tuple_debug_rf_matcher_valid_3(self):
        """Run vis_tuple_debug_rf_matcher on rf.clf with exclude_attrs=None.

        The key and label columns are dropped from the feature vectors
        beforehand, so nothing has to be excluded in the call.  The method
        makes no assertions; it passes when the call completes without
        raising.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        # 15 labels: seven 0s followed by eight 1s.
        C['labels'] = [0] * 7 + [1] * 8

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C,
                                               feature_table=feature_table,
                                               attrs_after='labels')

        rf = RFMatcher()
        rf.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        feature_vectors.drop(['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                             axis=1,
                             inplace=True)
        # DataFrame.ix was removed in pandas 1.0; .loc is its label-based
        # replacement (assumes feature_vectors carries an integer index
        # containing 0 -- TODO confirm).
        # Turn the row labeled 0 into a one-row DataFrame.
        s1 = pd.DataFrame(feature_vectors.loc[0]).T

        vis_tuple_debug_rf_matcher(rf.clf, s1, exclude_attrs=None)
    def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
        """Run _vis_debug_rf with the label column named '_predicted'.

        '_predicted' is used as target_attr and is not listed in
        exclude_attrs.  The method makes no assertions; it passes when the
        call completes without raising.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        # 15 labels: seven 0s followed by eight 1s.
        candset['_predicted'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        vectors = extract_feature_vecs(candset, feature_table=features,
                                       attrs_after='_predicted')

        matcher = RFMatcher()
        split = mu.split_train_test(vectors)
        train_part = split['train']
        test_part = split['test']

        key_columns = ['_id', 'ltable_ID', 'rtable_ID']
        _vis_debug_rf(matcher, train_part, test_part,
                      exclude_attrs=key_columns,
                      target_attr='_predicted', show_window=False)
    def test_debug_rf_matcher_invalid_feat_table(self):
        """Call debug_randomforest_matcher with feature_table=None.

        NOTE(review): no assertion is visible in this method; given the
        test name, the expected failure is presumably checked outside this
        block (e.g. by a decorator) -- confirm.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        # 15 labels: seven 0s followed by eight 1s.
        C['labels'] = [0] * 7 + [1] * 8

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C,
                                               feature_table=feature_table,
                                               attrs_after='labels')
        rf = RFMatcher()
        rf.fit(table=feature_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')

        # DataFrame.ix was removed in pandas 1.0; .loc is its label-based
        # replacement (assumes A and B carry an integer index -- TODO
        # confirm against read_csv_metadata).
        debug_randomforest_matcher(
            rf,
            A.loc[1],
            B.loc[2],
            feature_table=None,
            table_columns=feature_vectors.columns,
            exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
 def test_extract_feature_vecs_invalid_df(self):
     """Call extract_feature_vecs with None in place of the candidate set.

     NOTE(review): no assertion is visible in this method; the expected
     failure is presumably checked outside this block (e.g. by a
     decorator) -- confirm.
     """
     empty_features = pd.DataFrame()
     extract_feature_vecs(None,
                          attrs_before='ltable_name',
                          feature_table=empty_features,
                          attrs_after=['label', '_id'])
 def test_extract_feature_vecs_invalid_df(self):
     """Call extract_feature_vecs with None in place of the candidate set.

     NOTE(review): no assertion is visible in this method; the expected
     failure is presumably checked outside this block (e.g. by a
     decorator) -- confirm.
     """
     call_kwargs = {
         'attrs_before': 'ltable_name',
         'feature_table': pd.DataFrame(),
         'attrs_after': ['label', '_id'],
     }
     extract_feature_vecs(None, **call_kwargs)