def test_feature_fn_valid_nosim_tok(self):
    """Call ``get_feature_fn`` on an exact-match feature string with two empty dicts.

    No assertion is visible in this block.
    NOTE(review): going by the test name, the two empty dicts stand in for
    the tokenizer and similarity-function lookups -- confirm against
    ``get_feature_fn``.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    features = get_features_for_matching(ltable, rtable)
    # Retained from the original test; the value is never checked.
    n_features = len(features)
    expr = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    fn_info = get_feature_fn(expr, {}, {})
def test_vis_debug_matcher_dt_valid_1(self):
    """Run ``_vis_debug_dt`` on a train/test split of labeled feature vectors.

    The candidate set gets a ``labels`` column (seven 0s followed by eight
    1s), the output of ``extract_feature_vecs`` is split with
    ``mu.train_test_split`` and both parts are handed to ``_vis_debug_dt``
    with ``show_window=False``.  No assertion is visible in this block.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
    # 15 labels in total: seven 0s, then eight 1s.
    candset['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    fvs = extract_feature_vecs(candset, feature_table=features,
                               attrs_after='labels')

    matcher = DTMatcher()
    split = mu.train_test_split(fvs)
    _vis_debug_dt(matcher, split['train'], split['test'],
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                  target_attr='labels', show_window=False)
def test_feature_fn_valid_nosim_tok(self):
    """Build a feature function from a feature string, passing two empty dicts.

    No assertion is visible in this block.
    NOTE(review): the test name suggests the empty dicts replace the
    tokenizer and similarity-function lookups -- confirm against
    ``get_feature_fn``.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    feat_tbl = get_features_for_matching(left, right)
    # Kept as in the original test even though it is not used afterwards.
    initial_len = len(feat_tbl)
    feature_expr = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    result = get_feature_fn(feature_expr, {}, {})
def test_get_features_for_matching_valid(self):
    """Check that the generated feature table is a DataFrame of callable features.

    Every entry of the ``function`` column is called on one tuple from each
    input table and must return a value that is >= 0.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feat_table = afg.get_features_for_matching(A, B)
    self.assertIsInstance(feat_table, pd.DataFrame)

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the label-based
    # replacement.
    # NOTE(review): assumes A and B carry the default integer index, for
    # which ``.ix[n]`` was label-based as well -- confirm.
    ltuple = A.loc[1]
    rtuple = B.loc[2]
    for feature_fn in feat_table['function']:
        self.assertGreaterEqual(feature_fn(ltuple, rtuple), 0)
def test_get_features_for_matching_valid(self):
    """Verify ``afg.get_features_for_matching`` returns a DataFrame of usable functions.

    Each function in the ``function`` column is applied to a fixed pair of
    tuples (one per input table); the result must be >= 0.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feat_table = afg.get_features_for_matching(A, B)
    self.assertIsInstance(feat_table, pd.DataFrame)

    # ``.ix`` no longer exists in pandas >= 1.0, so the rows are fetched with
    # the label-based ``.loc`` indexer instead.
    # NOTE(review): assumes both tables use the default integer index, where
    # ``.ix[n]`` resolved by label too -- confirm.
    left_row = A.loc[1]
    right_row = B.loc[2]
    for fn in feat_table['function']:
        self.assertGreaterEqual(fn(left_row, right_row), 0)
def test_extract_feature_vecs_invalid_feature_table(self):
    """Call ``extract_feature_vecs`` with ``feature_table=None``.

    A ``label`` column of zeros is appended to the candidate set first.
    NOTE(review): no assertion is visible in this block; the expected
    failure is presumably declared by a decorator outside this view --
    confirm.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
    # Inserting at position len(columns) places the new column last.
    candset.insert(len(candset.columns), 'label', [0] * len(candset))
    # Computed as in the original test, but not passed to the call below.
    features = get_features_for_matching(ltable, rtable)
    fvs = extract_feature_vecs(candset,
                               attrs_before='ltable_name',
                               feature_table=None,
                               attrs_after=['label', '_id'])
def test_add_features_valid_1(self):
    """Add a feature built from a feature string and check it was appended.

    The feature table must grow by exactly one row, and the function stored
    in its last row must return 1.0 for the chosen tuple pair.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(A, B)
    len_before = len(feature_table)

    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    f_dict = get_feature_fn(feature_string,
                            get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    add_feature(feature_table, 'test', f_dict)
    self.assertEqual(len_before + 1, len(feature_table))

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the label-based
    # replacement.
    # NOTE(review): assumes the feature table and both input tables use the
    # default integer index, where ``.ix`` resolved by label -- confirm.
    added_fn = feature_table.loc[len(feature_table) - 1, 'function']
    self.assertEqual(added_fn(A.loc[1], B.loc[2]), 1.0)
def test_add_bb_feature_valid_1(self):
    """Add a black-box feature function and check it was appended.

    The feature table must grow by exactly one row, and the function stored
    in its last row must return 1.0 (the constant ``bb_fn`` returns).
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(A, B)

    def bb_fn(ltuple, rtuple):
        # Constant feature: ignores both tuples.
        return 1.0

    len_before = len(feature_table)
    add_blackbox_feature(feature_table, 'test', bb_fn)
    self.assertEqual(len_before + 1, len(feature_table))

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the label-based
    # replacement.
    # NOTE(review): assumes the feature table and both input tables use the
    # default integer index, where ``.ix`` resolved by label -- confirm.
    added_fn = feature_table.loc[len(feature_table) - 1, 'function']
    self.assertEqual(added_fn(A.loc[1], B.loc[2]), 1.0)
def test_add_features_valid_1(self):
    """Check that ``add_feature`` appends one row holding a working function.

    After adding a feature built from an exact-match feature string, the
    table length must increase by one and the last row's function must
    return 1.0 for the chosen tuple pair.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(A, B)
    rows_before = len(feature_table)

    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    f_dict = get_feature_fn(feature_string,
                            get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    add_feature(feature_table, 'test', f_dict)
    self.assertEqual(rows_before + 1, len(feature_table))

    # ``.ix`` no longer exists in pandas >= 1.0; use the label-based ``.loc``.
    # NOTE(review): assumes default integer indexes on the feature table and
    # on A/B, for which ``.ix`` was label-based too -- confirm.
    last_fn = feature_table.loc[len(feature_table) - 1, 'function']
    self.assertEqual(last_fn(A.loc[1], B.loc[2]), 1.0)
def test_add_bb_feature_valid_1(self):
    """Check that ``add_blackbox_feature`` appends one row holding ``bb_fn``'s result.

    The table length must increase by one and calling the last row's
    function on the chosen tuple pair must give 1.0.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(A, B)

    def bb_fn(ltuple, rtuple):
        # Constant feature: the arguments are not inspected.
        return 1.0

    rows_before = len(feature_table)
    add_blackbox_feature(feature_table, 'test', bb_fn)
    self.assertEqual(rows_before + 1, len(feature_table))

    # ``.ix`` no longer exists in pandas >= 1.0; use the label-based ``.loc``.
    # NOTE(review): assumes default integer indexes on the feature table and
    # on A/B, for which ``.ix`` was label-based too -- confirm.
    last_fn = feature_table.loc[len(feature_table) - 1, 'function']
    self.assertEqual(last_fn(A.loc[1], B.loc[2]), 1.0)
def test_visualize_tree_invalid_df(self):
    """Fit a ``DTMatcher`` and pass its ``clf`` to ``visualize_tree``.

    ``visualize_tree`` receives ``clf``, the column index of the feature
    vectors and the list of attributes to exclude.
    NOTE(review): no assertion is visible in this block; the failure implied
    by the test name is presumably declared by a decorator outside this
    view -- confirm.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
    # 15 labels in total: seven 0s, then eight 1s.
    candset['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    fvs = extract_feature_vecs(candset, feature_table=features,
                               attrs_after='labels')

    matcher = DTMatcher()
    matcher.fit(table=fvs,
                exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                target_attr='labels')
    visualize_tree(matcher.clf, fvs.columns,
                   exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
def test_extract_feature_vecs_valid_2(self):
    """Check the column layout of ``extract_feature_vecs`` with ``attrs_before``.

    The result must be a DataFrame whose first columns are ``_id``, the two
    foreign-key columns reported by ``cm`` for the candidate set, and the
    two requested ``attrs_before`` columns, in that order.  ``label`` is not
    requested via ``attrs_after`` and must therefore not be the last column.
    The result's catalog properties must equal those of the candidate set.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    # Inserting at position len(columns) places the new column last.
    C.insert(len(C.columns), 'label', [0] * len(C))
    feature_table = get_features_for_matching(A, B)

    F = extract_feature_vecs(C,
                             attrs_before=['ltable_name', 'rtable_name'],
                             feature_table=feature_table)

    # Specific assert methods report the offending values on failure,
    # unlike ``assertEqual(<bool expression>, True)``.
    self.assertIsInstance(F, pd.DataFrame)
    self.assertEqual(F.columns[0], '_id')
    self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
    self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
    self.assertEqual(F.columns[3], 'ltable_name')
    self.assertEqual(F.columns[4], 'rtable_name')
    self.assertNotEqual(F.columns[-1], 'label')
    # Keep the plain ``==`` so the comparison semantics stay exactly as in
    # the original assertion.
    self.assertTrue(cm.get_all_properties(C) == cm.get_all_properties(F))
def test_debug_dt_matcher_valid(self):
    """Fit a ``DTMatcher`` and run ``debug_decisiontree_matcher`` on one tuple pair.

    The matcher object itself (not its ``clf``) is passed, together with the
    feature table and the columns of the extracted feature vectors.  No
    assertion is visible in this block.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    # 15 labels in total: seven 0s, then eight 1s.
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    dt = DTMatcher()
    dt.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the label-based
    # replacement.
    # NOTE(review): assumes A and B carry the default integer index, for
    # which ``.ix[n]`` was label-based as well -- confirm.
    debug_decisiontree_matcher(
        dt, A.loc[1], B.loc[2],
        feat_table=feature_table,
        fv_columns=feature_vectors.columns,
        exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
def test_debug_rf_matcher_valid_2(self):
    """Fit an ``RFMatcher`` and run ``debug_randomforest_matcher`` on its ``clf``.

    This variant passes ``rf.clf`` rather than the matcher object.  No
    assertion is visible in this block.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    # 15 labels in total: seven 0s, then eight 1s.
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    rf = RFMatcher()
    rf.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the label-based
    # replacement.
    # NOTE(review): assumes A and B carry the default integer index, for
    # which ``.ix[n]`` was label-based as well -- confirm.
    debug_randomforest_matcher(
        rf.clf, A.loc[1], B.loc[2],
        feat_table=feature_table,
        fv_columns=feature_vectors.columns,
        exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
def test_debug_rf_matcher_invalid_feat_table(self):
    """Call ``debug_randomforest_matcher`` with ``feat_table=None``.

    NOTE(review): no assertion is visible in this block; the expected
    failure is presumably declared by a decorator outside this view --
    confirm.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    # 15 labels in total: seven 0s, then eight 1s.
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    rf = RFMatcher()
    rf.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the label-based
    # replacement, so the test reaches the call under test instead of
    # failing on the indexer.
    # NOTE(review): assumes A and B carry the default integer index, for
    # which ``.ix[n]`` was label-based as well -- confirm.
    debug_randomforest_matcher(
        rf, A.loc[1], B.loc[2],
        feat_table=None,
        fv_columns=feature_vectors.columns,
        exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
def test_vis_tuple_debug_rf_matcher_valid_1(self):
    """Fit an ``RFMatcher`` and run ``vis_tuple_debug_rf_matcher`` on one row.

    The row labeled 0 of the feature vectors is turned into a one-row
    DataFrame and passed with the matcher object and the attributes to
    exclude.  No assertion is visible in this block.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    # 15 labels in total: seven 0s, then eight 1s.
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    rf = RFMatcher()
    rf.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the label-based
    # replacement.
    # NOTE(review): assumes the feature vectors use the default integer
    # index, for which ``.ix[0]`` was label-based as well -- confirm.
    # Series -> single-column frame -> transpose gives a one-row frame.
    s1 = pd.DataFrame(feature_vectors.loc[0]).T
    vis_tuple_debug_rf_matcher(
        rf, s1,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
    """Run ``_vis_debug_rf`` with the target column named ``_predicted``.

    The labels (seven 0s followed by eight 1s) are stored under
    ``_predicted`` instead of ``labels``; that name is used as
    ``target_attr`` and is not listed in ``exclude_attrs``.  No assertion is
    visible in this block.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
    candset['_predicted'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    fvs = extract_feature_vecs(candset, feature_table=features,
                               attrs_after='_predicted')

    matcher = RFMatcher()
    split = mu.train_test_split(fvs)
    _vis_debug_rf(matcher, split['train'], split['test'],
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
                  target_attr='_predicted', show_window=False)
def test_vis_tuple_debug_dt_matcher_valid_3(self):
    """Run ``vis_tuple_debug_dt_matcher`` on ``dt.clf`` with ``exclude_attrs=None``.

    The id, foreign-key and label columns are dropped from the feature
    vectors after fitting, so the one-row frame passed in has none of them
    and nothing needs to be excluded.  No assertion is visible in this
    block.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    # 15 labels in total: seven 0s, then eight 1s.
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    dt = DTMatcher()
    dt.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    feature_vectors.drop(['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                         axis=1, inplace=True)

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the label-based
    # replacement.
    # NOTE(review): assumes the feature vectors use the default integer
    # index, for which ``.ix[0]`` was label-based as well -- confirm.
    # Series -> single-column frame -> transpose gives a one-row frame.
    s1 = pd.DataFrame(feature_vectors.loc[0]).T
    vis_tuple_debug_dt_matcher(dt.clf, s1, exclude_attrs=None)
def test_visualize_tree_invalid_df(self):
    """Train a ``DTMatcher`` and hand its ``clf`` to ``visualize_tree``.

    The second positional argument is the column index of the extracted
    feature vectors; the same four attributes are excluded for fitting and
    for visualization.
    NOTE(review): no assertion is visible in this block; the failure implied
    by the test name is presumably declared by a decorator outside this
    view -- confirm.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    # Seven 0s followed by eight 1s.
    pairs['labels'] = [0] * 7 + [1] * 8

    feat_tbl = get_features_for_matching(left, right)
    vecs = extract_feature_vecs(pairs, feature_table=feat_tbl,
                                attrs_after='labels')

    tree_matcher = DTMatcher()
    tree_matcher.fit(
        table=vecs,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels')
    visualize_tree(
        tree_matcher.clf,
        vecs.columns,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
def test_vis_debug_matcher_dt_tar_attr_notin_train(self):
    """Call ``_vis_debug_dt`` with ``target_attr='labels1'``.

    The label column added in this test is named ``labels``; ``labels1`` is
    never added here.
    NOTE(review): no assertion is visible in this block; the expected
    failure is presumably declared by a decorator outside this view --
    confirm.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
    # 15 labels in total: seven 0s, then eight 1s.
    candset['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    fvs = extract_feature_vecs(candset, feature_table=features,
                               attrs_after='labels')

    matcher = DTMatcher()
    split = mu.train_test_split(fvs)
    _vis_debug_dt(matcher, split['train'], split['test'],
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                  target_attr='labels1', show_window=False)
def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
    """Call ``_vis_debug_rf`` after dropping ``_id`` from the test split.

    ``_id`` is still listed in ``exclude_attrs`` although the test part no
    longer has that column.
    NOTE(review): no assertion is visible in this block; the expected
    failure is presumably declared by a decorator outside this view --
    confirm.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
    # 15 labels in total: seven 0s, then eight 1s.
    candset['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    fvs = extract_feature_vecs(candset, feature_table=features,
                               attrs_after='labels')

    matcher = RFMatcher()
    split = mu.train_test_split(fvs)
    train_part = split['train']
    test_part = split['test']
    # Remove the column in place so the two parts disagree on ``_id``.
    test_part.drop('_id', axis=1, inplace=True)
    _vis_debug_rf(matcher, train_part, test_part,
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                  target_attr='labels', show_window=False)
def test_vis_tuple_debug_dt_matcher_valid_3(self):
    """Debug one pre-trimmed feature row with ``vis_tuple_debug_dt_matcher``.

    After fitting, the id, foreign-key and label columns are removed from
    the feature vectors in place; the row labeled 0 is then passed as a
    one-row DataFrame together with ``dt.clf`` and ``exclude_attrs=None``.
    No assertion is visible in this block.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    # Seven 0s followed by eight 1s.
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    dt = DTMatcher()
    dt.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    feature_vectors.drop(['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                         axis=1, inplace=True)

    # ``.ix`` no longer exists in pandas >= 1.0; use the label-based ``.loc``.
    # NOTE(review): assumes the feature vectors keep the default integer
    # index, for which ``.ix[0]`` resolved by label too -- confirm.
    first_row = feature_vectors.loc[0]
    # A Series becomes a single-column frame; transposing yields one row.
    s1 = pd.DataFrame(first_row).T
    vis_tuple_debug_dt_matcher(dt.clf, s1, exclude_attrs=None)
def test_get_features_for_matching_invalid_df1(self):
    """Call ``afg.get_features_for_matching`` with ``None`` as the first table.

    Only the second table is loaded.
    NOTE(review): no assertion is visible in this block; the expected
    failure is presumably declared by a decorator outside this view --
    confirm.
    """
    rtable = read_csv_metadata(path_b, key='ID')
    result = afg.get_features_for_matching(None, rtable)
def test_get_features_for_matching_invalid_df2(self):
    """Call ``afg.get_features_for_matching`` with ``None`` as the second table.

    Only the first table is loaded.
    NOTE(review): no assertion is visible in this block; the expected
    failure is presumably declared by a decorator outside this view --
    confirm.
    """
    ltable = read_csv_metadata(path_a)
    result = afg.get_features_for_matching(ltable, None)
def test_get_features_for_matching_invalid_df1(self):
    """Pass ``None`` in place of the first table to ``afg.get_features_for_matching``.

    The first table is intentionally not read; the second one is loaded
    with ``key='ID'``.
    NOTE(review): no assertion is visible in this block; the expected
    failure is presumably declared by a decorator outside this view --
    confirm.
    """
    right = read_csv_metadata(path_b, key='ID')
    features = afg.get_features_for_matching(None, right)
def test_get_features_for_matching_invalid_df2(self):
    """Pass ``None`` in place of the second table to ``afg.get_features_for_matching``.

    The second table is intentionally not read; only the first is loaded.
    NOTE(review): no assertion is visible in this block; the expected
    failure is presumably declared by a decorator outside this view --
    confirm.
    """
    left = read_csv_metadata(path_a)
    features = afg.get_features_for_matching(left, None)