def test_ml_matcher_return_probs_true_predict_diff_colname(self):
    """Predict with ``return_probs=True`` and custom output column names.

    Fits a ``DTMatcher`` on the training split and predicts on the test
    split with ``inplace=False`` and ``append=True``. The test then checks
    that:

    * the returned object is not ``test`` itself,
    * it has as many rows as ``test`` and contains all of its columns,
    * its last two columns are ``'predicted'`` followed by ``'probas'``,
    * every value in ``'probas'`` lies in ``[0.0, 1.0]``.
    """
    A = read_csv_metadata(fpath_a, key='id')
    B = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
    train_test = mu.split_train_test(feature_vectors)
    train, test = train_test['train'], train_test['test']
    dt = DTMatcher(name='DecisionTree')

    # Drop the id columns from both splits, and the gold label from the
    # test split, before fitting and predicting.
    for frame in (train, test):
        frame.drop('ltable.id', axis=1, inplace=True)
        frame.drop('rtable.id', axis=1, inplace=True)
    test.drop('gold', axis=1, inplace=True)

    dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
    predictions = dt.predict(table=test, exclude_attrs='_id',
                             target_attr='predicted', probs_attr='probas',
                             inplace=False, append=True, return_probs=True)

    # assertIsNot states the identity check directly instead of comparing
    # raw id() integers.
    self.assertIsNot(predictions, test)
    self.assertEqual(len(predictions), len(test))
    self.assertTrue(set(test.columns).issubset(predictions.columns))

    # The two appended columns must come last, in this order.
    self.assertEqual(predictions.columns[-2], 'predicted')
    self.assertEqual(predictions.columns[-1], 'probas')

    probs = predictions['probas']
    self.assertTrue(((probs >= 0.0) & (probs <= 1.0)).all())
def test_ml_matcher_inplace_false_predict(self):
    """Predict with ``inplace=False`` and ``append=True``.

    Fits a ``DTMatcher`` on the training split, predicts on the test split
    and checks that the result is a different object from ``test``, has as
    many rows, contains all of its columns, and ends with the
    ``'predicted'`` column.
    """
    A = read_csv_metadata(fpath_a, key='id')
    B = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
    train_test = mu.split_train_test(feature_vectors)
    train, test = train_test['train'], train_test['test']
    dt = DTMatcher(name='DecisionTree')

    # Drop the id columns from both splits, and the gold label from the
    # test split, before fitting and predicting.
    for frame in (train, test):
        frame.drop('ltable.id', axis=1, inplace=True)
        frame.drop('rtable.id', axis=1, inplace=True)
    test.drop('gold', axis=1, inplace=True)

    dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
    predictions = dt.predict(table=test, exclude_attrs='_id',
                             target_attr='predicted',
                             inplace=False, append=True)

    # assertIsNot states the identity check directly instead of comparing
    # raw id() integers.
    self.assertIsNot(predictions, test)
    self.assertEqual(len(predictions), len(test))
    self.assertTrue(set(test.columns).issubset(predictions.columns))
    self.assertEqual(predictions.columns[-1], 'predicted')
def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
    """Call ``_vis_debug_rf`` with an excluded attribute absent from ``test``.

    ``'_id'`` is dropped from the test split but is still listed in
    ``exclude_attrs``.

    NOTE(review): the body contains no assertion; presumably the expected
    failure is declared by a decorator outside this view -- confirm.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)

    # Seven 0 labels followed by eight 1 labels.
    cand['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    feature_vectors = extract_feature_vecs(cand, feature_table=features,
                                           attrs_after='labels')
    matcher = RFMatcher()
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']

    # Make the test split disagree with exclude_attrs below.
    test.drop('_id', inplace=True, axis=1)

    _vis_debug_rf(
        matcher, train, test,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels',
        show_window=False,
    )
def test_vis_debug_matcher_dt_valid_1(self):
    """Run ``_vis_debug_dt`` on train/test splits with no window shown.

    The test passes as long as the call completes; there is no explicit
    assertion on its result.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)

    # Seven 0 labels followed by eight 1 labels.
    cand['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    feature_vectors = extract_feature_vecs(cand, feature_table=features,
                                           attrs_after='labels')
    matcher = DTMatcher()
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']

    _vis_debug_dt(
        matcher, train, test,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels',
        show_window=False,
    )
def test_ml_matcher_invalid_input_combn_fit(self):
    """Call ``fit`` with both ``x`` and ``table`` supplied.

    NOTE(review): the body contains no assertion; presumably the expected
    failure is declared by a decorator outside this view -- confirm.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable,
                                        rtable=rtable)
    split = mu.split_train_test(feature_vectors)
    # The test split is looked up but not used by this test.
    train, _unused_test = split['train'], split['test']
    matcher = DTMatcher(name='DecisionTree')
    matcher.fit(x=train, table=train)
def test_ml_matcher_invalid_input_combn_fit(self):
    """Call ``fit`` with both ``x`` and ``table`` supplied.

    NOTE(review): the body contains no assertion; presumably the expected
    failure is declared by a decorator outside this view -- confirm.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable,
                                        rtable=rtable)
    split = mu.split_train_test(feature_vectors)
    # The test split is looked up but not used by this test.
    train, _unused_test = split['train'], split['test']
    matcher = DTMatcher(name='DecisionTree')
    matcher.fit(x=train, table=train)
def test_ml_matcher_invalid_df_predict(self):
    """Call ``predict`` with an empty string in place of a table.

    The matcher is first fitted normally on the training split.

    NOTE(review): the body contains no assertion; presumably the expected
    failure is declared by a decorator outside this view -- confirm.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable,
                                        rtable=rtable)
    split = mu.split_train_test(feature_vectors)
    # The test split is looked up but not used by this test.
    train, _unused_test = split['train'], split['test']
    matcher = DTMatcher(name='DecisionTree')

    matcher.fit(table=train,
                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                target_attr='gold')
    # The return value is irrelevant; only the call itself is exercised.
    matcher.predict(table="",
                    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                    target_attr='predicted',
                    append=True)
def test_ml_matcher_target_attr_not_present_fit(self):
    """Call ``fit`` with ``target_attr='gold1'``.

    NOTE(review): judging by the test name, 'gold1' is presumably not a
    column of the training table, and the expected failure is presumably
    declared by a decorator outside this view -- confirm both.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable,
                                        rtable=rtable)
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    # Drop the id columns from both splits (train first, then test).
    for frame in (train, test):
        for id_col in ('ltable.id', 'rtable.id'):
            frame.drop(id_col, axis=1, inplace=True)

    matcher.fit(table=train, exclude_attrs='_id', target_attr='gold1')
def test_ml_matcher_target_attr_not_present_fit(self):
    """Call ``fit`` with ``target_attr='gold1'``.

    NOTE(review): judging by the test name, 'gold1' is presumably not a
    column of the training table, and the expected failure is presumably
    declared by a decorator outside this view -- confirm both.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable,
                                        rtable=rtable)
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    # Drop the id columns from both splits (train first, then test).
    for frame in (train, test):
        for id_col in ('ltable.id', 'rtable.id'):
            frame.drop(id_col, axis=1, inplace=True)

    matcher.fit(table=train, exclude_attrs='_id', target_attr='gold1')
def test_ml_matcher_invalid_df_predict(self):
    """Call ``predict`` with an empty string in place of a table.

    The matcher is first fitted normally on the training split.

    NOTE(review): the body contains no assertion; presumably the expected
    failure is declared by a decorator outside this view -- confirm.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable,
                                        rtable=rtable)
    split = mu.split_train_test(feature_vectors)
    # The test split is looked up but not used by this test.
    train, _unused_test = split['train'], split['test']
    matcher = DTMatcher(name='DecisionTree')

    matcher.fit(table=train,
                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                target_attr='gold')
    # The return value is irrelevant; only the call itself is exercised.
    matcher.predict(table="",
                    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                    target_attr='predicted',
                    append=True)
def test_ml_matcher_valid_1(self):
    """Fit and predict through the ``table`` interface with ``append=True``.

    Checks that the prediction result has as many rows as ``test``, that
    all of its columns are also columns of ``test`` after the call, and
    that its last column is ``'predicted'``.
    """
    A = read_csv_metadata(fpath_a, key='id')
    B = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
    train_test = mu.split_train_test(feature_vectors)
    train, test = train_test['train'], train_test['test']
    dt = DTMatcher(name='DecisionTree')

    dt.fit(table=train,
           exclude_attrs=['ltable.id', 'rtable.id', '_id'],
           target_attr='gold')
    predictions = dt.predict(
        table=test,
        exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
        target_attr='predicted',
        append=True)

    self.assertEqual(len(predictions), len(test))
    # NOTE(review): this can only hold if ``test`` also received the
    # 'predicted' column, i.e. predict() presumably defaults to operating
    # in place -- confirm against the matcher API.
    self.assertTrue(set(predictions.columns).issubset(test.columns))
    self.assertEqual(predictions.columns[-1], 'predicted')
def test_ml_matcher_valid_2(self):
    """Fit and predict through the ``x``/``y`` interface.

    The feature columns are all columns of ``feature_vectors`` except the
    key, the two foreign keys reported by ``cm`` and ``'gold'``. The test
    checks that one prediction is produced per test row.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable,
                                        rtable=rtable)
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    all_cols = list(feature_vectors.columns)
    non_feature_cols = [
        cm.get_key(feature_vectors),
        cm.get_fk_ltable(feature_vectors),
        cm.get_fk_rtable(feature_vectors),
        'gold',
    ]
    feature_cols = list_diff(all_cols, non_feature_cols)

    matcher.fit(x=train[feature_cols], y=train['gold'])
    predictions = matcher.predict(test[feature_cols])
    self.assertEqual(len(predictions), len(test))
def test_ml_matcher_append_false_predict(self):
    """Predict with ``append=False`` and check the number of predictions.

    The id columns are dropped from both splits and ``'gold'`` from the
    test split before fitting; the result must have one entry per test row.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable,
                                        rtable=rtable)
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    # Drop the id columns from both splits (train first, then test).
    for frame in (train, test):
        for id_col in ('ltable.id', 'rtable.id'):
            frame.drop(id_col, axis=1, inplace=True)
    test.drop('gold', axis=1, inplace=True)

    matcher.fit(table=train, exclude_attrs='_id', target_attr='gold')
    predictions = matcher.predict(table=test, exclude_attrs='_id',
                                  target_attr='predicted', append=False)
    self.assertEqual(len(predictions), len(test))
def test_ml_matcher_valid_with_id_in_y(self):
    """Fit through ``x``/``y`` where ``y`` carries ``'_id'`` next to ``'gold'``.

    The feature columns are all columns of ``feature_vectors`` except the
    two foreign keys reported by ``cm`` and ``'gold'``. The test checks
    that one prediction is produced per test row.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable,
                                        rtable=rtable)
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    all_cols = list(feature_vectors.columns)
    non_feature_cols = [
        cm.get_fk_ltable(feature_vectors),
        cm.get_fk_rtable(feature_vectors),
        'gold',
    ]
    feature_cols = list_diff(all_cols, non_feature_cols)

    matcher.fit(x=train[feature_cols], y=train[['_id', 'gold']])
    predictions = matcher.predict(test[feature_cols])
    self.assertEqual(len(predictions), len(test))
def test_train_test_split_valid_1(self):
    """Check that ``split_train_test`` keeps all rows and the properties.

    The two splits together must have as many rows as the input table, and
    ``cm.get_all_properties`` must report the same value for each split as
    for the input table.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    result = mu.split_train_test(C)
    train = result['train']
    test = result['test']
    self.assertEqual(len(train) + len(test), len(C))

    p1 = cm.get_all_properties(C)
    p2 = cm.get_all_properties(train)
    p3 = cm.get_all_properties(test)
    # Compare the values directly so that a failure reports what differs,
    # rather than just "False != True".
    self.assertEqual(p1, p2)
    self.assertEqual(p1, p3)
def test_train_test_split_valid_1(self):
    """Check that ``split_train_test`` keeps all rows and the properties.

    The two splits together must have as many rows as the input table, and
    ``cm.get_all_properties`` must report the same value for each split as
    for the input table.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    result = mu.split_train_test(C)
    train = result['train']
    test = result['test']
    self.assertEqual(len(train) + len(test), len(C))

    p1 = cm.get_all_properties(C)
    p2 = cm.get_all_properties(train)
    p3 = cm.get_all_properties(test)
    # Compare the values directly so that a failure reports what differs,
    # rather than just "False != True".
    self.assertEqual(p1, p2)
    self.assertEqual(p1, p3)
def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
    """Run ``_vis_debug_rf`` with the label column named ``'_predicted'``.

    ``'_predicted'`` is used as ``target_attr`` and is not listed in
    ``exclude_attrs``. The test passes as long as the call completes;
    there is no explicit assertion on its result.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)

    # Seven 0 labels followed by eight 1 labels.
    cand['_predicted'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    feature_vectors = extract_feature_vecs(cand, feature_table=features,
                                           attrs_after='_predicted')
    matcher = RFMatcher()
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']

    _vis_debug_rf(
        matcher, train, test,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        target_attr='_predicted',
        show_window=False,
    )
def test_vis_debug_matcher_dt_tar_attr_notin_train(self):
    """Call ``_vis_debug_dt`` with ``target_attr='labels1'``.

    The label column added to the data is ``'labels'``, not ``'labels1'``.

    NOTE(review): the body contains no assertion; presumably the expected
    failure is declared by a decorator outside this view -- confirm.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)

    # Seven 0 labels followed by eight 1 labels.
    cand['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    feature_vectors = extract_feature_vecs(cand, feature_table=features,
                                           attrs_after='labels')
    matcher = DTMatcher()
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']

    _vis_debug_dt(
        matcher, train, test,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels1',
        show_window=False,
    )
def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
    """Call ``_vis_debug_rf`` with an excluded attribute absent from ``test``.

    ``'_id'`` is dropped from the test split but is still listed in
    ``exclude_attrs``.

    NOTE(review): the body contains no assertion; presumably the expected
    failure is declared by a decorator outside this view -- confirm.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)

    # Seven 0 labels followed by eight 1 labels.
    cand['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    feature_vectors = extract_feature_vecs(cand, feature_table=features,
                                           attrs_after='labels')
    matcher = RFMatcher()
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']

    # Make the test split disagree with exclude_attrs below.
    test.drop('_id', inplace=True, axis=1)

    _vis_debug_rf(
        matcher, train, test,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels',
        show_window=False,
    )
def test_ml_matcher_target_attr_present_in_ex_attrs(self):
    """Fit with the target attribute also listed in ``exclude_attrs``.

    ``'gold'`` is passed both as ``target_attr`` and inside
    ``exclude_attrs`` when fitting. The test checks that the prediction
    result has as many rows as ``test``, that it has no column missing
    from ``test`` after the call, and that its last column is
    ``'predicted'``.
    """
    A = read_csv_metadata(fpath_a, key='id')
    B = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
    train_test = mu.split_train_test(feature_vectors)
    train, test = train_test['train'], train_test['test']
    dt = DTMatcher(name='DecisionTree')

    dt.fit(table=train,
           exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
           target_attr='gold')
    predictions = dt.predict(
        table=test,
        exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
        target_attr='predicted',
        append=True)

    self.assertEqual(len(predictions), len(test))
    # Comparing against the empty set, rather than a count against 0,
    # names the unexpected columns in the failure message.
    extra_cols = set(predictions.columns).difference(test.columns)
    self.assertEqual(extra_cols, set())
    self.assertEqual(predictions.columns[-1], 'predicted')
def test_ml_matcher_ex_attrs_not_list(self):
    """Fit with ``exclude_attrs`` given as a plain string, not a list.

    ``fit`` receives ``exclude_attrs='_id'`` while ``predict`` receives a
    list. The test checks that the prediction result has as many rows as
    ``test``, that it has no column missing from ``test`` after the call,
    and that its last column is ``'predicted'``.
    """
    A = read_csv_metadata(fpath_a, key='id')
    B = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
    train_test = mu.split_train_test(feature_vectors)
    train, test = train_test['train'], train_test['test']
    dt = DTMatcher(name='DecisionTree')

    # Drop the id columns from both splits (train first, then test).
    for frame in (train, test):
        frame.drop('ltable.id', axis=1, inplace=True)
        frame.drop('rtable.id', axis=1, inplace=True)

    dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
    predictions = dt.predict(table=test, exclude_attrs=['_id', 'gold'],
                             target_attr='predicted', append=True)

    self.assertEqual(len(predictions), len(test))
    # Comparing against the empty set, rather than a count against 0,
    # names the unexpected columns in the failure message.
    extra_cols = set(predictions.columns).difference(test.columns)
    self.assertEqual(extra_cols, set())
    self.assertEqual(predictions.columns[-1], 'predicted')
def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
    """Run ``_vis_debug_rf`` with the label column named ``'_predicted'``.

    ``'_predicted'`` is used as ``target_attr`` and is not listed in
    ``exclude_attrs``. The test passes as long as the call completes;
    there is no explicit assertion on its result.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)

    # Seven 0 labels followed by eight 1 labels.
    cand['_predicted'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(ltable, rtable)
    feature_vectors = extract_feature_vecs(cand, feature_table=features,
                                           attrs_after='_predicted')
    matcher = RFMatcher()
    split = mu.split_train_test(feature_vectors)
    train = split['train']
    test = split['test']

    _vis_debug_rf(
        matcher, train, test,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        target_attr='_predicted',
        show_window=False,
    )
def test_train_test_split_invalid_df(self):
    """Call ``split_train_test`` with ``None`` instead of a table.

    NOTE(review): the body contains no assertion; presumably the expected
    failure is declared by a decorator outside this view -- confirm.
    """
    not_a_table = None
    mu.split_train_test(not_a_table)
def test_train_test_split_invalid_df(self):
    """Call ``split_train_test`` with ``None`` instead of a table.

    NOTE(review): the body contains no assertion; presumably the expected
    failure is declared by a decorator outside this view -- confirm.
    """
    not_a_table = None
    mu.split_train_test(not_a_table)