def test_eval_matches_valid_2(self):
    """eval_matches on a labeling where exactly one pair disagrees.

    The first candidate pair is gold-labeled 0 while every pair is
    predicted 1, yielding one false negative (note the gold/predicted
    argument order passed to eval_matches below).
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=left, rtable=right)
    frame = cand[['_id', 'ltable_ID', 'rtable_ID']]
    # First row labeled 0, the remaining rows 1; all rows predicted 1.
    n_head = 1
    n_tail = len(frame) - n_head
    gold = [0] * n_head + [1] * n_tail
    predicted = [1] * len(frame)
    # Appending at len(columns) twice keeps the original column order:
    # ..., 'gold', 'predicted'.
    frame.insert(len(frame.columns), 'gold', gold)
    frame.insert(len(frame.columns), 'predicted', predicted)
    cm.copy_properties(cand, frame)
    result = eval_matches(frame, 'predicted', 'gold')
    self.assertEqual(isinstance(result, dict), True)
    self.assertAlmostEqual(result['precision'], 1)
    exact = {
        'prec_numerator': 14,
        'prec_denominator': 14,
        'recall_numerator': 14,
        'recall_denominator': 15,
        'recall': 0.9333333333333333,
        'f1': 0.9655172413793104,
        'pred_pos_num': 14,
        'false_pos_num': 0.0,
        'pred_neg_num': 1,
        'false_neg_num': 1.0,
    }
    for key, value in exact.items():
        self.assertEqual(result[key], value)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(len(result['false_neg_ls']), 1)
    false_neg_pair = result['false_neg_ls'][0]
    self.assertEqual(false_neg_pair[0], 'a1')
    self.assertEqual(false_neg_pair[1], 'b1')
def test_eval_matches_valid_3(self):
    """eval_matches on an empty candidate table: every metric is zero.

    Labels are built for the full candidate set, but evaluation runs on
    an empty DataFrame that only shares the schema (and metadata copied
    from the candidate set), so all counts and ratios must come back 0.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=left, rtable=right)
    frame = cand[['_id', 'ltable_ID', 'rtable_ID']]
    # Every row gold-labeled 0, every row predicted 1.
    gold = [0] * len(frame)
    predicted = [1] * len(frame)
    frame.insert(len(frame.columns), 'gold', gold)
    frame.insert(len(frame.columns), 'predicted', predicted)
    # Empty table with the labeled schema; metadata comes from cand.
    empty = pd.DataFrame(columns=frame.columns)
    cm.copy_properties(cand, empty)
    result = eval_matches(empty, 'gold', 'predicted')
    self.assertEqual(isinstance(result, dict), True)
    self.assertAlmostEqual(result['precision'], 0)
    for key in ('prec_numerator', 'prec_denominator', 'recall_numerator',
                'recall_denominator', 'recall', 'f1', 'pred_pos_num',
                'pred_neg_num'):
        self.assertEqual(result[key], 0)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(result['false_neg_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(len(result['false_neg_ls']), 0)
def test_eval_matches_valid_3_duplicate(self):
    """Renamed duplicate of test_eval_matches_valid_3.

    This method was defined twice under the same name; the later
    definition silently replaced the earlier one (flake8 F811), so only
    one copy was ever collected by the test runner. Renaming this copy
    makes both definitions run. Body and assertions are unchanged:
    evaluating an empty DataFrame must return all-zero metrics.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    # All rows gold-labeled 0, all predicted 1 (num_zeros is 0 here).
    num_ones = len(C1)
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    predicted = [1] * (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln + 1, 'predicted', predicted)
    # Evaluation runs on an empty table that shares the labeled schema.
    D = pd.DataFrame(columns=C1.columns)
    cm.copy_properties(C, D)
    result = eval_matches(D, 'gold', 'predicted')
    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 0)
    self.assertEqual(result['prec_denominator'], 0)
    self.assertAlmostEqual(result['precision'], 0)
    self.assertEqual(result['recall_numerator'], 0)
    self.assertEqual(result['recall_denominator'], 0)
    self.assertEqual(result['recall'], 0)
    self.assertEqual(result['f1'], 0)
    self.assertEqual(result['pred_pos_num'], 0)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 0)
    self.assertEqual(result['false_neg_num'], 0.0)
    self.assertEqual(len(result['false_neg_ls']), 0)
def test_eval_matches_predicted_attr_not_in_df(self):
    """Call eval_matches with a predicted attribute absent from the table.

    'predicted1' is not a column of the labeled table, so eval_matches
    is expected to reject it.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=left, rtable=right)
    frame = cand[['_id', 'ltable_ID', 'rtable_ID']]
    # First row labeled 0, the rest 1; every row predicted 1.
    n_head = 1
    n_tail = len(frame) - n_head
    gold = [0] * n_head + [1] * n_tail
    predicted = [1] * len(frame)
    frame.insert(len(frame.columns), 'gold', gold)
    frame.insert(len(frame.columns), 'predicted', predicted)
    cm.copy_properties(cand, frame)
    result = eval_matches(frame, 'gold', 'predicted1')
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Decision Tree matcher visually.

    Fits `matcher` on `train`, predicts on `test`, computes the accuracy
    metrics together with the false-positive / false-negative pairs, and
    shows them in the debugger GUI window.

    Raises:
        ImportError: If PyQt5 is not installed.
        AssertionError: If the matcher is not a DTMatcher, or if the
            exclude/target attributes are missing from train/test.
    """
    # GUI support is optional; import lazily and fail with a clear message.
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type DTMatcher
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type '
                             'Decision Tree matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    # DataFrame.
    if not ch.check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes are expected to be of type list; if not,
    # explicitly convert them into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = eval_matches(predicted, target_attr, predict_attr_name)

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Reuse the process-wide QApplication if one already exists; create it
    # otherwise. (The original code assigned `app` twice; once is enough.)
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])
    app = em._viewapp

    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)
    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
def test_eval_matches_invalid_predicted_attr(self):
    """A predicted-attribute of None must be rejected by eval_matches."""
    empty_frame = pd.DataFrame()
    eval_matches(empty_frame, "", None)
def test_eval_matches_invalid_gold_attr(self):
    """A gold-label attribute of None must be rejected by eval_matches."""
    empty_frame = pd.DataFrame()
    eval_matches(empty_frame, None, "")
def test_eval_matches_invalid_df(self):
    """A None data frame must be rejected by eval_matches."""
    not_a_frame = None
    eval_matches(not_a_frame, "", "")
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Decision Tree matcher visually.

    Fits `matcher` on `train`, predicts on `test`, computes the accuracy
    metrics together with the false-positive / false-negative pairs, and
    shows them in the debugger GUI window.

    Raises:
        ImportError: If PyQt5 is not installed.
        AssertionError: If the matcher is not a DTMatcher, or if the
            exclude/target attributes are missing from train/test.
    """
    # GUI support is optional; import lazily and fail with a clear message.
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type DTMatcher
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type '
                             'Decision Tree matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    # DataFrame.
    if not ch.check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes are expected to be of type list; if not,
    # explicitly convert them into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = eval_matches(predicted, target_attr, predict_attr_name)

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Reuse the process-wide QApplication if one already exists; create it
    # otherwise. (The original code assigned `app` twice; once is enough.)
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])
    app = em._viewapp

    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)
    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()