Example no. 1
0
    def test_eval_matches_valid_2(self):
        """Evaluate a labeling with exactly one false negative.

        Gold marks one pair as a non-match while every pair is predicted
        as a match, so precision is perfect and recall misses one pair.
        """
        table_a = read_csv_metadata(path_a)
        table_b = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=table_a, rtable=table_b)
        labeled = C[['_id', 'ltable_ID', 'rtable_ID']]
        positives = 1
        negatives = len(labeled) - positives
        gold = [0] * positives + [1] * negatives
        predicted = [1] * (negatives + positives)

        width = len(labeled.columns)
        labeled.insert(width, 'gold', gold)
        labeled.insert(width + 1, 'predicted', predicted)
        cm.copy_properties(C, labeled)

        result = eval_matches(labeled, 'predicted', 'gold')
        self.assertEqual(isinstance(result, dict), True)
        self.assertEqual(result['prec_numerator'], 14)
        self.assertEqual(result['prec_denominator'], 14)
        self.assertAlmostEqual(result['precision'], 1)
        self.assertEqual(result['recall_numerator'], 14)
        self.assertEqual(result['recall_denominator'], 15)
        self.assertEqual(result['recall'], 0.9333333333333333)
        self.assertEqual(result['f1'], 0.9655172413793104)
        self.assertEqual(result['pred_pos_num'], 14)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 1)
        self.assertEqual(result['false_neg_num'], 1.0)
        self.assertEqual(len(result['false_neg_ls']), 1)
        pair = result['false_neg_ls'][0]
        self.assertEqual(pair[0], 'a1')
        self.assertEqual(pair[1], 'b1')
Example no. 2
0
    def test_eval_matches_valid_3(self):
        """Evaluate over an empty DataFrame: every metric must be zero."""
        left = read_csv_metadata(path_a)
        right = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=left, rtable=right)
        projected = C[['_id', 'ltable_ID', 'rtable_ID']]
        total = len(projected)
        gold = [0] * total
        predicted = [1] * total

        width = len(projected.columns)
        projected.insert(width, 'gold', gold)
        projected.insert(width + 1, 'predicted', predicted)
        # Build an empty frame with the same schema; the evaluation is
        # run against this, not against the populated frame.
        empty = pd.DataFrame(columns=projected.columns)
        cm.copy_properties(C, empty)
        result = eval_matches(empty, 'gold', 'predicted')

        self.assertEqual(isinstance(result, dict), True)
        self.assertEqual(result['prec_numerator'], 0)
        self.assertEqual(result['prec_denominator'], 0)
        self.assertAlmostEqual(result['precision'], 0)
        self.assertEqual(result['recall_numerator'], 0)
        self.assertEqual(result['recall_denominator'], 0)
        self.assertEqual(result['recall'], 0)
        self.assertEqual(result['f1'], 0)
        self.assertEqual(result['pred_pos_num'], 0)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 0)
        self.assertEqual(result['false_neg_num'], 0.0)
        self.assertEqual(len(result['false_neg_ls']), 0)
    def test_eval_matches_valid_2(self):
        """One gold non-match among all-positive predictions.

        Precision stays perfect (14/14) while recall drops to 14/15
        because the single gold-0 pair becomes a false negative.
        """
        ltab = read_csv_metadata(path_a)
        rtab = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=ltab, rtable=rtab)
        frame = C[['_id', 'ltable_ID', 'rtable_ID']]
        n_zero_labels = 1
        n_one_labels = len(frame) - n_zero_labels
        gold = [0] * n_zero_labels + [1] * n_one_labels
        predicted = [1] * len(frame)

        at = len(frame.columns)
        frame.insert(at, 'gold', gold)
        frame.insert(at + 1, 'predicted', predicted)
        cm.copy_properties(C, frame)

        result = eval_matches(frame, 'predicted', 'gold')
        self.assertEqual(isinstance(result, dict), True)
        self.assertEqual(result['prec_numerator'], 14)
        self.assertEqual(result['prec_denominator'], 14)
        self.assertAlmostEqual(result['precision'], 1)
        self.assertEqual(result['recall_numerator'], 14)
        self.assertEqual(result['recall_denominator'], 15)
        self.assertEqual(result['recall'], 0.9333333333333333)
        self.assertEqual(result['f1'], 0.9655172413793104)
        self.assertEqual(result['pred_pos_num'], 14)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 1)
        self.assertEqual(result['false_neg_num'], 1.0)
        self.assertEqual(len(result['false_neg_ls']), 1)
        missed = result['false_neg_ls'][0]
        self.assertEqual(missed[0], 'a1')
        self.assertEqual(missed[1], 'b1')
    def test_eval_matches_valid_3(self):
        """All metrics are zero when the evaluated frame has no rows."""
        src_a = read_csv_metadata(path_a)
        src_b = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=src_a, rtable=src_b)
        cols = C[['_id', 'ltable_ID', 'rtable_ID']]
        count = len(cols)
        gold = [0] * count
        predicted = [1] * count

        pos = len(cols.columns)
        cols.insert(pos, 'gold', gold)
        cols.insert(pos + 1, 'predicted', predicted)
        # Evaluate an empty frame that shares the labeled frame's schema.
        blank = pd.DataFrame(columns=cols.columns)
        cm.copy_properties(C, blank)
        result = eval_matches(blank, 'gold', 'predicted')

        self.assertEqual(isinstance(result, dict), True)
        self.assertEqual(result['prec_numerator'], 0)
        self.assertEqual(result['prec_denominator'], 0)
        self.assertAlmostEqual(result['precision'], 0)
        self.assertEqual(result['recall_numerator'], 0)
        self.assertEqual(result['recall_denominator'], 0)
        self.assertEqual(result['recall'], 0)
        self.assertEqual(result['f1'], 0)
        self.assertEqual(result['pred_pos_num'], 0)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 0)
        self.assertEqual(result['false_neg_num'], 0.0)
        self.assertEqual(len(result['false_neg_ls']), 0)
Example no. 5
0
    def test_eval_matches_predicted_attr_not_in_df(self):
        """eval_matches must reject a predicted attribute ('predicted1')
        that is not a column of the evaluated frame."""
        table_a = read_csv_metadata(path_a)
        table_b = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=table_a, rtable=table_b)
        subset = C[['_id', 'ltable_ID', 'rtable_ID']]
        zeros = 1
        ones = len(subset) - zeros
        gold = [0] * zeros + [1] * ones
        predicted = [1] * (ones + zeros)

        width = len(subset.columns)
        subset.insert(width, 'gold', gold)
        subset.insert(width + 1, 'predicted', predicted)
        cm.copy_properties(C, subset)

        result = eval_matches(subset, 'gold', 'predicted1')
    def test_eval_matches_predicted_attr_not_in_df(self):
        """Asking for a missing predicted column ('predicted1') is an error."""
        lhs = read_csv_metadata(path_a)
        rhs = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=lhs, rtable=rhs)
        view = C[['_id', 'ltable_ID', 'rtable_ID']]
        neg = 1
        pos = len(view) - neg
        gold = [0] * neg + [1] * pos
        predicted = [1] * len(view)

        idx = len(view.columns)
        view.insert(idx, 'gold', gold)
        view.insert(idx + 1, 'predicted', predicted)
        cm.copy_properties(C, view)

        result = eval_matches(view, 'gold', 'predicted1')
def _vis_debug_dt(matcher,
                  train,
                  test,
                  exclude_attrs,
                  target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Decision Tree matcher visually.

    Fits `matcher` on `train`, predicts on `test`, computes the match
    evaluation summary, and displays the metrics together with the
    false-positive and false-negative rows in a PyQt5 window.

    Args:
        matcher (DTMatcher): The Decision Tree matcher to debug.
        train (DataFrame): Labeled table used to fit the matcher.
        test (DataFrame): Table on which predictions are evaluated.
        exclude_attrs (str or list): Attribute(s) to exclude from fitting
            and prediction; must be present in both `train` and `test`.
        target_attr (str): Name of the label column in `train`.
        show_window (bool): If True, show the debug GUI window and start
            the Qt event loop.

    Raises:
        ImportError: If PyQt5 is not installed.
        AssertionError: If `matcher` is not a DTMatcher, or the exclude
            or target attributes are missing from the input tables.
    """

    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError(
            'PyQt5 is not installed. Please install PyQt5 to use '
            'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type DTMatcher
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type '
                             'Decision Tree matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr,
                         six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    #  DataFrame.
    if not ch.check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    #  DataFrame.
    if not ch.check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    #  DataFrame.
    if not ch.check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes is expected to be of type list, if not
    # explicitly convert this into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train,
                exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test,
                                exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name,
                                append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = eval_matches(predicted, target_attr, predict_attr_name)

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Reuse the process-wide QApplication if one exists; create it otherwise.
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])

    # Get the main window application
    app = em._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)
    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
Example no. 8
0
 def test_eval_matches_invalid_predicted_attr(self):
     """A None predicted attribute must be rejected by eval_matches."""
     empty_frame = pd.DataFrame()
     eval_matches(empty_frame, "", None)
Example no. 9
0
 def test_eval_matches_invalid_gold_attr(self):
     """A None gold attribute must be rejected by eval_matches."""
     empty_frame = pd.DataFrame()
     eval_matches(empty_frame, None, "")
Example no. 10
0
 def test_eval_matches_invalid_df(self):
     """A None DataFrame must be rejected by eval_matches."""
     eval_matches(None, "", "")
 def test_eval_matches_invalid_predicted_attr(self):
     """Passing None as the predicted attribute is invalid input."""
     frame = pd.DataFrame()
     eval_matches(frame, "", None)
 def test_eval_matches_invalid_gold_attr(self):
     """Passing None as the gold attribute is invalid input."""
     frame = pd.DataFrame()
     eval_matches(frame, None, "")
 def test_eval_matches_invalid_df(self):
     """Passing None instead of a DataFrame is invalid input."""
     eval_matches(None, "", "")
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Decision Tree matcher visually.

    Fits `matcher` on `train`, predicts on `test`, computes the match
    evaluation summary, and displays the metrics together with the
    false-positive and false-negative rows in a PyQt5 window.

    Args:
        matcher (DTMatcher): The Decision Tree matcher to debug.
        train (DataFrame): Labeled table used to fit the matcher.
        test (DataFrame): Table on which predictions are evaluated.
        exclude_attrs (str or list): Attribute(s) to exclude from fitting
            and prediction; must be present in both `train` and `test`.
        target_attr (str): Name of the label column in `train`.
        show_window (bool): If True, show the debug GUI window and start
            the Qt event loop.

    Raises:
        ImportError: If PyQt5 is not installed.
        AssertionError: If `matcher` is not a DTMatcher, or the exclude
            or target attributes are missing from the input tables.
    """

    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type DTMatcher
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type '
                             'Decision Tree matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types, error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    #  DataFrame.
    if not ch.check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    #  DataFrame.
    if not ch.check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    #  DataFrame.
    if not ch.check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes is expected to be of type list, if not
    # explicitly convert this into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = eval_matches(predicted, target_attr, predict_attr_name)

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Reuse the process-wide QApplication if one exists; create it otherwise.
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])

    # Get the main window application
    app = em._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe,
                          fn_dataframe)
    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()