Example 1
    def test_eval_matches_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        num_ones = 1
        num_zeros = len(C1) - num_ones
        gold = [0] * num_ones
        gold.extend([1] * num_zeros)
        predicted = [1] * (num_zeros + num_ones)

        ln = len(C1.columns)
        C1.insert(ln, 'gold', gold)
        C1.insert(ln + 1, 'predicted', predicted)
        cm.copy_properties(C, C1)

        result = eval_matches(C1, 'predicted', 'gold')
        self.assertEqual(isinstance(result, dict), True)
        self.assertEqual(result['prec_numerator'], 14)
        self.assertEqual(result['prec_denominator'], 14)
        self.assertAlmostEqual(result['precision'], 1)
        self.assertEqual(result['recall_numerator'], 14)
        self.assertEqual(result['recall_denominator'], 15)
        self.assertEqual(result['recall'], 0.9333333333333333)
        self.assertEqual(result['f1'], 0.9655172413793104)
        self.assertEqual(result['pred_pos_num'], 14)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 1)
        self.assertEqual(result['false_neg_num'], 1.0)
        self.assertEqual(len(result['false_neg_ls']), 1)
        t = result['false_neg_ls'][0]
        self.assertEqual(t[0], 'a1')
        self.assertEqual(t[1], 'b1')
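
Taken together, the assertions above document the dictionary returned by eval_matches: numerator/denominator pairs and scores for precision, recall, and f1, prediction counts, and lists of false-positive and false-negative key pairs. Below is a minimal sketch of a consumer of that dictionary; the helper name summarize is illustrative, not part of Magellan, while the key names come straight from the test.

def summarize(result):
    # Each metric comes with its numerator and denominator, as asserted above.
    print('precision = %s/%s = %.4f' % (result['prec_numerator'],
                                        result['prec_denominator'],
                                        result['precision']))
    print('recall    = %s/%s = %.4f' % (result['recall_numerator'],
                                        result['recall_denominator'],
                                        result['recall']))
    print('f1        = %.4f' % result['f1'])
    # False negatives are (ltable key, rtable key) pairs, e.g. ('a1', 'b1').
    for lkey, rkey in result['false_neg_ls']:
        print('missed match: %s <-> %s' % (lkey, rkey))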
Example 2
    def test_eval_matches_valid_3(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        num_ones = len(C1)
        num_zeros = len(C1) - num_ones
        gold = [0] * num_ones
        # gold.extend([1]*num_zeros)
        predicted = [1] * (num_zeros + num_ones)

        ln = len(C1.columns)
        C1.insert(ln, 'gold', gold)
        C1.insert(ln + 1, 'predicted', predicted)
        D = pd.DataFrame(columns=C1.columns)
        cm.copy_properties(C, D)
        result = eval_matches(D, 'gold', 'predicted')

        self.assertEqual(isinstance(result, dict), True)
        self.assertEqual(result['prec_numerator'], 0)
        self.assertEqual(result['prec_denominator'], 0)
        self.assertAlmostEqual(result['precision'], 0)
        self.assertEqual(result['recall_numerator'], 0)
        self.assertEqual(result['recall_denominator'], 0)
        self.assertEqual(result['recall'], 0)
        self.assertEqual(result['f1'], 0)
        self.assertEqual(result['pred_pos_num'], 0)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 0)
        self.assertEqual(result['false_neg_num'], 0.0)
        self.assertEqual(len(result['false_neg_ls']), 0)
Example 3
    def test_eval_matches_predicted_attr_not_in_df(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        num_ones = 1
        num_zeros = len(C1) - num_ones
        gold = [0] * num_ones
        gold.extend([1] * num_zeros)
        predicted = [1] * (num_zeros + num_ones)

        ln = len(C1.columns)
        C1.insert(ln, 'gold', gold)
        C1.insert(ln + 1, 'predicted', predicted)
        cm.copy_properties(C, C1)

        result = eval_matches(C1, 'gold', 'predicted1')
Example 4
def cv_matcher_and_trigger(matcher,
                           triggers,
                           table,
                           exclude_attrs,
                           target_attr,
                           k=5,
                           metric=None,
                           random_state=None):
    """
    Cross validate matcher and trigger.

    Parameters
    ----------
    matcher : object, An ML-object in Magellan
    triggers : List of MatchTrigger objects
    table : MTable, on which match + trigger should be done
    exclude_attrs : List of string, attribute names that should be excluded from training and evaluation
    target_attr : String, attribute name containing labels in the 'table'
    k : integer, specifies the number of folds for cross-validation. The default value is 5.
    metric : List of strings. Currently, the following values are allowed: 'precision', 'recall', 'f1',
        The list should form a subset of ['precision', 'recall', 'f1']. The default value is set to None.
        If None, then all the three metrics are computed for each fold and returned back to the user.
    random_state: int,Pseudo-random number generator state used for random sampling.
        If None, use default numpy RNG for shuffling
    :return:
    """

    metric = validate_and_get_metric_as_list(metric)

    folds = KFold(len(table), k, shuffle=True, random_state=random_state)
    table = table.copy()
    # Allow a single trigger to be passed without wrapping it in a list.
    if not isinstance(triggers, list):
        triggers = [triggers]
    eval_ls = []
    ltable = table.get_property('ltable')
    rtable = table.get_property('rtable')
    foreign_key_ltable = table.get_property('foreign_key_ltable')
    foreign_key_rtable = table.get_property('foreign_key_rtable')
    if mg._progbar:
        bar = pyprind.ProgBar(k)
    for train_ind, test_ind in folds:
        train = mg.create_mtable(table.iloc[train_ind],
                                 key=table.get_key(),
                                 ltable=ltable,
                                 rtable=rtable,
                                 foreign_key_ltable=foreign_key_ltable,
                                 foreign_key_rtable=foreign_key_rtable)
        test = mg.create_mtable(table.iloc[test_ind],
                                key=table.get_key(),
                                ltable=ltable,
                                rtable=rtable,
                                foreign_key_ltable=foreign_key_ltable,
                                foreign_key_rtable=foreign_key_rtable)
        # Rule-based matchers need no fitting; predict directly on the test fold.
        if isinstance(matcher, BooleanRuleMatcher):
            pred_col = get_name_for_predict_column(table.columns)
            predicted = matcher.predict(table=test,
                                        append=True,
                                        target_attr=pred_col,
                                        inplace=False)
        else:
            # Clone the underlying scikit-learn estimator so each fold trains
            # a fresh classifier instead of refitting an already-fitted one.
            matcher.clf = clone(matcher.clf)
            matcher.fit(table=train,
                        exclude_attrs=exclude_attrs,
                        target_attr=target_attr)
            pred_col = get_name_for_predict_column(table.columns)
            predicted = matcher.predict(table=test,
                                        exclude_attrs=exclude_attrs,
                                        append=True,
                                        target_attr=pred_col,
                                        inplace=False)

        # Apply each trigger, modifying the fold's predictions in place.
        for t in triggers:
            t.execute(predicted, pred_col, inplace=True)

        eval_summary = eval_matches(predicted, target_attr, pred_col)
        eval_ls.append(eval_summary)
        if mg._progbar:
            bar.update()

    header = ['Metric', 'Num folds']
    fold_header = ['Fold ' + str(i + 1) for i in range(k)]
    header.extend(fold_header)
    header.append('Mean score')
    dict_list = []

    for m in metric:
        d = get_metric_dict(eval_ls, k, m, header)
        dict_list.append(d)
    stats = pd.DataFrame(dict_list)
    stats = stats[header]
    res = OrderedDict()
    res['cv_stats'] = stats
    res['fold_stats'] = eval_ls
    return res
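
A hedged usage sketch for the function above, assuming a labeled candidate MTable G with a 'gold' column, an ML matcher dt, and a configured MatchTrigger pos_trigger; these names (and the DTMatcher constructor) are illustrative assumptions, not part of the source.

# Illustrative sketch: G, dt and pos_trigger are assumed to exist; the call
# pattern follows the signature and docstring of cv_matcher_and_trigger.
dt = mg.DTMatcher()  # hypothetical decision-tree matcher
res = cv_matcher_and_trigger(dt, [pos_trigger], table=G,
                             exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
                             target_attr='gold', k=5, random_state=0)
print(res['cv_stats'])                    # per-fold scores plus mean, per metric
print(res['fold_stats'][0]['precision'])  # raw eval_matches dict for fold 1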
Example 5
    def test_eval_matches_invalid_predicted_attr(self):
        eval_matches(pd.DataFrame(), "", None)
Example 6
    def test_eval_matches_invalid_gold_attr(self):
        eval_matches(pd.DataFrame(), None, "")
Example 7
    def test_eval_matches_invalid_df(self):
        eval_matches(None, "", "")