def vis_debug_rf(matcher, train, test, exclude_attrs, target_attr):
    """
    Visual debugger for random forest matcher

    Fits the matcher on `train`, predicts on `test`, evaluates the
    predictions against `target_attr`, then shows the debugger window and
    runs the application's `exec_()` loop.

    Parameters
    ----------
    matcher : object, RFMatcher object
    train : MTable, containing training data with "True" labels
    test : MTable, containing test data with "True" labels.
            The "True" labels are used for evaluation.
    exclude_attrs : List, attributes to be excluded from train and test,
        for training and testing.

    target_attr : String, column name (in train and test) containing
        'True' labels

    Raises
    ------
    AssertionError
        If train and test do not have the same set of columns, or if some
        of exclude_attrs are not columns of train / test.

    """
    # Raise explicitly instead of using `assert` statements so the checks are
    # not stripped when Python runs with -O. The exception type is unchanged.
    if set(test.columns) != set(train.columns):
        raise AssertionError("The train and test columns are not same")

    excluded = set(exclude_attrs)
    if not excluded.issubset(train.columns):
        raise AssertionError("Some of exclude attrs are not part of train columns")
    # Redundant given the column-equality check above; kept as a safeguard.
    if not excluded.issubset(test.columns):
        raise AssertionError("Some of exclude attrs are not part of test columns")

    # fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs, target_attr=target_attr)

    # predict on the test data into a fresh column name, without touching `test`
    predict_attr_name = get_name_for_predict_column(test.columns)
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # compare predictions with the true labels and collect the mistakes
    eval_summary = mg.eval_matches(predicted, target_attr, predict_attr_name)
    metric = get_metric(eval_summary)
    fp_dataframe = get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = get_dataframe(predicted, eval_summary['false_neg_ls'])

    app = mg._viewapp
    m = MainWindowManager(matcher, "rf", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)
    m.show()
    app.exec_()
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr, show_window=True):
    """
    Validate the inputs, fit a decision tree matcher, evaluate it on the test
    data and build the visual debugger window.

    Parameters
    ----------
    matcher : object, DTMatcher object
    train : table containing training data with "True" labels
    test : table containing test data with "True" labels.
            The "True" labels are used for evaluation.
    exclude_attrs : List (a single attribute is wrapped into a list),
        attributes to be excluded from train and test. The target attribute
        is added to this list if it is not already there.
    target_attr : String, column name containing 'True' labels
    show_window : if truthy (default), show the window and run the
        application's `exec_()` loop; otherwise the window manager is only
        constructed.

    Raises
    ------
    AssertionError
        If matcher is not a DTMatcher, target_attr is not a string, or the
        exclude / target attributes are missing from train or test.

    """
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type Decision Tree matcher')

    if not isinstance(target_attr, six.string_types):
        logger.error('Target attribute is not of type string')
        raise AssertionError('Target attribute is not of type string')

    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the train table columns')

    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the train table columns')

    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the test table columns')

    # The evaluation below reads target_attr from the table derived from
    # `test`, so fail early and consistently if the column is missing there.
    if not check_attrs_present(test, target_attr):
        logger.error('The target attr is not in test table columns')
        raise AssertionError('The target attr is not in the test table columns')

    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    exclude_attrs = list_drop_duplicates(exclude_attrs)

    # the label column must never be used as a feature
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs, target_attr=target_attr)

    predict_attr_name = get_name_for_predict_column(test.columns)

    # predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    eval_summary = mg.eval_matches(predicted, target_attr, predict_attr_name)

    metric = get_metric(eval_summary)
    fp_dataframe = get_dataframe(predicted, eval_summary['false_pos_ls'])

    fn_dataframe = get_dataframe(predicted, eval_summary['false_neg_ls'])
    app = mg._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted, fp_dataframe,
                          fn_dataframe)
    # rely on truthiness rather than comparing against True
    if show_window:
        m.show()
        app.exec_()
Example #3
0
def vis_debug_dt(matcher, train, test, exclude_attrs, target_attr):
    """
    Visual debugger for decision tree matcher

    Fits the matcher on `train`, predicts on `test`, evaluates the
    predictions against `target_attr`, then shows the debugger window and
    runs the application's `exec_()` loop.

    Parameters
    ----------
    matcher : object, DTMatcher object
    train : MTable, containing training data with "True" labels
    test : MTable, containing test data with "True" labels.
            The "True" labels are used for evaluation.
    exclude_attrs : List, attributes to be excluded from train and test,
        for training and testing.

    target_attr : String, column name (in train and test) containing
        'True' labels

    Raises
    ------
    AssertionError
        If train and test do not have the same set of columns, or if some
        of exclude_attrs are not columns of train / test.

    """
    # Raise explicitly instead of using `assert` statements so the checks are
    # not stripped when Python runs with -O. The exception type is unchanged.
    if set(test.columns) != set(train.columns):
        raise AssertionError("The train and test columns are not same")

    wanted = set(exclude_attrs)
    if not wanted.issubset(train.columns):
        raise AssertionError(
            "Some of exclude attrs are not part of train columns")
    # Redundant given the column-equality check above; kept as a safeguard.
    if not wanted.issubset(test.columns):
        raise AssertionError(
            "Some of exclude attrs are not part of test columns")

    # fit using training data
    matcher.fit(table=train,
                exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # predict on the test data into a fresh column name, without touching `test`
    predict_attr_name = get_name_for_predict_column(test.columns)
    predicted = matcher.predict(table=test,
                                exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name,
                                append=True,
                                inplace=False)

    # compare predictions with the true labels and collect the mistakes
    eval_summary = mg.eval_matches(predicted, target_attr, predict_attr_name)
    metric = get_metric(eval_summary)
    fp_dataframe = get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = get_dataframe(predicted, eval_summary['false_neg_ls'])

    app = mg._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)
    m.show()
    app.exec_()
def vis_debug_rm(matcher, validation_set, target_attr, feature_table):
    """
    Launch the visual debugger for a boolean rule-based matcher.

    The matcher's predictions on `validation_set` are written to a freshly
    named column of a separate table (`append=True, inplace=False`),
    evaluated against `target_attr`, and handed to the debugger window
    together with the false positives and false negatives.

    Parameters
    ----------
    matcher : object, Boolean rule-based matcher object
    validation_set : MTable, the data the matcher is debugged on
    target_attr : String, column name in validation_set containing 'True' labels
    feature_table : pandas dataframe, containing feature information

    """
    # choose a prediction column name based on the existing columns
    pred_col = get_name_for_predict_column(validation_set.columns)
    labeled = matcher.predict(validation_set, pred_col,
                              append=True, inplace=False)

    summary = mg.eval_matches(labeled, target_attr, pred_col)
    metric = get_metric(summary)

    # false positives first, then false negatives (lazy, evaluated in order)
    false_pos, false_neg = (get_dataframe(labeled, summary[key])
                            for key in ('false_pos_ls', 'false_neg_ls'))

    qt_app = mg._viewapp
    window = MainWindowManager(matcher, "rm", feature_table, metric, labeled,
                               false_pos, false_neg)
    window.show()
    qt_app.exec_()
Example #5
0
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# Demo driver: block two tables, extract feature vectors from the labeled
# pairs, train a decision tree matcher and open the visual debugger.

# NOTE(review): this runs after `import magellan` above, so it cannot affect
# that import; the path is also machine-specific -- confirm it is still needed.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
S = mg.train_test_split(G, 8, 7)
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], target_attr='gold')
# NOTE(review): the evaluation below reads 'predicted' from S['test'], which
# presumably relies on predict() appending the column in place -- confirm.
dt.predict(table=S['test'], exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], target_attr='predicted',
           append=True)
d = mg.eval_matches(S['test'], 'gold', 'predicted')

# NOTE(review): `d` is the eval_matches result, yet it is passed as the second
# positional argument, and `feat_table` is passed by keyword -- confirm this
# matches the signature of the imported vis_debug_dt.
vis_debug_dt(dt, d, S['test'], exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], feat_table=feat_table)
# print() with a single argument behaves the same on Python 2 and Python 3
print("Hi")
Example #6
0
def _vis_debug_dt(matcher,
                  train,
                  test,
                  exclude_attrs,
                  target_attr,
                  show_window=True):
    """
    Validate the inputs, fit a decision tree matcher, evaluate it on the test
    data and build the visual debugger window.

    Parameters
    ----------
    matcher : object, DTMatcher object
    train : table containing training data with "True" labels
    test : table containing test data with "True" labels.
            The "True" labels are used for evaluation.
    exclude_attrs : List (a single attribute is wrapped into a list),
        attributes to be excluded from train and test. The target attribute
        is added to this list if it is not already there.
    target_attr : String, column name containing 'True' labels
    show_window : if truthy (default), show the window and run the
        application's `exec_()` loop; otherwise the window manager is only
        constructed.

    Raises
    ------
    AssertionError
        If matcher is not a DTMatcher, target_attr is not a string, or the
        exclude / target attributes are missing from train or test.

    """
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError(
            'Input matcher is not of type Decision Tree matcher')

    if not isinstance(target_attr, six.string_types):
        logger.error('Target attribute is not of type string')
        raise AssertionError('Target attribute is not of type string')

    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError(
            'The exclude attrs are not in the train table columns')

    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError(
            'The target attr is not in the train table columns')

    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError(
            'The exclude attrs are not in the test table columns')

    # The evaluation below reads target_attr from the table derived from
    # `test`, so fail early and consistently if the column is missing there.
    if not check_attrs_present(test, target_attr):
        logger.error('The target attr is not in test table columns')
        raise AssertionError(
            'The target attr is not in the test table columns')

    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    exclude_attrs = list_drop_duplicates(exclude_attrs)

    # the label column must never be used as a feature
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # fit using training data
    matcher.fit(table=train,
                exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    predict_attr_name = get_name_for_predict_column(test.columns)

    # predict using the test data
    predicted = matcher.predict(table=test,
                                exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name,
                                append=True,
                                inplace=False)

    eval_summary = mg.eval_matches(predicted, target_attr, predict_attr_name)

    metric = get_metric(eval_summary)
    fp_dataframe = get_dataframe(predicted, eval_summary['false_pos_ls'])

    fn_dataframe = get_dataframe(predicted, eval_summary['false_neg_ls'])
    app = mg._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)
    # rely on truthiness rather than comparing against True
    if show_window:
        m.show()
        app.exec_()
Example #7
0
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# Demo driver: block two tables, extract feature vectors from the labeled
# pairs, train a decision tree matcher and open the visual debugger.

# NOTE(review): this runs after `import magellan` above, so it cannot affect
# that import; the path is also machine-specific -- confirm it is still needed.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
S = mg.train_test_split(G, 8, 7)
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'],
       exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
       target_attr='gold')
# NOTE(review): the evaluation below reads 'predicted' from S['test'], which
# presumably relies on predict() appending the column in place -- confirm.
dt.predict(table=S['test'],
           exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
           target_attr='predicted',
           append=True)
d = mg.eval_matches(S['test'], 'gold', 'predicted')

# NOTE(review): `d` is the eval_matches result, yet it is passed as the second
# positional argument, and `feat_table` is passed by keyword -- confirm this
# matches the signature of the imported vis_debug_dt.
vis_debug_dt(dt,
             d,
             S['test'],
             exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
             feat_table=feat_table)
# print() with a single argument behaves the same on Python 2 and Python 3
print("Hi")
Example #8
0
#rf.fit(table=train, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], target_attr='gold')
#mg.vis_debug_rf(rf, train, test, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
#                                      target_attr='gold')

# ret_val, node_list = mg.vis_tuple_debug_rf_matcher(rf, G.iloc[[0]],
#                                                    exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'])

# print ret_val
# print node_list

# Build a boolean rule-based matcher from three rules and evaluate it.
# NOTE(review): `f` (feature table) and `L` (labeled table) are not defined in
# this snippet -- presumably they come from earlier in the script.
rm = mg.BooleanRuleMatcher()
rm.add_rule(['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6', 'lang_lang_lev(ltuple, rtuple) > 0.5'], feature_table=f)
rm.add_rule(['author_author_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'], feature_table=f)
rm.add_rule(['binding_binding_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.5'], feature_table=f)
X = rm.predict(table=L, target_attr='predicted', append=True, inplace=False)
metric = mg.eval_matches(X, 'gold', 'predicted')
# print() with a single argument behaves the same on Python 2 and Python 3
print(metric)

# ret_val, node_list = mg.vis_tuple_debug_rm_matcher(rm, wal.iloc[0], bwk.iloc[1], feature_table=f)
# print ret_val
# print node_list

#mg.vis_debug_rm(rm, L, 'gold', feature_table=f)

# Configure a match trigger with one condition rule, a condition status and
# an action, then run it over the 'predicted' column of X.
trigger = mg.MatchTrigger()
trigger.add_cond_rule('title_title_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.8', f)
trigger.add_cond_status(True)
trigger.add_action(0)
# print P[['_id', 'gold', 'predicted']]
Y = trigger.execute(X, 'predicted', inplace=False)
print(Y)
print('Hi')