Example #1
0
# Load the two input tables and build the candidate set C by running an
# AttrEquivalenceBlocker over the 'zipcode' column of each table.
# NOTE(review): the two ['name'] lists are presumably the attributes to carry
# over from A and B into C -- confirm against block_tables' signature.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
mg.init_jvm()

# The commented-out calls show how 'label.csv' would be produced from C
# (presumably run once, by hand); the labeled table is read back from disk
# instead and tied to A and B via ltable/rtable.
#L = mg.label_table(C, 'gold')
#L.to_csv('label.csv')
L = mg.read_csv('label.csv', ltable=A, rtable=B)

# Turn the labeled pairs into feature vectors; 'gold' is passed through
# attrs_after so the label column is available on G.
feature_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feature_table, attrs_after='gold')

m = mg.LinRegMatcher()

# cv_matcher_and_trigger is defined outside this snippet. Its second
# positional argument is None here; other snippets in this file pass a list
# of MatchTrigger objects in that position.
t = cv_matcher_and_trigger(
    m,
    None,
    table=G,
    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
    target_attr='gold',
    k=5,
    metric='precision',
    random_state=0)

# Run select_matcher over the single candidate matcher with the same table,
# excluded columns, target, k and random_state, but with metric='f1'.
res = mg.select_matcher(
    [m],
    table=G,
    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
    target_attr='gold',
    k=5,
    metric='f1',
    random_state=0)

# Parenthesized single-argument form: valid as the Python 2 print statement
# and as the Python 3 print() function, with identical output.
print(res['cv_stats'])
Example #2
0
# NOTE(review): `G`, `m` and `feature_table` are not defined in this snippet
# and must already exist when it runs (the original kept
# `m = mg.LinRegMatcher()` and `print G` here as commented-out lines).
# NOTE(review): the excluded columns are spelled 'ltable.id' / 'rtable.id'
# here but 'ltable.ID' / 'rtable.ID' in the cross-validation call below --
# confirm which spelling matches the columns of G.
G = mg.impute_table(
    G,
    exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
    strategy='most_frequent')

# Trigger configured with condition status True and action 1.
# NOTE(review): presumably this forces a prediction of 1 when the rule holds;
# confirm against MatchTrigger.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule(
    'author_author_lev(ltuple, rtuple) == 1',
    feature_table=feature_table)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)

# Trigger configured with condition status False and action 0. It is built
# here but is not passed to the cross-validation call below.
neg_trigger = mg.MatchTrigger()
neg_trigger.add_cond_rule(
    ['lang_lang_lev(ltuple, rtuple) > 0.5'],
    feature_table=feature_table)
neg_trigger.add_cond_status(False)
neg_trigger.add_action(0)

# Cross-validate matcher `m` together with the positive trigger only.
t = cv_matcher_and_trigger(
    m,
    [pos_trigger],
    table=G,
    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
    target_attr='gold',
    k=5,
    metric=['f1'],
    random_state=1)
# Create rule-based matcher and add rules.
# rm = mg.BooleanRuleMatcher()
# rm.add_rule(['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'
#              ],
#             feature_table=feature_table)
# rm.add_rule(['author_author_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'],
#             feature_table=f)
# rm.add_rule(['binding_binding_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.5'],
#             feature_table=f)
#
# X = rm.predict(table=L, target_attr='predicted',
#                append=True, inplace=False)
# eval_summary = mg.eval_matches(X, 'gold', 'predicted')

# q = cv_matcher_and_trigger(rm, [neg_trigger],  table=G, exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
Example #3
0
# NOTE(review): `mg`, `m`, `G`, `feature_table` and `cv_matcher_and_trigger`
# are not defined in this snippet; they must already be in scope.

# Trigger with a single rule (passed as a plain string), condition status
# True and action 1.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule(
    'author_author_lev(ltuple, rtuple) == 1', feature_table=feature_table)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)

# Trigger with a one-element rule list, condition status False and action 0.
# It is constructed but not handed to the call below.
neg_trigger = mg.MatchTrigger()
neg_trigger.add_cond_rule(
    ['lang_lang_lev(ltuple, rtuple) > 0.5'], feature_table=feature_table)
neg_trigger.add_cond_status(False)
neg_trigger.add_action(0)

# Cross-validation of `m` combined with pos_trigger: k=5, metric ['f1'],
# random_state=1; the id columns and the 'gold' label column are excluded.
t = cv_matcher_and_trigger(
    m, [pos_trigger],
    table=G, target_attr='gold',
    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
    k=5, metric=['f1'], random_state=1)
# Create rule-based matcher and add rules.
# rm = mg.BooleanRuleMatcher()
# rm.add_rule(['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'
#              ],
#             feature_table=feature_table)
# rm.add_rule(['author_author_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'],
#             feature_table=f)
# rm.add_rule(['binding_binding_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.5'],
#             feature_table=f)
#
# X = rm.predict(table=L, target_attr='predicted',
#                append=True, inplace=False)