Example #1
0
def create_feature_vectors(C, l, r, columns):
    """Build feature vectors for the candidate pairs in ``C``.

    The left and right tables are first restricted to ``columns``; the
    original row index of each is then stored in an extra column
    (``'a.index'`` on the left, ``'b.index'`` on the right).  ``C`` itself
    gets an ``'id'`` column holding its index, so the caller's ``C`` is
    modified in place.

    Args:
        C: candidate-pair table (indexed like a DataFrame).
        l: left input table.
        r: right input table.
        columns: names of the columns of ``l`` and ``r`` to build features on.

    Returns:
        Whatever ``get_feature_vectors`` returns for ``C`` and the generated
        feature table, with ``'matching'`` passed as ``attrs_before``.
    """
    # Column-restricted projections; the caller's l and r are not rebound.
    left = l.loc[:, columns]
    right = r.loc[:, columns]

    C['id'] = C.index
    left['a.index'] = left.index
    right['b.index'] = right.index

    # NOTE(review): setup_keys is defined elsewhere; presumably it registers
    # the key metadata for the three tables -- confirm.
    setup_keys(C, left, right)

    left_types = em.get_attr_types(left)
    right_types = em.get_attr_types(right)

    # When the two sides disagree on a column's inferred type, force both to
    # the same type so the column is handled identically on each side.
    fallback_type = 'str_bt_5w_10w'
    for col in columns:
        if left_types[col] != right_types[col]:
            right_types[col] = fallback_type
            left_types[col] = fallback_type

    # Arguments are evaluated left to right: correspondences, tokenizers,
    # then similarity functions.
    feature_table = em.get_features(
        left, right, left_types, right_types,
        em.get_attr_corres(left, right),
        em.get_tokenizers_for_blocking(),
        em.get_sim_funs_for_blocking())

    # Generate features
    return get_feature_vectors(C, feature_table, attrs_before=['matching'])
Example #2
0
                     B,
                     'title',
                     'title',
                     word_level=True,
                     overlap_size=3,
                     l_output_attrs=['title', 'author'],
                     r_output_attrs=['title', 'author'],
                     show_progress=False)
# Second blocking pass over the C1 candidate set, this time on the 'author'
# attribute of both sides (word-level, overlap_size=1, no progress bar).
C2 = ob.block_candset(
    C1, 'author', 'author',
    word_level=True, overlap_size=1, show_progress=False)

# Pick the 'qgm_3' tokenizer and the 'jaccard' similarity function out of the
# blocking helper dictionaries; bbRule below applies them to 'author' values.
qgm_3 = em.get_tokenizers_for_blocking()['qgm_3']
jaccard = em.get_sim_funs_for_blocking()['jaccard']


def bbRule(ltuple, rtuple):
    l_title = ltuple['title'].split()
    r_title = rtuple['title'].split()
    if len(l_title) < 3 and (len(r_title) == len(l_title)):
        for i in range(len(l_title)):
            if (l_title[i] != r_title[i]):
                return True
            if (jaccard(qgm_3(ltuple['author']), qgm_3(rtuple['author'])) <
                    0.5):
                return True
        return False
    else:
Example #3
0
# Size of the candidate set before blocking on 'artist'.
# print(...) with a single argument prints the same text on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(len(candidate_pairs))

# Narrow the existing candidate set using the 'artist' attribute of both
# sides (word-level, overlap_size=1).
candidate_pairs = ob.block_candset(candidate_pairs,
                                   'artist',
                                   'artist',
                                   word_level=True,
                                   overlap_size=1,
                                   show_progress=True)

print(len(candidate_pairs))

#em.to_csv_metadata(reduced_pairs,'C:/Users/Daniel/Documents/UW/838/Project/Stage3/data/pairs_after_ob_title_and_artist.csv')

# NOTE(review): this first block_f is overwritten by em.get_features(...)
# a few lines below, so its value is never used.  The call is kept because
# it may have library-side effects -- confirm before removing it.
block_f = em.get_features_for_blocking(songs, tracks)
block_c = em.get_attr_corres(songs, tracks)
block_t = em.get_tokenizers_for_blocking()
block_s = em.get_sim_funs_for_blocking()

atypes1 = em.get_attr_types(songs)
atypes2 = em.get_attr_types(tracks)

# Build the feature table explicitly from the attribute types,
# correspondences, tokenizers and similarity functions gathered above.
block_f = em.get_features(songs, tracks, atypes1, atypes2, block_c, block_t,
                          block_s)

# Rule-based blocking on the named feature from block_f with threshold 0.3.
# NOTE(review): presumably pairs for which the rule is true are dropped --
# confirm against the RuleBasedBlocker documentation.
rb = em.RuleBasedBlocker()
rb.add_rule(["name_name_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.3"], block_f)

candidate_pairs = rb.block_candset(candidate_pairs, show_progress=True)

print(len(candidate_pairs))