Example #1
def create_feature_vectors(C, l, r, columns):

    l = l.loc[:, columns]
    r = r.loc[:, columns]

    C['id'] = C.index
    l['a.index'] = l.index
    r['b.index'] = r.index

    setup_keys(C, l, r)

    atypes_l = em.get_attr_types(l)
    atypes_r = em.get_attr_types(r)

    for c in columns:
        if atypes_l[c] != atypes_r[c]:  # how to do this more gracefully?
            atypes_r[c] = 'str_bt_5w_10w'
            atypes_l[c] = 'str_bt_5w_10w'

    corres = em.get_attr_corres(l, r)

    tok = em.get_tokenizers_for_blocking()
    sim = em.get_sim_funs_for_blocking()

    feature_table = em.get_features(l, r, atypes_l, atypes_r, corres, tok, sim)

    # Generate feature vectors for the candidate set
    X = get_feature_vectors(C, feature_table, attrs_before=['matching'])

    return X
def _get_field_correspondence_list(ltable, rtable, lkey, rkey, attr_corres):
    corres_list = []
    if attr_corres is None or len(attr_corres) == 0:
        corres_list = mg.get_attr_corres(ltable, rtable)['corres']
        if len(corres_list) == 0:
            raise AssertionError('Error: the field correspondence list'
                                 ' is empty. Please specify the field'
                                 ' correspondence!')
    else:
        for tu in attr_corres:
            corres_list.append(tu)

    key_pair = (lkey, rkey)
    if key_pair not in corres_list:
        corres_list.append(key_pair)

    return corres_list
def _get_field_correspondence_list(ltable, rtable, lkey, rkey, attr_corres):
    corres_list = []
    if attr_corres is None or len(attr_corres) == 0:
        corres_list = em.get_attr_corres(ltable, rtable)['corres']
        if len(corres_list) == 0:
            raise AssertionError('Error: the field correspondence list'
                                 ' is empty. Please specify the field'
                                 ' correspondence!')
    else:
        for tu in attr_corres:
            corres_list.append(tu)

    # If the key correspondence is not in the list, add it in.
    key_pair = (lkey, rkey)
    if key_pair not in corres_list:
        corres_list.append(key_pair)

    return corres_list
Example #4
def extract_features(ltable_df, rtable_df, candset_df):
    tokenizers = em.get_tokenizers_for_matching()
    sim_functions = em.get_sim_funs_for_matching()
    left_attr_types = em.get_attr_types(ltable_df)
    right_attr_types = em.get_attr_types(rtable_df)
    correspondences = em.get_attr_corres(ltable_df, rtable_df)

    feature_dict_list = []
    attribute_type_rank = {'boolean': 1, 'numeric': 2, 'str_eq_1w': 3, 'str_bt_1w_5w': 4,
                           'str_bt_5w_10w': 5, 'str_gt_10w': 6, 'un_determined': 7}
    for c in correspondences['corres']:
        if left_attr_types[c[0]] != right_attr_types[c[1]]:
            if attribute_type_rank[left_attr_types[c[0]]] < attribute_type_rank[right_attr_types[c[1]]]:
                left_attr_types[c[0]] = right_attr_types[c[1]]
            else:
                right_attr_types[c[1]] = left_attr_types[c[0]]

    feature_records = get_features(ltable_df, rtable_df, left_attr_types, right_attr_types,
                                   correspondences, tokenizers, sim_functions)
    # Remove all features based on id - they are often useless
    feature_records = feature_records[feature_records.left_attribute != 'id']
    feature_records.reset_index(inplace=True,drop=True)

    distance_functions = ["lev_dist", "rdf"]
    non_normalized_functions = ["aff", "sw", "swn", "nmw"]
    keep_features = [True]*feature_records.shape[0]
    for i in range(feature_records.shape[0]):
        feature = feature_records.loc[i,"feature_name"]
        for func in distance_functions + non_normalized_functions:
            if func in feature:
                keep_features[i] = False
    feature_records = feature_records.loc[keep_features,:]

    print("\n\nExtracting the full set of features:")
    candset_features_df = em.extract_feature_vecs(candset_df, feature_table=feature_records,
                                                  attrs_after='gold', show_progress=True,
                                                  n_jobs=-1)
    candset_features_df.fillna(value=0, inplace=True)

    return candset_features_df
Example #5
eval_set = train_test['test']
em.to_csv_metadata(dev_set, 'datasets/dev_set.csv')
em.to_csv_metadata(eval_set, 'datasets/eval_set.csv')

# myset = em.split_train_test(dev_set, train_proportion=0.9)
# I_set = myset['train']
# J_set = myset['test']
# em.to_csv_metadata(I_set, 'datasets/I_set.csv')
# em.to_csv_metadata(J_set, 'datasets/J_set.csv')

# creating feature for matching
match_t = em.get_tokenizers_for_matching()
match_s = em.get_sim_funs_for_matching()
atypes1 = em.get_attr_types(sampled_movies)
atypes2 = em.get_attr_types(sampled_tracks)
match_c = em.get_attr_corres(sampled_movies, sampled_tracks)
match_f = em.get_features(sampled_movies, sampled_tracks, atypes1, atypes2,
                          match_c, match_t, match_s)

# generating feature vectors
H = em.extract_feature_vecs(dev_set,
                            feature_table=match_f,
                            attrs_after='label',
                            show_progress=False)

# filling missing values in feature vectors
H.fillna(value=0, inplace=True)

# creating a set of learning-based matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
def run_magellan(train_set,
                 valid_set,
                 test_set,
                 feature_combinations,
                 classifiers,
                 experiment_name,
                 write_test_set_for_inspection=False):
    train_path = os.path.dirname(train_set)
    train_file = os.path.basename(train_set)
    test_path = os.path.dirname(test_set)
    test_file = os.path.basename(test_set)
    report_train_name = train_file.replace('.csv', '')
    report_test_name = test_file.replace('.csv', '')

    train_set_left = train_file.replace('pairs', 'left')
    train_set_right = train_file.replace('pairs', 'right')

    test_set_left = test_file.replace('pairs', 'left')
    test_set_right = test_file.replace('pairs', 'right')

    os.makedirs(os.path.dirname(
        '../../../reports/magellan/{}/'.format(experiment_name)),
                exist_ok=True)

    try:
        os.remove('../../../reports/magellan/{}/{}_{}.csv'.format(
            experiment_name, report_train_name, report_test_name))
    except OSError:
        pass

    with open(
            '../../../reports/magellan/{}/{}_{}.csv'.format(
                experiment_name, report_train_name, report_test_name),
            "w") as f:
        f.write(
            'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n'
        )

    for run in range(1, 4):
        for feature_combination in feature_combinations:

            A_t = em.read_csv_metadata(train_path + '/' + train_set_left,
                                       key='mag_id')
            B_t = em.read_csv_metadata(train_path + '/' + train_set_right,
                                       key='mag_id')
            # Load the pre-labeled data
            S_t = em.read_csv_metadata(train_set,
                                       key='_id',
                                       ltable=A_t,
                                       rtable=B_t,
                                       fk_ltable='ltable_mag_id',
                                       fk_rtable='rtable_mag_id')

            A_gs = em.read_csv_metadata(test_path + '/' + test_set_left,
                                        key='mag_id')
            B_gs = em.read_csv_metadata(test_path + '/' + test_set_right,
                                        key='mag_id')
            # Load the pre-labeled data
            S_gs = em.read_csv_metadata(test_set,
                                        key='_id',
                                        ltable=A_gs,
                                        rtable=B_gs,
                                        fk_ltable='ltable_mag_id',
                                        fk_rtable='rtable_mag_id')

            A_t.fillna('', inplace=True)
            A_gs.fillna('', inplace=True)

            B_t.fillna('', inplace=True)
            B_gs.fillna('', inplace=True)

            S_t.fillna('', inplace=True)
            S_gs.fillna('', inplace=True)

            ## DIRTY FIX, CLEAN UP: coerce empty 'price' strings to NaN so the
            ## price columns can be cast to float64.
            if 'name' in A_t.columns:
                A_t["price"] = A_t["price"].replace(r'^\s*$',
                                                    np.nan,
                                                    regex=True)
                A_t["price"] = A_t["price"].astype('float64')
                A_gs["price"] = A_gs["price"].replace(r'^\s*$',
                                                      np.nan,
                                                      regex=True)
                A_gs["price"] = A_gs["price"].astype('float64')
                B_t["price"] = B_t["price"].replace(r'^\s*$',
                                                    np.nan,
                                                    regex=True)
                B_t["price"] = B_t["price"].astype('float64')
                B_gs["price"] = B_gs["price"].replace(r'^\s*$',
                                                      np.nan,
                                                      regex=True)
                B_gs["price"] = B_gs["price"].astype('float64')

                S_t["ltable_price"] = S_t["ltable_price"].replace(r'^\s*$',
                                                                  np.nan,
                                                                  regex=True)
                S_t["ltable_price"] = S_t["ltable_price"].astype('float64')
                S_t["rtable_price"] = S_t["rtable_price"].replace(r'^\s*$',
                                                                  np.nan,
                                                                  regex=True)
                S_t["rtable_price"] = S_t["rtable_price"].astype('float64')

                S_gs["ltable_price"] = S_gs["ltable_price"].replace(r'^\s*$',
                                                                    np.nan,
                                                                    regex=True)
                S_gs["ltable_price"] = S_gs["ltable_price"].astype('float64')
                S_gs["rtable_price"] = S_gs["rtable_price"].replace(r'^\s*$',
                                                                    np.nan,
                                                                    regex=True)
                S_gs["rtable_price"] = S_gs["rtable_price"].astype('float64')

            atypes1 = em.get_attr_types(A_t)
            atypes2 = em.get_attr_types(B_t)

            match_c = em.get_attr_corres(A_t, B_t)

            match_c['corres'] = []

            # select attributes to compare
            for feature in feature_combination:
                match_c['corres'].append((feature, feature))

            tok = em.get_tokenizers_for_matching()
            sim = em.get_sim_funs_for_matching()

            F_t = em.get_features(A_t, B_t, atypes1, atypes2, match_c, tok,
                                  sim)

            H_t = em.extract_feature_vecs(S_t,
                                          feature_table=F_t,
                                          attrs_after=['label', 'pair_id'],
                                          show_progress=False)
            H_gs = em.extract_feature_vecs(S_gs,
                                           feature_table=F_t,
                                           attrs_after='label',
                                           show_progress=False)

            H_t = H_t.fillna(-1)
            H_gs = H_gs.fillna(-1)

            validation_ids_df = pd.read_csv(valid_set)
            val_df = H_t[H_t['pair_id'].isin(
                validation_ids_df['pair_id'].values)]
            train_only_df = H_t[~H_t['pair_id'].
                                isin(validation_ids_df['pair_id'].values)]

            train_only_df = train_only_df.drop(columns='pair_id')
            val_df = val_df.drop(columns='pair_id')

            train_only_df = train_only_df.sample(frac=1, random_state=42)

            # Ratio of negative to positive examples, later used as
            # scale_pos_weight for XGBoost.
            pos_neg = H_t['label'].value_counts()
            pos_neg = round(pos_neg[0] / pos_neg[1])

            # Mark every training row with -1 (never used as validation) and every
            # validation row with 0, so the fold array has one entry per row of the
            # concatenated train_df below.
            train_ind = [-1] * len(train_only_df)
            val_ind = [0] * len(val_df)

            ps = PredefinedSplit(test_fold=np.concatenate((train_ind,
                                                           val_ind)))

            train_df = pd.concat([train_only_df, val_df])

            for k, v in classifiers.items():

                classifier = v['clf']
                if 'random_state' in classifier.get_params().keys():
                    classifier = classifier.set_params(**{'random_state': run})

                # add pos_neg ratio to XGBoost params
                if k == 'XGBoost':
                    v['params']['scale_pos_weight'] = [1, pos_neg]

                model = RandomizedSearchCV(cv=ps,
                                           estimator=classifier,
                                           param_distributions=v['params'],
                                           random_state=42,
                                           n_jobs=4,
                                           scoring='f1',
                                           n_iter=500,
                                           pre_dispatch=8,
                                           return_train_score=True)

                feats_train = train_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = train_df['label']
                feats_gs = H_gs.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_gs = H_gs['label']

                try:
                    model.fit(feats_train, labels_train)
                except ValueError:
                    set_trace()

                parameters = model.best_params_

                score_names = [
                    'mean_train_score', 'std_train_score', 'mean_test_score',
                    'std_test_score'
                ]
                scores = {}
                score_string = ''
                for name in score_names:
                    scores[name] = model.cv_results_[name][model.best_index_]
                    score_string = score_string + name + ': ' + str(
                        scores[name]) + ' '

                feature_names = list(feats_train.columns)

                # Rank features by model-specific importance (coefficients for the
                # linear models, feature_importances_ for the tree-based models).
                if k == 'LogisticRegression' or k == 'LinearSVC':
                    most_important_features = model.best_estimator_.coef_
                    word_importance = zip(feature_names,
                                          most_important_features[0].tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                elif k == 'RandomForest' or k == 'DecisionTree' or k == 'XGBoost':
                    most_important_features = model.best_estimator_.feature_importances_
                    word_importance = zip(feature_names,
                                          most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                elif k == 'NaiveBayes':
                    word_importance = ''

                if k == 'LogisticRegression':
                    learner = LogisticRegression(random_state=run,
                                                 solver='liblinear',
                                                 **parameters)
                elif k == 'NaiveBayes':
                    learner = GaussianNB()
                elif k == 'DecisionTree':
                    learner = DecisionTreeClassifier(random_state=run,
                                                     **parameters)
                elif k == 'LinearSVC':
                    learner = LinearSVC(random_state=run,
                                        dual=False,
                                        **parameters)
                elif k == 'RandomForest':
                    learner = RandomForestClassifier(random_state=run,
                                                     n_jobs=4,
                                                     **parameters)
                elif k == 'XGBoost':
                    learner = xgb.XGBClassifier(random_state=run,
                                                n_jobs=4,
                                                **parameters)
                else:
                    print('Learner is not a valid option')
                    break

                model = learner
                # Shuffle the final training set, then split into features and labels.
                shuffled_train_df = train_only_df.sample(frac=1, random_state=42)
                feats_train = shuffled_train_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = shuffled_train_df['label']

                start = time.time()
                model.fit(feats_train, labels_train)
                end = time.time()

                train_time = end - start

                start = time.time()
                preds_gs = model.predict(feats_gs)

                end = time.time()

                pred_time = end - start

                gs_report = classification_report(labels_gs,
                                                  preds_gs,
                                                  output_dict=True)

                feature_report = '+'.join(feature_combination)

                if write_test_set_for_inspection:

                    out_path = '../../../data/processed/wdc-lspc/inspection/{}/magellan/'.format(
                        experiment_name)
                    os.makedirs(os.path.dirname(out_path), exist_ok=True)

                    file_name = '_'.join([
                        os.path.basename(train_set),
                        os.path.basename(test_set), k, feature_report
                    ])
                    file_name = file_name.replace('.csv', '')
                    file_name += f'_{run}.pkl.gz'

                    test_inspection_df = S_gs.copy()
                    if k == 'LinearSVC':
                        proba_gs = model.decision_function(feats_gs).tolist()
                    else:
                        proba_gs = model.predict_proba(feats_gs).tolist()
                    test_inspection_df['pred'] = preds_gs
                    test_inspection_df['Class Prob'] = proba_gs
                    test_inspection_df.to_pickle(out_path + file_name,
                                                 compression='gzip')

                with open(
                        '../../../reports/magellan/{}/{}_{}.csv'.format(
                            experiment_name, report_train_name,
                            report_test_name), "a") as f:
                    f.write(feature_report + '#####' + k + '#####' +
                            str(scores['mean_train_score']) + '#####' +
                            str(scores['std_train_score']) + '#####' +
                            str(scores['mean_test_score']) + '#####' +
                            str(scores['std_test_score']) + '#####' +
                            str(gs_report['1']['precision']) + '#####' +
                            str(gs_report['1']['recall']) + '#####' +
                            str(gs_report['1']['f1-score']) + '#####' +
                            str(parameters) + '#####' + str(train_time) +
                            '#####' + str(pred_time) + '#####' +
                            str(word_importance[0:100]) + '#####' +
                            experiment_name + '#####' + report_train_name +
                            '#####' + report_test_name + '\n')
Example #7
print(len(candidate_pairs))

candidate_pairs = ob.block_candset(candidate_pairs,
                                   'artist',
                                   'artist',
                                   word_level=True,
                                   overlap_size=1,
                                   show_progress=True)

print(len(candidate_pairs))

#em.to_csv_metadata(reduced_pairs,'C:/Users/Daniel/Documents/UW/838/Project/Stage3/data/pairs_after_ob_title_and_artist.csv')

block_f = em.get_features_for_blocking(songs, tracks)
block_c = em.get_attr_corres(songs, tracks)
block_t = em.get_tokenizers_for_blocking()
block_s = em.get_sim_funs_for_blocking()

atypes1 = em.get_attr_types(songs)
atypes2 = em.get_attr_types(tracks)

block_f = em.get_features(songs, tracks, atypes1, atypes2, block_c, block_t,
                          block_s)

rb = em.RuleBasedBlocker()
rb.add_rule(["name_name_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.3"], block_f)

candidate_pairs = rb.block_candset(candidate_pairs, show_progress=True)

print(len(candidate_pairs))
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='Naive Bayes')

#**********************Creating Features *******************************************************

#************* Change feature datatype*****************
a_types = em.get_attr_types(A)
b_types = em.get_attr_types(B)
b_types['Name'] = a_types['Name']

match_c = em.get_attr_corres(A, B)
match_t = em.get_tokenizers_for_blocking()
match_s = em.get_sim_funs_for_blocking()

F = em.get_features(
    A,
    B,
    a_types,
    b_types,
    match_c,
    match_t,
    match_s,
)

#***********************          Drop Attributes: ISBN_10 and ISBN_13          *****************************************************
Example #9
G = em.read_csv_metadata("../DATA/Labelled Set G.csv",
                         key="_id",
                         fk_ltable="ltable_ID",
                         fk_rtable="rtable_ID",
                         ltable=A,
                         rtable=B)

# Split G into I an J
IJ = em.split_train_test(G, train_proportion=0.66, random_state=29)
I = IJ['train']
J = IJ['test']

atypes1 = em.get_attr_types(A)
atypes2 = em.get_attr_types(B)

block_c = em.get_attr_corres(A, B)
block_c['corres'] = [('title', 'title'), ('author', 'author')]

tok = em.get_tokenizers_for_blocking()
sim = em.get_sim_funs_for_blocking()

feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)

# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I,
                            feature_table=feature_table,
                            attrs_after='label',
                            show_progress=False)
# Impute feature vectors with the mean of the column values.
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
                    strategy='mean')
Example #10
train_data = split['train']
test_data = split['test']

dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')


# Before we can apply any machine learning technique, we need to extract a set of features. Fortunately, the **py_entitymatching** package can generate these features automatically once we specify which columns in the two datasets correspond to each other. The following code snippet starts by specifying the correspondences between the columns of the two datasets. It then uses **py_entitymatching** to determine the type of each column. From the column types of each dataset (stored in the variables *l_attr_types* and *r_attr_types*) and the tokenizers and similarity functions suggested by the package, it derives a set of instructions for computing features. Note that the variable **F** is not the set of extracted features; rather, it encodes the instructions for computing them.


attr_corres = em.get_attr_corres(kaggle_data, imdb_data)
attr_corres['corres'] = [('norm_movie_title', 'norm_title'),
                         ('norm_title_year', 'norm_year'),
                         ('content_rating', 'mpaa'),
                         ('budget', 'budget')]

l_attr_types = em.get_attr_types(kaggle_data)
r_attr_types = em.get_attr_types(imdb_data)

tok = em.get_tokenizers_for_matching()
sim = em.get_sim_funs_for_matching()

F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim)
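
# As noted above, F only encodes instructions for computing features; a separate call
# materializes the feature vectors. A minimal sketch of that follow-up step, assuming a
# labeled candidate set G (read with em.read_csv_metadata and carrying a 'label' column)
# is available; the name G and its columns are illustrative assumptions, not part of the
# original snippet:
H = em.extract_feature_vecs(G,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)
# Replace missing similarity scores before training a matcher.
H.fillna(value=0, inplace=True)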