def create_feature_vectors(C, l, r, columns):
    """Build feature vectors for the candidate pairs in ``C``.

    Args:
        C: Candidate-pair table. An ``'id'`` column holding its index is
            added to it in place before the keys are set up.
        l: Left table; only ``columns`` are used.
        r: Right table; only ``columns`` are used.
        columns: Column labels present in both ``l`` and ``r``.

    Returns:
        Whatever ``get_feature_vectors`` returns for ``C`` and the generated
        feature table (called with ``attrs_before=['matching']``).
    """
    # Take explicit copies of the column subsets: helper index columns are
    # added below, and they must land on independent frames rather than on
    # a selection derived from the caller's tables.
    l = l.loc[:, columns].copy()
    r = r.loc[:, columns].copy()

    C['id'] = C.index
    l['a.index'] = l.index
    r['b.index'] = r.index
    setup_keys(C, l, r)

    atypes_l = em.get_attr_types(l)
    atypes_r = em.get_attr_types(r)
    for c in columns:
        if atypes_l[c] != atypes_r[c]:
            # how to do this more gracefully?
            # When the two sides disagree on a column's type, both are
            # forced to the same fixed string type.
            atypes_r[c] = 'str_bt_5w_10w'
            atypes_l[c] = 'str_bt_5w_10w'

    corres = em.get_attr_corres(l, r)
    tok = em.get_tokenizers_for_blocking()
    sim = em.get_sim_funs_for_blocking()
    feature_table = em.get_features(l, r, atypes_l, atypes_r, corres, tok, sim)

    # Generate features
    return get_feature_vectors(C, feature_table, attrs_before=['matching'])
def _get_field_correspondence_list(ltable, rtable, lkey, rkey, attr_corres): corres_list = [] if attr_corres is None or len(attr_corres) == 0: corres_list = mg.get_attr_corres(ltable, rtable)['corres'] if len(corres_list) == 0: raise AssertionError('Error: the field correspondence list' ' is empty. Please specify the field' ' correspondence!') else: for tu in attr_corres: corres_list.append(tu) key_pair = (lkey, rkey) if key_pair not in corres_list: corres_list.append(key_pair) return corres_list
def _get_field_correspondence_list(ltable, rtable, lkey, rkey, attr_corres): corres_list = [] if attr_corres is None or len(attr_corres) == 0: corres_list = em.get_attr_corres(ltable, rtable)['corres'] if len(corres_list) == 0: raise AssertionError('Error: the field correspondence list' ' is empty. Please specify the field' ' correspondence!') else: for tu in attr_corres: corres_list.append(tu) # If the key correspondence is not in the list, add it in. key_pair = (lkey, rkey) if key_pair not in corres_list: corres_list.append(key_pair) return corres_list
def extract_features(ltable_df, rtable_df, candset_df):
    """Generate a feature table for two tables and apply it to a candidate set.

    Args:
        ltable_df: Left table.
        rtable_df: Right table.
        candset_df: Candidate pairs; must carry a ``'gold'`` column, which
            is appended after the features (``attrs_after='gold'``).

    Returns:
        The feature-vector table produced by ``em.extract_feature_vecs``,
        with missing values replaced by 0.
    """
    tokenizers = em.get_tokenizers_for_matching()
    sim_functions = em.get_sim_funs_for_matching()
    left_attr_types = em.get_attr_types(ltable_df)
    right_attr_types = em.get_attr_types(rtable_df)
    correspondences = em.get_attr_corres(ltable_df, rtable_df)

    # When corresponding attributes have different detected types, both
    # sides are set to whichever type has the higher rank below.
    attribute_type_rank = {'boolean': 1, 'numeric': 2, 'str_eq_1w': 3,
                           'str_bt_1w_5w': 4, 'str_bt_5w_10w': 5,
                           'str_gt_10w': 6, 'un_determined': 7}
    for left_attr, right_attr in correspondences['corres']:
        left_type = left_attr_types[left_attr]
        right_type = right_attr_types[right_attr]
        if left_type == right_type:
            continue
        if attribute_type_rank[left_type] < attribute_type_rank[right_type]:
            left_attr_types[left_attr] = right_type
        else:
            right_attr_types[right_attr] = left_type

    feature_records = get_features(ltable_df, rtable_df, left_attr_types,
                                   right_attr_types, correspondences,
                                   tokenizers, sim_functions)

    # Remove all features based on id - they are often useless.
    # Chained (not in-place) reset so the filtered selection is never
    # modified in place.
    feature_records = feature_records[
        feature_records.left_attribute != 'id'].reset_index(drop=True)

    # Drop features whose name contains a distance function or a
    # non-normalized similarity function.
    # NOTE(review): this is a substring test on the whole feature name, so a
    # name containing e.g. "sw" or "aff" for another reason is dropped too
    # - confirm this is acceptable.
    distance_functions = ["lev_dist", "rdf"]
    non_normalized_functions = ["aff", "sw", "swn", "nmw"]
    excluded_functions = distance_functions + non_normalized_functions
    keep_features = [
        not any(func in feature for func in excluded_functions)
        for feature in feature_records["feature_name"]
    ]
    feature_records = feature_records.loc[keep_features, :]

    print("\n\nExtracting the full set of features:")
    candset_features_df = em.extract_feature_vecs(
        candset_df, feature_table=feature_records, attrs_after='gold',
        show_progress=True, n_jobs=-1)
    # In place on purpose: the same table object returned by
    # em.extract_feature_vecs is handed back to the caller.
    candset_features_df.fillna(value=0, inplace=True)
    return candset_features_df
# Persist the development and evaluation splits (with their metadata).
eval_set = train_test['test']
em.to_csv_metadata(dev_set, 'datasets/dev_set.csv')
em.to_csv_metadata(eval_set, 'datasets/eval_set.csv')

# Disabled: a further 90/10 split of dev_set into I and J sets.
# myset = em.split_train_test(dev_set, train_proportion=0.9)
# I_set = myset['train']
# J_set = myset['test']
# em.to_csv_metadata(I_set, 'datasets/I_set.csv')
# em.to_csv_metadata(J_set, 'datasets/J_set.csv')

# Create the feature table for matching from the two sampled tables, using
# the tokenizers and similarity functions the package provides for matching.
match_t = em.get_tokenizers_for_matching()
match_s = em.get_sim_funs_for_matching()
atypes1 = em.get_attr_types(sampled_movies)
atypes2 = em.get_attr_types(sampled_tracks)
match_c = em.get_attr_corres(sampled_movies, sampled_tracks)
match_f = em.get_features(sampled_movies, sampled_tracks, atypes1, atypes2, match_c, match_t, match_s)

# Generate feature vectors for the development set; the 'label' column is
# carried along after the features.
H = em.extract_feature_vecs(dev_set, feature_table=match_f, attrs_after='label', show_progress=False)

# Fill missing values in the feature vectors with 0.
H.fillna(value=0, inplace=True)

# Create a set of learning-based matchers.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
def run_magellan(train_set, valid_set, test_set, feature_combinations, classifiers, experiment_name, write_test_set_for_inspection=False): train_path = os.path.dirname(train_set) train_file = os.path.basename(train_set) test_path = os.path.dirname(test_set) test_file = os.path.basename(test_set) report_train_name = train_file.replace('.csv', '') report_test_name = test_file.replace('.csv', '') train_set_left = train_file.replace('pairs', 'left') train_set_right = train_file.replace('pairs', 'right') test_set_left = test_file.replace('pairs', 'left') test_set_right = test_file.replace('pairs', 'right') os.makedirs(os.path.dirname( '../../../reports/magellan/{}/'.format(experiment_name)), exist_ok=True) try: os.remove('../../../reports/magellan/{}/{}_{}.csv'.format( experiment_name, report_train_name, report_test_name)) except OSError: pass with open( '../../../reports/magellan/{}/{}_{}.csv'.format( experiment_name, report_train_name, report_test_name), "w") as f: f.write( 'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n' ) for run in range(1, 4): for feature_combination in feature_combinations: A_t = em.read_csv_metadata(train_path + '/' + train_set_left, key='mag_id') B_t = em.read_csv_metadata(train_path + '/' + train_set_right, key='mag_id') # Load the pre-labeled data S_t = em.read_csv_metadata(train_set, key='_id', ltable=A_t, rtable=B_t, fk_ltable='ltable_mag_id', fk_rtable='rtable_mag_id') A_gs = em.read_csv_metadata(test_path + '/' + test_set_left, key='mag_id') B_gs = em.read_csv_metadata(test_path + '/' + test_set_right, key='mag_id') # Load the pre-labeled data S_gs = em.read_csv_metadata(test_set, key='_id', ltable=A_gs, rtable=B_gs, fk_ltable='ltable_mag_id', fk_rtable='rtable_mag_id') A_t.fillna('', inplace=True) A_gs.fillna('', 
inplace=True) B_t.fillna('', inplace=True) B_gs.fillna('', inplace=True) S_t.fillna('', inplace=True) S_gs.fillna('', inplace=True) ## DIRTY FIX, CLEAN UP! if 'name' in A_t.columns: A_t["price"] = A_t["price"].replace(r'^\s*$', np.nan, regex=True) A_t["price"] = A_t["price"].astype('float64') A_gs["price"] = A_gs["price"].replace(r'^\s*$', np.nan, regex=True) A_gs["price"] = A_gs["price"].astype('float64') B_t["price"] = B_t["price"].replace(r'^\s*$', np.nan, regex=True) B_t["price"] = B_t["price"].astype('float64') B_gs["price"] = B_gs["price"].replace(r'^\s*$', np.nan, regex=True) B_gs["price"] = B_gs["price"].astype('float64') S_t["ltable_price"] = S_t["ltable_price"].replace(r'^\s*$', np.nan, regex=True) S_t["ltable_price"] = S_t["ltable_price"].astype('float64') S_t["rtable_price"] = S_t["rtable_price"].replace(r'^\s*$', np.nan, regex=True) S_t["rtable_price"] = S_t["rtable_price"].astype('float64') S_gs["ltable_price"] = S_gs["ltable_price"].replace(r'^\s*$', np.nan, regex=True) S_gs["ltable_price"] = S_gs["ltable_price"].astype('float64') S_gs["rtable_price"] = S_gs["rtable_price"].replace(r'^\s*$', np.nan, regex=True) S_gs["rtable_price"] = S_gs["rtable_price"].astype('float64') atypes1 = em.get_attr_types(A_t) atypes2 = em.get_attr_types(B_t) match_c = em.get_attr_corres(A_t, B_t) match_c['corres'] = [] # select attributes to compare for feature in feature_combination: match_c['corres'].append((feature, feature)) tok = em.get_tokenizers_for_matching() sim = em.get_sim_funs_for_matching() F_t = em.get_features(A_t, B_t, atypes1, atypes2, match_c, tok, sim) H_t = em.extract_feature_vecs(S_t, feature_table=F_t, attrs_after=['label', 'pair_id'], show_progress=False) H_gs = em.extract_feature_vecs(S_gs, feature_table=F_t, attrs_after='label', show_progress=False) H_t = H_t.fillna(-1) H_gs = H_gs.fillna(-1) validation_ids_df = pd.read_csv(valid_set) val_df = H_t[H_t['pair_id'].isin( validation_ids_df['pair_id'].values)] train_only_df = H_t[~H_t['pair_id']. 
isin(validation_ids_df['pair_id'].values)] train_only_df = train_only_df.drop(columns='pair_id') val_df = val_df.drop(columns='pair_id') train_only_df = train_only_df.sample(frac=1, random_state=42) pos_neg = H_t['label'].value_counts() pos_neg = round(pos_neg[0] / pos_neg[1]) train_ind = [] val_ind = [] for i in range(len(train_only_df) - 1): train_ind.append(-1) for i in range(len(val_df) - 1): val_ind.append(0) ps = PredefinedSplit(test_fold=np.concatenate((train_ind, val_ind))) train_df = pd.concat([train_only_df, val_df]) for k, v in classifiers.items(): classifier = v['clf'] if 'random_state' in classifier.get_params().keys(): classifier = classifier.set_params(**{'random_state': run}) # add pos_neg ratio to XGBoost params if k == 'XGBoost': v['params']['scale_pos_weight']: [1, pos_neg] model = RandomizedSearchCV(cv=ps, estimator=classifier, param_distributions=v['params'], random_state=42, n_jobs=4, scoring='f1', n_iter=500, pre_dispatch=8, return_train_score=True) feats_train = train_df.drop( ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1) labels_train = train_df['label'] feats_gs = H_gs.drop( ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1) labels_gs = H_gs['label'] try: model.fit(feats_train, labels_train) except ValueError: set_trace() parameters = model.best_params_ score_names = [ 'mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score' ] scores = {} score_string = '' for name in score_names: scores[name] = model.cv_results_[name][model.best_index_] score_string = score_string + name + ': ' + str( scores[name]) + ' ' feature_names = list(feats_train.columns) if k == 'LogisticRegression' or k == 'LinearSVC': most_important_features = model.best_estimator_.coef_ word_importance = zip(feature_names, most_important_features[0].tolist()) word_importance = sorted( word_importance, key=lambda importance: importance[1], reverse=True) if k == 'RandomForest' or k == 'DecisionTree': most_important_features = 
model.best_estimator_.feature_importances_ word_importance = zip(feature_names, most_important_features.tolist()) word_importance = sorted( word_importance, key=lambda importance: importance[1], reverse=True) if k == 'NaiveBayes': word_importance = '' if k == 'XGBoost': most_important_features = model.best_estimator_.feature_importances_ word_importance = zip(feature_names, most_important_features.tolist()) word_importance = sorted( word_importance, key=lambda importance: importance[1], reverse=True) if k == 'LogisticRegression': learner = LogisticRegression(random_state=run, solver='liblinear', **parameters) elif k == 'NaiveBayes': learner = GaussianNB() elif k == 'DecisionTree': learner = DecisionTreeClassifier(random_state=run, **parameters) elif k == 'LinearSVC': learner = LinearSVC(random_state=run, dual=False, **parameters) elif k == 'RandomForest': learner = RandomForestClassifier(random_state=run, n_jobs=4, **parameters) elif k == 'XGBoost': learner = xgb.XGBClassifier(random_state=run, n_jobs=4, **parameters) else: print('Learner is not a valid option') break model = learner feats_train = train_only_df.sample(frac=1, random_state=42) feats_train = train_only_df.drop( ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1) labels_train = train_only_df['label'] start = time.time() model.fit(feats_train, labels_train) end = time.time() train_time = end - start start = time.time() preds_gs = model.predict(feats_gs) end = time.time() pred_time = end - start gs_report = classification_report(labels_gs, preds_gs, output_dict=True) feature_report = '+'.join(feature_combination) if write_test_set_for_inspection: out_path = '../../../data/processed/wdc-lspc/inspection/{}/magellan/'.format( experiment_name) os.makedirs(os.path.dirname(out_path), exist_ok=True) file_name = '_'.join([ os.path.basename(train_set), os.path.basename(test_set), k, feature_report ]) file_name = file_name.replace('.csv', '') file_name += f'_{run}.pkl.gz' test_inspection_df = S_gs.copy() 
if k == 'LinearSVC': proba_gs = model.decision_function(feats_gs).tolist() else: proba_gs = model.predict_proba(feats_gs).tolist() test_inspection_df['pred'] = preds_gs test_inspection_df['Class Prob'] = proba_gs test_inspection_df.to_pickle(out_path + file_name, compression='gzip') with open( '../../../reports/magellan/{}/{}_{}.csv'.format( experiment_name, report_train_name, report_test_name), "a") as f: f.write(feature_report + '#####' + k + '#####' + str(scores['mean_train_score']) + '#####' + str(scores['std_train_score']) + '#####' + str(scores['mean_test_score']) + '#####' + str(scores['std_test_score']) + '#####' + str(gs_report['1']['precision']) + '#####' + str(gs_report['1']['recall']) + '#####' + str(gs_report['1']['f1-score']) + '#####' + str(parameters) + '#####' + str(train_time) + '#####' + str(pred_time) + '#####' + str(word_importance[0:100]) + '#####' + experiment_name + '#####' + report_train_name + '#####' + report_test_name + '\n')
# Overlap blocking on 'artist': block_candset is called with word_level=True
# and overlap_size=1. The candidate count is printed before and after each
# blocking step. print() with a single argument behaves identically on
# Python 2 and Python 3.
print(len(candidate_pairs))
candidate_pairs = ob.block_candset(candidate_pairs, 'artist', 'artist', word_level=True, overlap_size=1, show_progress=True)
print(len(candidate_pairs))
#em.to_csv_metadata(reduced_pairs,'C:/Users/Daniel/Documents/UW/838/Project/Stage3/data/pairs_after_ob_title_and_artist.csv')

# Build the feature table used by the rule-based blocker.
# NOTE(review): this first block_f is overwritten by em.get_features below;
# the call is kept in case it is relied on for side effects - confirm and
# remove if not.
block_f = em.get_features_for_blocking(songs, tracks)
block_c = em.get_attr_corres(songs, tracks)
block_t = em.get_tokenizers_for_blocking()
block_s = em.get_sim_funs_for_blocking()
atypes1 = em.get_attr_types(songs)
atypes2 = em.get_attr_types(tracks)
block_f = em.get_features(songs, tracks, atypes1, atypes2, block_c, block_t, block_s)

# Rule-based blocking with a single rule: name Jaccard score below 0.3.
rb = em.RuleBasedBlocker()
rb.add_rule(["name_name_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.3"], block_f)
candidate_pairs = rb.block_candset(candidate_pairs, show_progress=True)
print(len(candidate_pairs))
# Create the learning-based matchers to compare.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='Naive Bayes')

#********************** Creating Features *******************************************************
#************* Change feature datatype *****************
# The type of attribute 'Name' in B is overridden with the type detected
# for 'Name' in A, so both sides use the same attribute type.
a_types = em.get_attr_types(A)
b_types = em.get_attr_types(B)
b_types['Name'] = a_types['Name']
match_c = em.get_attr_corres(A, B)
# NOTE(review): the match_* names suggest matching, but the tokenizers and
# similarity functions requested here are the ones for blocking - confirm
# this is intended.
match_t = em.get_tokenizers_for_blocking()
match_s = em.get_sim_funs_for_blocking()
F = em.get_features(
    A,
    B,
    a_types,
    b_types,
    match_c,
    match_t,
    match_s,
)

#*********************** Drop Attributes: ISBN_10 and ISBN_13 *****************************************************
G = em.read_csv_metadata("../DATA/Labelled Set G.csv", key="_id", fk_ltable="ltable_ID", fk_rtable="rtable_ID", ltable=A, rtable=B) # Split G into I an J IJ = em.split_train_test(G, train_proportion=0.66, random_state=29) I = IJ['train'] J = IJ['test'] atypes1 = em.get_attr_types(A) atypes2 = em.get_attr_types(B) block_c = em.get_attr_corres(A, B) block_c['corres'] = [('title', 'title'), ('author', 'author')] tok = em.get_tokenizers_for_blocking() sim = em.get_sim_funs_for_blocking() feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim) # Convert the I into a set of feature vectors using F H = em.extract_feature_vecs(I, feature_table=feature_table, attrs_after='label', show_progress=False) # Impute feature vectors with the mean of the column values. H = em.impute_table(H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
train_data = split['train']
test_data = split['test']

# Create the learning-based matchers to compare.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

# Before we can apply any machine learning technique, we need to extract a
# set of features. Fortunately, the **py_entitymatching** package can
# automatically extract a set of features once we specify which columns in
# the two datasets correspond to each other. The following code snippet
# starts by specifying the correspondence between the columns of the two
# datasets. Then, it uses the **py_entitymatching** package to determine the
# type of each column. By considering the types of the columns in each
# dataset (stored in variables *l_attr_types* and *r_attr_types*), and using
# the tokenizers and similarity functions suggested by the package, we can
# extract a set of instructions for extracting features. Note that variable
# **F** is not the set of extracted features; rather, it encodes the
# instructions for computing the features.

# In[27]:
attr_corres = em.get_attr_corres(kaggle_data, imdb_data)
attr_corres['corres'] = [('norm_movie_title', 'norm_title'), ('norm_title_year', 'norm_year'), ('content_rating', 'mpaa'), ('budget', 'budget'), ]
l_attr_types = em.get_attr_types(kaggle_data)
r_attr_types = em.get_attr_types(imdb_data)
tok = em.get_tokenizers_for_matching()
sim = em.get_sim_funs_for_matching()
F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim)