def main():
    """Compare six Magellan matchers on the labeled IMDB/TMDB pairs.

    Reads tables A and B and the labeled set G from FOLDER, splits G in half
    into a training part (saved as I.csv) and a test part (saved as J.csv),
    prints 10-fold cross-validation statistics (selection metric: F1) on the
    training part, then fits every matcher on the training part and prints an
    evaluation summary for each one on the test part.
    """
    # Source tables and the labeled candidate pairs.
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    G = em.read_csv_metadata(FOLDER + 'G.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # labeled data

    # 50/50 split of the labeled pairs; both halves are persisted.
    halves = em.split_train_test(G, train_proportion=0.5, random_state=0)
    train_pairs = halves['train']
    test_pairs = halves['test']
    train_pairs.to_csv(FOLDER + 'I.csv', index=False)
    test_pairs.to_csv(FOLDER + 'J.csv', index=False)

    # Automatically generated feature set.
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

    # Columns that are identifiers or the label, never features.
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']

    # Training feature vectors, missing values replaced by the column mean.
    H = em.extract_feature_vecs(train_pairs,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')

    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    # Cross-validated comparison using F1 as the selection criterion.
    CV_result = em.select_matcher([dt, rf, svm, ln, lg, nb],
                                  table=H,
                                  exclude_attrs=excluded_attributes,
                                  k=10,
                                  target_attr='label',
                                  metric_to_select_matcher='f1',
                                  random_state=0)
    print(CV_result['cv_stats'])  # the original author noted RF came out best

    # Order used for fitting, predicting and reporting.
    ordered_matchers = (dt, rf, svm, lg, ln, nb)

    for matcher in ordered_matchers:
        matcher.fit(table=H,
                    exclude_attrs=excluded_attributes,
                    target_attr='label')

    # Test feature vectors, imputed the same way as the training ones.
    L = em.extract_feature_vecs(test_pairs,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    L = em.impute_table(L, exclude_attrs=excluded_attributes, strategy='mean')

    # All predictions are computed before any evaluation is printed.
    all_predictions = [
        matcher.predict(table=L,
                        exclude_attrs=excluded_attributes,
                        append=True,
                        target_attr='predicted',
                        inplace=False,
                        return_probs=False,
                        probs_attr='proba')
        for matcher in ordered_matchers
    ]

    for predicted_table in all_predictions:
        summary = em.eval_matches(predicted_table, 'label', 'predicted')
        em.print_eval_summary(summary)
# Feature table built from the two sampled tables and the attribute types,
# correspondences, tokenizers and similarity functions prepared earlier.
match_f = em.get_features(
    sampled_movies,
    sampled_tracks,
    atypes1,
    atypes2,
    match_c,
    match_t,
    match_s,
)

# Feature vectors for the development set; the label column is kept last.
H = em.extract_feature_vecs(
    dev_set,
    feature_table=match_f,
    attrs_after='label',
    show_progress=False,
)

# Missing feature values are replaced by zero.
H.fillna(value=0, inplace=True)

# Learning-based matchers to compare.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

# 5-fold cross-validation, comparing the matchers on precision.
# NOTE(review): other scripts in this file pass `metric_to_select_matcher=`
# rather than `metric=` -- confirm which keyword the installed
# py_entitymatching version accepts.
result_p = em.select_matcher(
    [dt, svm, rf, lg, ln, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
    k=5,
    target_attr='label',
    metric='precision',
    random_state=0,
)
# Input tables and the labeled pairs that reference them.
A = em.read_csv_metadata('../Data/amazon.csv', key='ID')
B = em.read_csv_metadata('../Data/Barnob.csv', key='ID')
G = em.read_csv_metadata(
    '../Data/Label.csv',
    key='_id',
    ltable=A,
    rtable=B,
    fk_ltable='ltable_ID',
    fk_rtable='rtable_ID',
)

# 60/40 train/test split of the labeled pairs.
IJ = em.split_train_test(G, train_proportion=0.6, random_state=0)
I, J = IJ['train'], IJ['test']

# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
rf = em.RFMatcher(name='Random Forest', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
nb = em.NBMatcher(name='Naive Bayes')
lg = em.LogRegMatcher(name='Logistic Reg', random_state=0)
ln = em.LinRegMatcher(name='Linear Reg')

# Automatically generated features, then training feature vectors with
# missing values replaced by the column mean.
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
H = em.extract_feature_vecs(
    I,
    feature_table=F,
    attrs_after='gold_labels',
    show_progress=False,
)
H = em.impute_table(
    H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
    strategy='mean',
)
def _load_split(sampler, split):
    """Load the lhs/rhs tables and the labels of one split and set their keys."""
    prefix = "../data/processed_amazon_google/amz_google_" + sampler
    em.del_catalog()
    lhs_table = em.read_csv_metadata(
        prefix + "_X_" + split + "_lhs.csv").rename(
            columns={"Unnamed: 0": "id_lhs"})
    rhs_table = em.read_csv_metadata(
        prefix + "_X_" + split + "_rhs.csv").rename(
            columns={"Unnamed: 0": "id_rhs"})
    labels = pd.read_csv(prefix + "_y_" + split + ".csv")
    # rename() returns new DataFrames, so the catalog is cleared and the keys
    # are set on the renamed frames.
    em.del_catalog()
    em.set_key(lhs_table, "id_lhs")
    em.set_key(rhs_table, "id_rhs")
    return lhs_table, rhs_table, labels


def _block_candidates(blocking, lhs_table, rhs_table, lsh_args,
                      sequential_args, blocking_cols, feature_cols, id_names,
                      lsh_blocking_col_ids):
    """Return the candidate pairs produced by the chosen blocking algorithm."""
    if blocking == "lsh":
        return lsh_blocking(lhs_table,
                            rhs_table,
                            lsh_blocking_col_ids,
                            5, ["id_amzn", "id_g"],
                            char_ngram=lsh_args["char_ngram"],
                            seeds=lsh_args["seeds"],
                            bands=lsh_args["bands"])
    if blocking == "sequential":
        # Initial rough blocking on overlapped attributes ...
        candidates = overlapped_attribute_blocking(
            lhs_table, rhs_table, blocking_cols,
            sequential_args["min_shared_tokens"], feature_cols, id_names)
        # ... followed by fine-grained blocking on edit distance.
        return edit_distance_blocking(None, None, blocking_cols,
                                      sequential_args["cutoff_distance"],
                                      True, candidates)
    raise ValueError("Blocking must be lsh or sequential")


def _featurize_candidates(candidates, labels, feature_cols, id_names,
                          id_names_phrase):
    """Generate features for candidate pairs and attach a 0/1 label column y."""
    generated = automatic_feature_gen(candidates, feature_cols, id_names,
                                      id_names_phrase)
    generated = pd.merge(generated,
                         labels,
                         left_on=["id_amzn", "id_g"],
                         right_on=["id_amzn", "id_g"],
                         how="left")
    # Pairs without a row in the label file (NaN after the left join) get 0.
    generated.y = generated.y.map({1.0: int(1), np.nan: 0})
    return generated


def run_magellan_models(sampler,
                        blocking,
                        lsh_args,
                        sequential_args,
                        return_prob_estimates=True):
    """Block, featurize and match the processed Amazon-Google splits.

    For each of the train, validation and test splits the function loads the
    tables, blocks them, generates features for the candidate pairs and joins
    the labels. Matchers (random forest, logistic regression, XGBoost) are
    fitted on the training candidates, used to predict on the training and
    validation candidates, refitted on train + validation, and finally used
    to predict on the test candidates. If the training candidates contain a
    single class, no matcher is trained and ``blocker_as_matcher`` supplies
    the predictions instead.

    Args:
        sampler: sampling technique used to generate the data, ``"iterative"``
            or ``"naive"``; it selects which files are read.
        blocking: blocking algorithm, ``"lsh"`` or ``"sequential"``.
        lsh_args: dict with keys ``"char_ngram"``, ``"seeds"`` and ``"bands"``
            (read only when ``blocking == "lsh"``).
        sequential_args: dict with keys ``"min_shared_tokens"`` and
            ``"cutoff_distance"`` (read only when
            ``blocking == "sequential"``).
        return_prob_estimates: forwarded as ``return_probs`` to every
            ``predict`` call.

    Returns:
        A 6-tuple ``(training_predictions, validation_predictions,
        test_predictions, pre_blocked_all_sets_labels,
        post_blocked_all_sets_labels, metadata)``. When matchers are trained,
        the three prediction objects are dicts keyed by matcher name. The two
        label dicts are keyed by ``"train"``, ``"valid"`` and ``"test"``.
        ``metadata`` is a copy of the arguments of the chosen blocker extended
        with ``n_train``, ``n_valid``, ``n_test``, ``sampler`` and
        ``blocking``; the caller's dict is left untouched.

    Raises:
        ValueError: if ``sampler`` or ``blocking`` is not one of the accepted
            values. Both are checked before any file is read.
    """
    if sampler not in ("iterative", "naive"):
        raise ValueError(
            "Sampler should be iterative or naive (completely random).")
    if blocking not in ("lsh", "sequential"):
        raise ValueError("Blocking must be lsh or sequential")

    blocking_cols = ["title_amzn", "title_g"]
    id_names = ["id_amzn", "id_g"]
    id_names_phrase = ["_amzn", "_g"]  # suffixes trimmed from the id columns
    lsh_blocking_col_ids = 1

    # NOTE(review): the training split is blocked with the full column list,
    # while validation/test blocking and all feature generation use the
    # reduced list below. This mirrors the previous behaviour -- confirm it
    # is intended.
    train_blocking_feature_cols = [[
        'title_amzn', 'description_amzn', 'manufacturer_amzn', 'price_amzn'
    ], ['title_g', 'description_g', 'manufacturer_g', 'price_g']]
    # Manufacturer removed due to missingness: it produces features with NaNs.
    feature_cols = [['title_amzn', 'description_amzn', 'price_amzn'],
                    ['title_g', 'description_g', 'price_g']]

    # Non-feature columns; "y" is the target and is excluded only at predict
    # time (fit receives it through target_attr).
    fit_exclude = [
        'index', 'id_amzn', 'id_g', 'index_num_lhs', 'index_num_rhs'
    ]
    predict_exclude = fit_exclude + ["y"]

    def predict_all(table):
        # Predictions of every trained matcher on `table`, keyed by name.
        return {
            model.name: model.predict(table=table,
                                      exclude_attrs=list(predict_exclude),
                                      return_probs=return_prob_estimates)
            for model in models
        }

    # ---- Training split ----
    lhs_table, rhs_table, y_train = _load_split(sampler, "train")
    n_train = lhs_table.shape[0]

    print("Blocking Train Set")
    candidates = _block_candidates(blocking, lhs_table, rhs_table, lsh_args,
                                   sequential_args, blocking_cols,
                                   train_blocking_feature_cols, id_names,
                                   lsh_blocking_col_ids)
    print(f"Generated Candidate size has {candidates.shape[0]} rows")

    generated_df_train = _featurize_candidates(candidates, y_train,
                                               feature_cols, id_names,
                                               id_names_phrase)

    # Later splits are restricted to the training columns, so a column that
    # only appears in validation or test is ignored.
    model_features = generated_df_train.columns

    # A matcher cannot be trained on a single class; in that case every
    # blocked pair is treated as a match by the blocker-as-matcher fallback.
    train_matchers = True
    if (len(generated_df_train.y.unique())) <= 1:
        train_matchers = False
        print(
            "Train Candidate Pairs only consist of one class. Skipping matcher training and setting blocker as a matcher."
        )

    if train_matchers:
        rf = em.RFMatcher(name='RF', random_state=0)
        lg = em.LogRegMatcher(name='LogReg', random_state=0)
        xg = em.XGBoostMatcher(name="Xg-Boost", random_state=0)
        models = [rf, lg, xg]
        for model in models:
            model.fit(table=generated_df_train,
                      exclude_attrs=list(fit_exclude),
                      target_attr='y')
        training_predictions = predict_all(generated_df_train)

    # ---- Validation split ----
    lhs_table, rhs_table, y_valid = _load_split(sampler, "valid")
    n_valid = lhs_table.shape[0]

    print("Blocking Validation Set")
    candidates = _block_candidates(blocking, lhs_table, rhs_table, lsh_args,
                                   sequential_args, blocking_cols,
                                   feature_cols, id_names,
                                   lsh_blocking_col_ids)
    generated_df_valid = _featurize_candidates(candidates, y_valid,
                                               feature_cols, id_names,
                                               id_names_phrase)
    generated_df_valid = generated_df_valid.loc[:, model_features]
    # TODO: think of a better idea; the zero fill is needed because every
    # generated data set is forced to have the training set's columns.
    generated_df_valid = generated_df_valid.fillna(0)

    if train_matchers:
        validation_predictions = predict_all(generated_df_valid)

        # Retrain on train + validation before predicting on the test split.
        generated_final_train = pd.concat(
            [generated_df_train, generated_df_valid], axis=0)
        for model in models:
            model.fit(table=generated_final_train,
                      exclude_attrs=list(fit_exclude),
                      target_attr='y')

    # ---- Test split ----
    lhs_table, rhs_table, y_test = _load_split(sampler, "test")
    n_test = lhs_table.shape[0]

    print("Blocking Test Set")
    candidates = _block_candidates(blocking, lhs_table, rhs_table, lsh_args,
                                   sequential_args, blocking_cols,
                                   feature_cols, id_names,
                                   lsh_blocking_col_ids)
    generated_df_test = _featurize_candidates(candidates, y_test,
                                              feature_cols, id_names,
                                              id_names_phrase)
    generated_df_test = generated_df_test.loc[:, model_features]
    generated_df_test = generated_df_test.fillna(0)

    if train_matchers:
        test_predictions = {}
        for model in models:
            print(model.name)
            test_predictions[model.name] = model.predict(
                table=generated_df_test,
                exclude_attrs=list(predict_exclude),
                return_probs=return_prob_estimates)

    # Labels as loaded from disk (before blocking) ...
    pre_blocked_all_sets_labels = {
        "train": y_train,
        "valid": y_valid,
        "test": y_test
    }
    # ... and labels of the candidate pairs that survived blocking.
    label_cols = ["id_amzn", "id_g", "y"]
    post_blocked_all_sets_labels = {
        "train": generated_df_train[label_cols],
        "valid": generated_df_valid[label_cols],
        "test": generated_df_test[label_cols]
    }

    # Copy so that the sample sizes added below do not leak into the dict the
    # caller passed in.
    metadata = dict(lsh_args) if blocking == "lsh" else dict(sequential_args)

    print(
        "-----------------------------------------------------------------------------"
    )
    print(
        f"Finished Experiment using {sampler} and {blocking} with params: {metadata} where train_matchers is: {train_matchers}"
    )
    print(
        "-----------------------------------------------------------------------------"
    )

    # Add in sample sizes
    metadata["n_train"] = n_train
    metadata["n_valid"] = n_valid
    metadata["n_test"] = n_test
    metadata["sampler"] = sampler
    metadata["blocking"] = blocking

    # Without trained matchers the predictions come from the blocker itself.
    if not train_matchers:
        training_predictions, validation_predictions, test_predictions = blocker_as_matcher(
            n_train, n_valid, n_test)

    return (training_predictions, validation_predictions, test_predictions,
            pre_blocked_all_sets_labels, post_blocked_all_sets_labels,
            metadata)
# Save Set I
#em.to_csv_metadata(I, './TableI.csv')

# Save Set J
#em.to_csv_metadata(J, './TableJ.csv')

# Automatic feature generation
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
H = em.extract_feature_vecs(I, feature_table=F, attrs_after=['gold_labels'])

# Fill missing feature values with the column mean. The previous
# fillna(value='NaN') wrote the *string* 'NaN' into the feature columns,
# which does not remove the missing values the matchers have to cope with.
H = em.impute_table(
    H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
    strategy='mean')

# Create ML matchers
dt = em.DTMatcher(name='DecisionTree')
svm = em.SVMMatcher(name='SVM')
rf = em.RFMatcher(name='RandomForest')
lg = em.LogRegMatcher(name='LogisticRegression')
ln = em.LinRegMatcher(name='LinearRegression')
nb = em.NBMatcher(name='NaiveBayes')

# Select the best matcher by 5-fold cross-validation on F1
result = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
    k=5,
    target_attr='gold_labels',
    metric_to_select_matcher='f1')
print(result['cv_stats'])
best_matcher = result['selected_matcher']

# Evaluate the matcher
def main():
    """Block, label, train a random forest and write predictions to CSV.

    Reads ltable.csv and rtable.csv, blocks them on overlapping title tokens,
    samples 450 candidate pairs for labeling, trains a random-forest matcher
    on the labeled sample, predicts on the whole candidate set and writes the
    result to ./prediction2.csv with columns ``id`` and ``label``.
    """
    A = em.read_csv_metadata('ltable.csv',
                             key="ltable_id",
                             encoding='ISO-8859-1')
    B = em.read_csv_metadata('rtable.csv',
                             key="rtable_id",
                             encoding='ISO-8859-1')

    # Candidate pairs: records that share at least one title token.
    output_attrs = ['title', 'category', 'brand', 'modelno', 'price']
    ob = em.OverlapBlocker()
    C = ob.block_tables(A,
                        B,
                        'title',
                        'title',
                        l_output_attrs=list(output_attrs),
                        r_output_attrs=list(output_attrs),
                        overlap_size=1,
                        show_progress=False)
    S = em.sample_table(C, 450)

    # NOTE(review): this labeled set is read and then replaced two statements
    # below by the output of em.label_table, so train.csv is never used for
    # training -- confirm which of the two sources is intended.
    G = em.read_csv_metadata("train.csv",
                             key='id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_id',
                             fk_rtable='rtable_id')
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    G = em.label_table(S, 'label')

    # Raw attributes carried next to the feature vectors; never features.
    attrs_from_table = [
        'ltable_title', 'ltable_category', 'ltable_brand', 'ltable_modelno',
        'ltable_price', 'rtable_title', 'rtable_category', 'rtable_brand',
        'rtable_modelno', 'rtable_price'
    ]
    key_attrs = ['_id', 'ltable_ltable_id', 'rtable_rtable_id']

    H = em.extract_feature_vecs(G,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                attrs_after='label',
                                show_progress=False)
    # Numeric zero rather than the string '0', so feature columns stay numeric.
    H.fillna(0, inplace=True)

    rf = em.RFMatcher()
    rf.fit(table=H,
           exclude_attrs=key_attrs + ['label'] + attrs_from_table,
           target_attr='label')

    L = em.extract_feature_vecs(C,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                show_progress=False,
                                n_jobs=-1)
    # Apply the same missing-value handling as for the training vectors.
    L.fillna(0, inplace=True)

    predictions = rf.predict(table=L,
                             exclude_attrs=key_attrs + attrs_from_table,
                             append=True,
                             target_attr='predicted',
                             inplace=False)

    # The predicted labels are in the 'predicted' column (target_attr above);
    # L was extracted without a 'label' column, so predictions['label'] did
    # not exist, and G[0] is not a valid column lookup.
    # NOTE(review): '_id' is used as the pair id here -- confirm that this is
    # the id expected in prediction2.csv.
    dataset = pd.DataFrame({
        "id": predictions['_id'],
        'label': predictions['predicted']
    })
    dataset.to_csv("./prediction2.csv", index=False)
feature_table = em.get_features_for_matching(
    A,
    B,
    validate_inferred_attr_types=False,
)

# Raw attributes to carry in the feature vector table (not used as features).
attrs_from_table = [
    'ltable_name',
    'ltable_addr',
    'ltable_city',
    'ltable_phone',
    'rtable_name',
    'rtable_addr',
    'rtable_city',
    'rtable_phone',
]

# Feature vectors for the labeled pairs, with the gold label kept last.
H = em.extract_feature_vecs(
    G,
    feature_table=feature_table,
    attrs_before=attrs_from_table,
    attrs_after='gold',
    show_progress=False,
)

# Train a random forest on everything except ids, label and raw attributes.
rf = em.RFMatcher()
attrs_to_be_excluded = (['_id', 'ltable_id', 'rtable_id', 'gold'] +
                        attrs_from_table)
rf.fit(table=H, exclude_attrs=attrs_to_be_excluded, target_attr='gold')

# Feature vectors for the full candidate set.
L = em.extract_feature_vecs(
    C,
    feature_table=feature_table,
    attrs_before=attrs_from_table,
    show_progress=False,
    n_jobs=-1,
)

# Start a fresh exclusion list for prediction (no gold column in L).
attrs_to_be_excluded = ['_id', 'ltable_id', 'rtable_id']
def workflow(path_A, path_B, path_labeled):
    """Run the blocking and matching pipeline and return the predicted matches.

    Tables A and B are read from ``path_A`` and ``path_B`` (key ``ID``) and
    blocked in three stages: attribute equivalence on ``Brand``, a rule on the
    ``Clean Name`` Jaccard feature, and the ``screen_ram_hd_equal`` black-box
    function. A random forest is trained on the labeled pairs read from
    ``path_labeled`` and applied to the surviving candidate pairs.

    Returns:
        The rows of the prediction table whose ``predicted`` value is 1,
        restricted to the columns ``_id``, ``ltable_ID``, ``rtable_ID`` and
        ``predicted``.
    """
    # Load csv files as dataframes and set the key attribute.
    A = em.read_csv_metadata(path_A, key='ID')
    B = em.read_csv_metadata(path_B, key='ID')

    # Stage 1: keep only pairs with an identical Brand.
    carried_attrs = [
        'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
        'Hard Drive Capacity', 'Processor Type', 'Processor Speed',
        'Operating System', 'Clean Name'
    ]
    brand_blocker = em.AttrEquivalenceBlocker()
    C1 = brand_blocker.block_tables(A,
                                    B,
                                    'Brand',
                                    'Brand',
                                    l_output_attrs=list(carried_attrs),
                                    r_output_attrs=list(carried_attrs))

    # Stage 2: rule on the 3-gram Jaccard feature of the Clean Name column.
    block_f = em.get_features_for_blocking(A,
                                           B,
                                           validate_inferred_attr_types=False)
    name_rule_blocker = em.RuleBasedBlocker()
    name_rule_blocker.add_rule(
        ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'],
        block_f)
    C2 = name_rule_blocker.block_candset(C1)

    # Stage 3: black-box comparison (screen size, RAM, hard drive capacity).
    hardware_blocker = em.BlackBoxBlocker()
    hardware_blocker.set_black_box_function(screen_ram_hd_equal)
    C = hardware_blocker.block_candset(C2)

    # Labeled pairs used for training.
    L = em.read_csv_metadata(path_labeled,
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_ID',
                             fk_rtable='rtable_ID')

    # Matching features: rows 4-9 and 40 onwards of the generated table,
    # plus the 'refurbished' black-box feature.
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    selected_rows = np.r_[4:10, 40:len(feature_table)]
    feature_subset = feature_table.iloc[selected_rows, :]
    em.add_blackbox_feature(feature_subset, 'refurbished', refurbished)

    id_attrs = ['_id', 'ltable_ID', 'rtable_ID']

    # Training vectors from the labeled data, imputed with the column mean.
    feature_vectors_dev = em.extract_feature_vecs(
        L, feature_table=feature_subset, attrs_after='gold')
    feature_vectors_dev = em.impute_table(feature_vectors_dev,
                                          exclude_attrs=id_attrs + ['gold'],
                                          strategy='mean')

    matcher = em.RFMatcher(name='RF')
    matcher.fit(table=feature_vectors_dev,
                exclude_attrs=id_attrs + ['gold'],
                target_attr='gold')

    # Vectors for the blocked candidate set, imputed the same way.
    feature_vectors = em.extract_feature_vecs(C, feature_table=feature_subset)
    feature_vectors = em.impute_table(feature_vectors,
                                      exclude_attrs=list(id_attrs),
                                      strategy='mean')

    # Predict on the whole candidate set and keep only the predicted matches.
    predictions = matcher.predict(table=feature_vectors,
                                  exclude_attrs=list(id_attrs),
                                  append=True,
                                  target_attr='predicted',
                                  inplace=False)
    predictions = predictions.loc[:, id_attrs + ['predicted']]
    is_match = predictions['predicted'] == 1
    return predictions[is_match]
def main(): # WELCOME TO MY MAGELLAN RUN SCRIPT print("\n-------------WELCOME TO MY MAGELLAN RUN SCRIPT-------------\n") # Get the datasets directory datasets_dir = 'B:\McMaster\CAS 764 - Advance Topics in Data Management\Project\Data\\' print("- Dataset directory: " + datasets_dir) print("- List of folders/files: ") print(os.listdir(datasets_dir)) print("- Please enter new dataset folder name:") datasets_dir += input() print("- Dataset directory set to: " + datasets_dir) dateset_dir_files = os.listdir(datasets_dir) print("- List of files in dataset folder: ") print(dateset_dir_files) # Get the path of the input table A print("- Enter an index for Table A file (0-x):") file_index_A = input() filename_A = dateset_dir_files[int(file_index_A)] print("Table A file set to: " + filename_A) # Get the path of the input table path_A = datasets_dir + os.sep + filename_A # Get the path of the input table B print("- Enter an index for Table B file (0-x):") file_index_B = input() filename_B = dateset_dir_files[int(file_index_B)] print("Table B file set to: " + filename_B) # Get the path of the input table path_B = datasets_dir + os.sep + filename_B # Print Table A column names A = em.read_csv_metadata(path_A) print("- List of columns of Table A: ") print(list(A.columns)) # Get the Table A id/primary key column name print('- Enter Table A primary key column index (ex. 0):') pk_A_index = input() pk_A = A.columns[int(pk_A_index)] # Print Table B column names B = em.read_csv_metadata(path_B) print("- List of columns of Table B: ") print(list(B.columns)) # Get the Table B id/primary key column name print('- Enter Table B primary key column index (ex. 
0):') pk_B_index = input() pk_B = A.columns[int(pk_A_index)] # READING TABLES AND SETTING METADATA print("\n-------------READING TABLES AND SETTING METADATA-------------\n") # Both read csv and set metadata id as ID column #A = em.read_csv_metadata(path_A, key=pk_A) #B = em.read_csv_metadata(path_B, key=pk_B) em.set_key(A, pk_A) em.set_key(B, pk_B) # Number of tables print('- Number of tuples in A: ' + str(len(A))) print('- Number of tuples in B: ' + str(len(B))) print('- Number of tuples in A X B (i.e the cartesian product): ' + str(len(A) * len(B))) # Print first 5 tuples of tables print(A.head()) print(B.head()) # Display the keys of the input tables print("- Table A primary key: " + em.get_key(A)) print("- Table B primary key: " + em.get_key(B)) # DOWNSAMPLING print("\n-------------DOWNSAMPING-------------\n") print("- Do you want to use downsampling? (y or n):") print("- Table A: " + str(len(A)) + ", Table B: " + str(len(B))) print("- NOTE: Recommended if both tables have 100K+ tuples.") is_downsample = input() if (is_downsample == 'y'): print("- Size of the downsampled tables (ex. 200):") downsample_size = input() # If the tables are large we can downsample the tables like this A1, B1 = em.down_sample(A, B, downsample_size, 1, show_progress=False) print("- Length of Table A1" + len(A1)) print("- Length of Table B1" + len(B1)) # BLOCKING print("\n-------------BLOCKING-------------\n") print("- Do you want to use blocking? 
(y or n):") is_blocking = input() if (is_blocking == 'y'): # Check if the 2 tables column names are the same if (list(A.columns) == list(B.columns)): C_attr_eq = [] # Attr Equ blocker result list C_overlap = [] # Overlap blocker result list C_blackbox = [] # BlackBox blocker result list # Left and right table attribute prefixes l_prefix = "ltable_" r_prefix = "rtable_" print("\n- List of columns: ") print(list(A.columns)) # Labeling output table column selection print( "\n- Enter the indexes of columns that you want to see in labeling table (0-" + str(len(A.columns) - 1) + "):") out_attr = [] for i in range(1, len(A.columns)): print("- Finish with empty character(enter+enter) " + str(i)) add_to_attr = input() if (add_to_attr == ''): break # Get indexes from user and add columns into out_attr list out_attr.append(A.columns[int(add_to_attr)]) # Print output attributes print(out_attr) # Loop for adding/combining new blockers while (True): # Blocker selection print( "\n- Do yo want to use Attribute Equivalence[ab] (same), Overlap[ob] (similar) or Blackbox[bb] blocker:" ) blocker_selection = input() # ----- Attribute Equivalence Blocker ----- if (blocker_selection == 'ab'): # Create attribute equivalence blocker ab = em.AttrEquivalenceBlocker() # Counter for indexes attr_eq_counter = 0 # Check if Overlap Blocker used before if (C_overlap and not C_overlap[-1].empty): print( "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_attr_eq.append( C_overlap[-1]) # Add last output of ob attr_eq_counter += 1 # For skipping block_table function in first time # Check if BlackBox Blocker used before if (C_blackbox and not C_blackbox[-1].empty): print( "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_attr_eq.append( C_blackbox[-1]) # Add last output of ob attr_eq_counter += 1 # For skipping block_table function in first time # 
Loop for adding more columns/attributes into Attr Equ blocker while (True): # List column names print("\n- List of columns: ") print(list(A.columns)) # Get blocking attribute/column print( "\n- Which column (w/ index) to use for equivalence blocking? (ex. 1):" ) blocking_col_index = input() blocking_col = A.columns[int(blocking_col_index)] print( "\n- Do you want to add missing values into blocking? (y or n):" ) add_missing_val = input() if (add_missing_val == 'y'): add_missing_val = True else: add_missing_val = False # First time using Attr Equ blocker, use A and B if (attr_eq_counter == 0): # Block using selected (blocking_col) attribute on A and B C_attr_eq.append( ab.block_tables(A, B, blocking_col, blocking_col, l_output_attrs=out_attr, r_output_attrs=out_attr, l_output_prefix=l_prefix, r_output_prefix=r_prefix, allow_missing=add_missing_val, n_jobs=-1)) # Not first time, add new constraint into previous candidate set else: # Block using selected (blocking_col) attribute on previous (last=-1) candidate set C_attr_eq.append( ab.block_candset(C_attr_eq[-1], l_block_attr=blocking_col, r_block_attr=blocking_col, allow_missing=add_missing_val, n_jobs=-1, show_progress=False)) # DEBUG BLOCKING print( "\n- Attribute Equivalence Blocker Debugging...\n") # Debug last blocker output dbg = em.debug_blocker(C_attr_eq[-1], A, B, output_size=200, n_jobs=-1) # Display first few tuple pairs from the debug_blocker's output print("\n- Blocking debug results:") print(dbg.head()) attr_eq_counter += 1 # Increase the counter # Continue to use Attribute Equivalence Blocker or not print("\n- Length of candidate set: " + str(len(C_attr_eq[-1]))) print( "- Add another column into Attribute Equivalence Blocker[a] OR Reset last blocker's output[r]:" ) ab_next_operation = input() if (not ab_next_operation.islower()): ab_next_operation = ab_next_operation.lower( ) # Lower case # Continue using Attribute Equivalence Blocker if (ab_next_operation == 'a'): continue # Reset/remove last 
blocker's output from candidate set list elif (ab_next_operation == 'r'): del C_attr_eq[-1] print("\n- Last blocker output removed!") print( "- Continue to use Attribute Equivalence Blocker (y or n):" ) ab_next_operation = input() if (ab_next_operation == 'n'): break # Finish Attribute Equivalence Blocker else: break # ----- Overlap Blocker ----- elif (blocker_selection == 'ob'): # Create attribute equivalence blocker ob = em.OverlapBlocker() # Counter for indexes overlap_counter = 0 # Check if Attribute Equivalence Blocker used before if (C_attr_eq and not C_attr_eq[-1].empty): print( "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_overlap.append( C_attr_eq[-1]) # Add last output of ab overlap_counter += 1 # For skipping block_table function in first time # Check if BlackBox Blocker used before if (C_blackbox and not C_blackbox[-1].empty): print( "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_overlap.append( C_blackbox[-1]) # Add last output of ob overlap_counter += 1 # For skipping block_table function in first time # Loop for adding more columns/attributes into Overlap blocker while (True): # List column names print("- List of columns: ") print(list(A.columns)) # Get blocking attribute/column print( "- Which column (w/ index) to use for overlap blocking? (ex. 1):" ) blocking_col_index = input() blocking_col = A.columns[int(blocking_col_index)] print( "\n- Do you want to add missing values into blocking? (y or n):" ) add_missing_val = input() if (add_missing_val == 'y'): add_missing_val = True else: add_missing_val = False print("\n- Use words as a token? (y or n):") use_world_level = input() if (use_world_level == 'y'): use_world_level = True q_gram_value = None else: use_world_level = False print( "\n- Q-gram q value (ex. 
2 --> JO HN SM IT H):" ) q_gram_value = input() q_gram_value = int(q_gram_value) print( "\n- Enter the overlap size (# of tokens that overlap):" ) overlap_size = input() overlap_size = int(overlap_size) print( "\n- Do you want to remove (a, an, the) from token set? (y or n):" ) use_stop_words = input() if (use_stop_words == 'y'): use_stop_words = True else: use_stop_words = False # First time using Overlap blocker, use A and B if (overlap_counter == 0): # Block using selected (blocking_col) attribute on A and B C_overlap.append( ob.block_tables(A, B, blocking_col, blocking_col, l_output_attrs=out_attr, r_output_attrs=out_attr, l_output_prefix=l_prefix, r_output_prefix=r_prefix, rem_stop_words=use_stop_words, q_val=q_gram_value, word_level=use_world_level, overlap_size=overlap_size, allow_missing=add_missing_val, n_jobs=-1)) # Not first time, add new constraint into previous candidate set else: # Block using selected (blocking_col) attribute on previous (last=-1) candidate set C_overlap.append( ob.block_candset(C_overlap[-1], l_overlap_attr=blocking_col, r_overlap_attr=blocking_col, rem_stop_words=use_stop_words, q_val=q_gram_value, word_level=use_world_level, overlap_size=overlap_size, allow_missing=add_missing_val, n_jobs=-1, show_progress=False)) # DEBUG BLOCKING print("\n- Overlap Blocker Debugging...\n") # Debug last blocker output dbg = em.debug_blocker(C_overlap[-1], A, B, output_size=200, n_jobs=-1) # Display first few tuple pairs from the debug_blocker's output print("\n- Blocking debug results:") print(dbg.head()) overlap_counter += 1 # Increase the counter # Continue to use Attribute Equivalence Blocker or not print("\n- Length of candidate set: " + str(len(C_overlap[-1]))) print( "- Add another column into Overlap Blocker[a] OR Reset last blocker's output[r]:" ) ob_next_operation = input() if (not ob_next_operation.islower()): ob_next_operation = ob_next_operation.lower( ) # Lower case # Continue using Overlap Blocker if (ob_next_operation == 'a'): 
continue # Reset/remove last blocker's output from candidate set list elif (ob_next_operation == 'r'): del C_overlap[-1] print("\n- Last blocker output removed!") print( "- Continue to use Overlap Blocker (y or n):") ob_next_operation = input() if (ob_next_operation == 'n'): break # Finish Overlap Blocker else: break # ----- BlackBox Blocker ----- elif (blocker_selection == 'bb'): # Create attribute equivalence blocker bb = em.BlackBoxBlocker() # Counter for indexes blackbox_counter = 0 # Check if Overlap Blocker used before if (C_attr_eq and not C_attr_eq[-1].empty): print( "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_blackbox.append( C_attr_eq[-1]) # Add last output of ob blackbox_counter += 1 # For skipping block_table function in first time # Check if Overlap Blocker used before if (C_overlap and not C_overlap[-1].empty): print( "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_blackbox.append( C_overlap[-1]) # Add last output of ob blackbox_counter += 1 # For skipping block_table function in first time # Loop for adding more columns/attributes into BlackBox blocker while (True): # Set function bb.set_black_box_function( number_10_percent_comparision) # First time using Overlap blocker, use A and B if (overlap_counter == 0): # Block on A and B C_blackbox.append( bb.block_tables(A, B, l_output_attrs=out_attr, r_output_attrs=out_attr, l_output_prefix=l_prefix, r_output_prefix=r_prefix, n_jobs=-1, show_progress=False)) # Not first time, add new constraint into previous candidate set else: # Block on previous (last=-1) candidate set C_blackbox.append( bb.block_candset(C_blackbox[-1], n_jobs=-1, show_progress=False)) # DEBUG BLOCKING print("\n- BlackBox Blocker Debugging...\n") # Debug last blocker output dbg = em.debug_blocker(C_blackbox[-1], A, B, output_size=200, n_jobs=-1) # Display first 
few tuple pairs from the debug_blocker's output print("\n- Blocking debug results:") print(dbg.head()) blackbox_counter += 1 # Increase the counter # Continue to use Attribute Equivalence Blocker or not print("\n- Length of candidate set: " + str(len(C_blackbox[-1]))) print( "- Add another column into BlackBox Blocker[a] OR Reset last blocker's output[r]:" ) bb_next_operation = input() if (not bb_next_operation.islower()): bb_next_operation = bb_next_operation.lower( ) # Lower case # Continue using Overlap Blocker if (bb_next_operation == 'a'): continue # Reset/remove last blocker's output from candidate set list elif (bb_next_operation == 'r'): del C_blackbox[-1] print("\n- Last blocker output removed!") print( "- Continue to use BlackBox Blocker (y or n):") bb_next_operation = input() if (bb_next_operation == 'n'): break # Finish BlackBox Blocker else: break print("\n- Do you want to add/use another blocker? (y or n):") blocker_decision = input() if (blocker_decision == 'n'): break print( "\n- Which blocker output you want to use? (Attr Equ-ab, Overlap-ob, BlackBox-bb, Union-un)" ) blocker_output_selection = input() # Attribute Equ if (blocker_output_selection == "ab"): C = C_attr_eq[-1] # Overlap elif (blocker_output_selection == "ob"): C = C_overlap[-1] # Overlap elif (blocker_output_selection == "bb"): C = C_blackbox[-1] # Union of blockers elif (blocker_output_selection == "un"): # Combine/union blockers candidate sets print("\n- TODO: Unions Attr Equ and Overlap only!") if (C_attr_eq and C_overlap and not C_attr_eq[-1].empty and not C_overlap[-1].empty): # Both blocker types used C = em.combine_blocker_outputs_via_union( [C_attr_eq[-1], C_overlap[-1]]) print( "\n- Blockers candidate set outputs combined via union." ) else: # Error C = [] print( "\n- ERROR: Candidate set C is empty! Check blockers' results." ) # Error else: C = [] print( "\n- ERROR: Candidate set C is empty! Check blockers' results." 
) print("\n- Length of C: " + str(len(C))) else: print( "\n- 2 Tables column names are different, they must be the same" ) print(list(A.columns)) print(list(B.columns)) # SAMPLING&LABELING print("\n-------------SAMPLING&LABELING-------------\n") print("- Choose sampling size (eg. 450):") sampling_size = input() while (int(sampling_size) > len(C)): print("- Sampling size cannot be bigger than " + str(len(C))) sampling_size = input() # Sample candidate set S = em.sample_table(C, int(sampling_size)) print("- New window will pop-up for " + sampling_size + " sized table.") print("- If there is a match, change tuple's label value to 1") # Label S G = em.label_table(S, 'label') #DEVELOPMENT AND EVALUATION print("\n-------------DEVELOPMENT AND EVALUATION-------------\n") # Split S into development set (I) and evaluation set (J) IJ = em.split_train_test(G, train_proportion=0.7, random_state=0) I = IJ['train'] J = IJ['test'] #SELECTING THE BEST MATCHER print("\n-------------SELECTING THE BEST MATCHER-------------\n") # Create a set of ML-matchers dt = em.DTMatcher(name='DecisionTree', random_state=0) svm = em.SVMMatcher(name='SVM', random_state=0) rf = em.RFMatcher(name='RF', random_state=0) lg = em.LogRegMatcher(name='LogReg', random_state=0) ln = em.LinRegMatcher(name='LinReg') nb = em.NBMatcher(name='NaiveBayes') print( "\n- 6 different ML-matchers created: DL, SVM, RF, LogReg, LinReg, NB") print("\n- Creating features...") # Generate features feature_table = em.get_features_for_matching( A, B, validate_inferred_attr_types=False) print("\n- Features list:") # List the names of the features generated print(feature_table['feature_name']) print("\n- Converting the development set to feature vectors...") # Convert the I into a set of feature vectors using feature_table H = em.extract_feature_vecs(I, feature_table=feature_table, attrs_after='label', show_progress=False) print("\n- Feature table first rows:") # Display first few rows print(H.head()) # Primary key of tables = 
prefix + pk = l_id, r_id ltable_pk = l_prefix + pk_A rtable_pk = r_prefix + pk_B # Check if the feature vectors contain missing values # A return value of True means that there are missing values is_missing_values = any(pd.notnull(H)) print("\n- Does feature vector have missing values: " + str(is_missing_values)) if (is_missing_values): # Impute feature vectors with the mean of the column values. H = em.impute_table( H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], strategy='mean', val_all_nans=0.0) #print("\n- Feature table first rows:") # Display first few rows #print(H.head()) print("- Impute table function used for missing values.") print("\n- Selecting the best matcher using cross-validation...") # Select the best ML matcher using CV result = em.select_matcher( matchers=[dt, rf, svm, ln, lg, nb], table=H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], k=5, target_attr='label', metric_to_select_matcher='f1', random_state=0) print("\n- Results:") print(result['cv_stats']) #DEBUGGING THE MATCHER print("\n-------------DEBUGGING THE MATCHER-------------\n") # Split feature vectors into train and test UV = em.split_train_test(H, train_proportion=0.5) U = UV['train'] V = UV['test'] # Debug decision tree using GUI em.vis_debug_rf(rf, U, V, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], target_attr='label') print("\n- Do you want to add another feature?") H = em.extract_feature_vecs(I, feature_table=feature_table, attrs_after='label', show_progress=False) # Check if the feature vectors contain missing values # A return value of True means that there are missing values is_missing_values = any(pd.notnull(H)) print("\n- Does feature vector have missing values: " + str(is_missing_values)) if (is_missing_values): # Impute feature vectors with the mean of the column values. 
H = em.impute_table( H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], strategy='mean') print("\n- Feature table first rows:") # Display first few rows print(H.head()) # Select the best ML matcher using CV result = em.select_matcher( [dt, rf, svm, ln, lg, nb], table=H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], k=5, target_attr='label', metric_to_select_matcher='f1', random_state=0) print("\n- Results:") print(result['cv_stats']) #EVALUATING THE MATCHING OUTPUT print("\n-------------EVALUATING THE MATCHING OUTPUT-------------\n") print("\n- Converting the evaluation set to feature vectors...") # Convert J into a set of feature vectors using feature table L = em.extract_feature_vecs(J, feature_table=feature_table, attrs_after='label', show_progress=False) # Check if the feature vectors contain missing values # A return value of True means that there are missing values is_missing_values = any(pd.notnull(L)) print("\n- Does feature vector have missing values: " + str(is_missing_values)) if (is_missing_values): # Impute feature vectors with the mean of the column values. L = em.impute_table( L, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], strategy='mean') print("\n- Feature table first rows:") # Display first few rows print(L.head()) print("\n- Training the selected matcher...") # Train using feature vectors from I rf.fit(table=H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], target_attr='label') print("\n- Predicting the matches...") # Predict on L predictions = rf.predict( table=L, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], append=True, target_attr='predicted', inplace=False) print("\n- Evaluating the prediction...") # Evaluate the predictions eval_result = em.eval_matches(predictions, 'label', 'predicted') print(em.print_eval_summary(eval_result)) print("\n- Time elapsed:") print(datetime.now() - startTime) print("\n-------------END-------------\n")