コード例 #1
0
def predict_matching_tuples(A, B, C, G):
    # Split G into I and J for CV
    IJ = em.split_train_test(G, train_proportion=0.5, random_state=0)
    I = IJ['train']
    # Generate features set F
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    # Convert G to a set of feature vectors using F
    H = em.extract_feature_vecs(I,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    # Fill in missing values with column's average
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')
    # Create and train a logistic regression - the best matcher from stage3.
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    # Convert C into a set of features using F
    L = em.extract_feature_vecs(C, feature_table=F, show_progress=False)
    # Fill in missing values with column's average
    L = em.impute_table(L,
                        exclude_attrs=['_id', 'l_id', 'r_id'],
                        strategy='mean')
    # Predict on L with trained matcher
    predictions = lg.predict(table=L,
                             exclude_attrs=['_id', 'l_id', 'r_id'],
                             append=True,
                             target_attr='predicted',
                             inplace=False,
                             return_probs=False,
                             probs_attr='proba')
    # Extract the matched pairs' ids
    matched_pairs = predictions[predictions.predicted == 1]
    matched_ids = matched_pairs[['l_id', 'r_id']]
    # Save matched_pairs to file so we don't have to train and predict each time the code is executed
    matched_ids.to_csv(FOLDER + 'predictedMatchedIDs.csv', index=False)
コード例 #2
0
                          match_c, match_t, match_s)

# generating feature vectors
H = em.extract_feature_vecs(dev_set,
                            feature_table=match_f,
                            attrs_after='label',
                            show_progress=False)

# filling missing values in feature vectors
H.fillna(value=0, inplace=True)

# creating a set of learning-based matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

# Selecting the best matcher using cross-validation

# precision of matchers for 5-fold cross validations
result_p = em.select_matcher(
    [dt, svm, rf, lg, ln, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
    k=5,
    target_attr='label',
    metric='precision',
    random_state=0)
result_p['cv_stats']
コード例 #3
0
ファイル: matcher.py プロジェクト: is4ac/cs839-data-science
def main():
    # Read in data files
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    G = em.read_csv_metadata(FOLDER + 'G.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # labeled data
    # Split G into I and J for CV
    IJ = em.split_train_test(G, train_proportion=0.5, random_state=0)
    I = IJ['train']
    J = IJ['test']
    # Save I and J to files
    I.to_csv(FOLDER + 'I.csv', index=False)
    J.to_csv(FOLDER + 'J.csv', index=False)
    # Generate features set F
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    #print(F.feature_name)
    #print(type(F))
    # Convert I to a set of feature vectors using F
    H = em.extract_feature_vecs(I,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    #print(H.head)
    # Check of missing values
    #print(any(pd.notnull(H)))
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    # Fill in missing values with column's average
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')
    # Create a set of matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')
    # Selecting best matcher with CV using F1-score as criteria
    CV_result = em.select_matcher([dt, rf, svm, ln, lg, nb],
                                  table=H,
                                  exclude_attrs=excluded_attributes,
                                  k=10,
                                  target_attr='label',
                                  metric_to_select_matcher='f1',
                                  random_state=0)
    print(CV_result['cv_stats'])  # RF is the best matcher
    # Train matchers on H
    dt.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    rf.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    svm.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    ln.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    nb.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    # Convert J into a set of features using F
    L = em.extract_feature_vecs(J,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    # Fill in missing values with column's average
    L = em.impute_table(L, exclude_attrs=excluded_attributes, strategy='mean')
    # Predict on L with trained matchers
    predictions_dt = dt.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    predictions_rf = rf.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    predictions_svm = svm.predict(table=L,
                                  exclude_attrs=excluded_attributes,
                                  append=True,
                                  target_attr='predicted',
                                  inplace=False,
                                  return_probs=False,
                                  probs_attr='proba')
    predictions_lg = lg.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    predictions_ln = ln.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    predictions_nb = nb.predict(table=L,
                                exclude_attrs=excluded_attributes,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=False,
                                probs_attr='proba')
    # Evaluate predictions
    dt_eval = em.eval_matches(predictions_dt, 'label', 'predicted')
    em.print_eval_summary(dt_eval)
    rf_eval = em.eval_matches(predictions_rf, 'label', 'predicted')
    em.print_eval_summary(rf_eval)
    svm_eval = em.eval_matches(predictions_svm, 'label', 'predicted')
    em.print_eval_summary(svm_eval)
    lg_eval = em.eval_matches(predictions_lg, 'label', 'predicted')
    em.print_eval_summary(lg_eval)
    ln_eval = em.eval_matches(predictions_ln, 'label', 'predicted')
    em.print_eval_summary(ln_eval)
    nb_eval = em.eval_matches(predictions_nb, 'label', 'predicted')
    em.print_eval_summary(nb_eval)
コード例 #4
0
def run_magellan_models(sampler,
                        blocking,
                        lsh_args,
                        sequential_args,
                        return_prob_estimates=True):
    '''
    1. Loads data from processed folder of dataset choice.
    2. Performs blocking according to given hyper-parameters
    3. For every given blocking set, generate automatic features
    4. Run suite of shallow learning algorithms on candidate sets

    Inputs:
            sampler: sampling technique that was used to generate data: iterative or naive
            blocking: blocking algorithm used: iterative or lsh
            lsh_args = dictionary: seeds, char_ngrams, bands --> dictionary
            sequential_args: cutoff_distance , min_shared_tokens
    Outputs:
            training_pred_dict, validation_pred_dict, test_pred_dict, pre_blocked_all_sets_labels, post_blocked_all_sets_labels
    
    '''
    if (sampler != "iterative") & (sampler != "naive"):
        raise ValueError(
            "Sampler should be iterative or naive (completely random).")

    # Load Training Set according to sampler
    em.del_catalog()
    lhs_table = em.read_csv_metadata(
        "../data/processed_amazon_google/amz_google_" + sampler +
        "_X_train_lhs.csv").rename(columns={"Unnamed: 0": "id_lhs"})
    rhs_table = em.read_csv_metadata(
        "../data/processed_amazon_google/amz_google_" + sampler +
        "_X_train_rhs.csv").rename(columns={"Unnamed: 0": "id_rhs"})
    y_train = pd.read_csv("../data/processed_amazon_google/amz_google_" +
                          sampler + "_y_train.csv")
    em.del_catalog()
    em.set_key(lhs_table, "id_lhs")
    em.set_key(rhs_table, "id_rhs")

    n_train = lhs_table.shape[0]

    # Blocking
    blocking_cols = ["title_amzn", "title_g"]
    feature_cols = [[
        'title_amzn', 'description_amzn', 'manufacturer_amzn', 'price_amzn'
    ], ['title_g', 'description_g', 'manufacturer_g', 'price_g']]
    id_names = ["id_amzn", "id_g"]
    lsh_blocking_col_ids = 1

    print("Blocking Train Set")
    if (blocking == "lsh"):
        # [1,2] hashes on title and description
        candidates = lsh_blocking(lhs_table,
                                  rhs_table,
                                  lsh_blocking_col_ids,
                                  5, ["id_amzn", "id_g"],
                                  char_ngram=lsh_args["char_ngram"],
                                  seeds=lsh_args["seeds"],
                                  bands=lsh_args["bands"])
    elif (blocking == "sequential"):
        # Initial Rough Blocking on Overlapped Attributes
        candidates = overlapped_attribute_blocking(
            lhs_table, rhs_table, blocking_cols,
            sequential_args["min_shared_tokens"], feature_cols, id_names)
        # Fine Grained Blocking on edit distance
        candidates = edit_distance_blocking(None, None, blocking_cols,
                                            sequential_args["cutoff_distance"],
                                            True, candidates)
    else:
        raise ValueError("Blocking must be lsh or sequential")

    print(f"Generated Candidate size has {candidates.shape[0]} rows")

    # Generate Features
    id_names_phrase = ["_amzn",
                       "_g"]  # Trims away these suffixes from id columns
    feature_cols = [
        [
            'title_amzn',
            'description_amzn',  # removed manufacturer due to missingess: produces features with nans
            'price_amzn'
        ],
        ['title_g', 'description_g', 'price_g']
    ]

    generated_df_train = automatic_feature_gen(candidates, feature_cols,
                                               id_names, id_names_phrase)
    generated_df_train = pd.merge(generated_df_train,
                                  y_train,
                                  left_on=["id_amzn", "id_g"],
                                  right_on=["id_amzn", "id_g"],
                                  how="left")
    generated_df_train.y = generated_df_train.y.map({1.0: int(1), np.nan: 0})

    # Store Training Column names. Ensures that if by chance a new column is generated in
    # validation or test phase, these ones will be ignored
    model_features = generated_df_train.columns

    # If only one class is present in blocking stage, skip training a matcher as it would be impossible
    # Essentially label all blocked tuples as being a match
    train_matchers = True
    if (len(generated_df_train.y.unique())) <= 1:
        train_matchers = False
        print(
            f"Train Candidate Pairs only consist of one class. Skipping matcher training and setting blocker as a matcher."
        )

    if train_matchers:
        # Train Models on training set
        #dt = em.DTMatcher(name='DecisionTree', random_state=0)
        #svm = em.SVMMatcher(name='SVM', random_state=0)
        rf = em.RFMatcher(name='RF', random_state=0)
        lg = em.LogRegMatcher(name='LogReg', random_state=0)
        xg = em.XGBoostMatcher(name="Xg-Boost", random_state=0)

        # dt.fit(table = generated_df_train,
        #         exclude_attrs=['index', 'id_amzn','id_g','index_num_lhs', 'index_num_rhs'],
        #         target_attr='y')
        # svm.fit(table = generated_df_train,
        #         exclude_attrs=['index', 'id_amzn','id_g','index_num_lhs', 'index_num_rhs'],
        #         target_attr='y')
        rf.fit(table=generated_df_train,
               exclude_attrs=[
                   'index', 'id_amzn', 'id_g', 'index_num_lhs', 'index_num_rhs'
               ],
               target_attr='y')
        lg.fit(table=generated_df_train,
               exclude_attrs=[
                   'index', 'id_amzn', 'id_g', 'index_num_lhs', 'index_num_rhs'
               ],
               target_attr='y')
        xg.fit(table=generated_df_train,
               exclude_attrs=[
                   'index', 'id_amzn', 'id_g', 'index_num_lhs', 'index_num_rhs'
               ],
               target_attr='y')

        models = [rf, lg, xg]

        training_predictions = {}
        for model in models:
            training_predictions[model.name] = model.predict(
                table=generated_df_train,
                exclude_attrs=[
                    'index', 'id_amzn', 'id_g', 'index_num_lhs',
                    'index_num_rhs', "y"
                ],
                return_probs=return_prob_estimates)

    # Load validation Set + Generate the feature columns
    em.del_catalog()
    lhs_table = em.read_csv_metadata(
        "../data/processed_amazon_google/amz_google_" + sampler +
        "_X_valid_lhs.csv").rename(columns={"Unnamed: 0": "id_lhs"})
    rhs_table = em.read_csv_metadata(
        "../data/processed_amazon_google/amz_google_" + sampler +
        "_X_valid_rhs.csv").rename(columns={"Unnamed: 0": "id_rhs"})
    y_valid = pd.read_csv("../data/processed_amazon_google/amz_google_" +
                          sampler + "_y_valid.csv")
    em.del_catalog()
    em.set_key(lhs_table, "id_lhs")
    em.set_key(rhs_table, "id_rhs")

    n_valid = lhs_table.shape[0]
    print("Blocking Validation Set")
    if (blocking == "lsh"):
        candidates = lsh_blocking(lhs_table,
                                  rhs_table,
                                  lsh_blocking_col_ids,
                                  5, ["id_amzn", "id_g"],
                                  char_ngram=lsh_args["char_ngram"],
                                  seeds=lsh_args["seeds"],
                                  bands=lsh_args["bands"])
    elif (blocking == "sequential"):
        # Initial Rough Blocking on Overlapped Attributes
        candidates = overlapped_attribute_blocking(
            lhs_table, rhs_table, blocking_cols,
            sequential_args["min_shared_tokens"], feature_cols, id_names)
        # Fine Grained Blocking on edit distance
        candidates = edit_distance_blocking(None, None, blocking_cols,
                                            sequential_args["cutoff_distance"],
                                            True, candidates)
    else:
        raise ValueError("Blocking must be lsh or sequential")

    generated_df_valid = automatic_feature_gen(candidates, feature_cols,
                                               id_names, id_names_phrase)
    generated_df_valid = pd.merge(generated_df_valid,
                                  y_valid,
                                  left_on=["id_amzn", "id_g"],
                                  right_on=["id_amzn", "id_g"],
                                  how="left")
    generated_df_valid.y = generated_df_valid.y.map({1.0: int(1), np.nan: 0})
    generated_df_valid = generated_df_valid.loc[:, model_features]
    ## TODO: think of a better idea!! it is because we enforce all generated data sets to have same columns as training set
    generated_df_valid = generated_df_valid.fillna(0)

    if train_matchers:
        # Predict on Validation Set
        validation_predictions = {}
        for model in models:
            validation_predictions[model.name] = model.predict(
                table=generated_df_valid,
                exclude_attrs=[
                    'index', 'id_amzn', 'id_g', 'index_num_lhs',
                    'index_num_rhs', "y"
                ],
                return_probs=return_prob_estimates)

        # Retrain on all data
        generated_final_train = pd.concat(
            [generated_df_train, generated_df_valid], axis=0)

        # dt.fit(table = generated_final_train,
        #         exclude_attrs=['index', 'id_amzn','id_g','index_num_lhs', 'index_num_rhs'],
        #         target_attr='y')
        # svm.fit(table = generated_final_train,
        #         exclude_attrs=['index', 'id_amzn','id_g','index_num_lhs', 'index_num_rhs'],
        #         target_attr='y')
        rf.fit(table=generated_final_train,
               exclude_attrs=[
                   'index', 'id_amzn', 'id_g', 'index_num_lhs', 'index_num_rhs'
               ],
               target_attr='y')
        lg.fit(table=generated_final_train,
               exclude_attrs=[
                   'index', 'id_amzn', 'id_g', 'index_num_lhs', 'index_num_rhs'
               ],
               target_attr='y')
        xg.fit(table=generated_final_train,
               exclude_attrs=[
                   'index', 'id_amzn', 'id_g', 'index_num_lhs', 'index_num_rhs'
               ],
               target_attr='y')

    # Finally Generate Test Set Predictions
    em.del_catalog()
    lhs_table = em.read_csv_metadata(
        "../data/processed_amazon_google/amz_google_" + sampler +
        "_X_test_lhs.csv").rename(columns={"Unnamed: 0": "id_lhs"})
    rhs_table = em.read_csv_metadata(
        "../data/processed_amazon_google/amz_google_" + sampler +
        "_X_test_rhs.csv").rename(columns={"Unnamed: 0": "id_rhs"})
    y_test = pd.read_csv("../data/processed_amazon_google/amz_google_" +
                         sampler + "_y_test.csv")
    em.del_catalog()
    em.set_key(lhs_table, "id_lhs")
    em.set_key(rhs_table, "id_rhs")

    n_test = lhs_table.shape[0]
    print("Blocking Test Set")
    if (blocking == "lsh"):
        candidates = lsh_blocking(lhs_table,
                                  rhs_table,
                                  lsh_blocking_col_ids,
                                  5, ["id_amzn", "id_g"],
                                  char_ngram=lsh_args["char_ngram"],
                                  seeds=lsh_args["seeds"],
                                  bands=lsh_args["bands"])
    elif (blocking == "sequential"):
        # Initial Rough Blocking on Overlapped Attributes
        candidates = overlapped_attribute_blocking(
            lhs_table, rhs_table, blocking_cols,
            sequential_args["min_shared_tokens"], feature_cols, id_names)
        # Fine Grained Blocking on edit distance
        candidates = edit_distance_blocking(None, None, blocking_cols,
                                            sequential_args["cutoff_distance"],
                                            True, candidates)
    else:
        raise ValueError("Blocking must be lsh or sequential")

    generated_df_test = automatic_feature_gen(candidates, feature_cols,
                                              id_names, id_names_phrase)
    generated_df_test = pd.merge(generated_df_test,
                                 y_test,
                                 left_on=["id_amzn", "id_g"],
                                 right_on=["id_amzn", "id_g"],
                                 how="left")
    generated_df_test.y = generated_df_test.y.map({1.0: int(1), np.nan: 0})
    generated_df_test = generated_df_test.loc[:, model_features]
    generated_df_test = generated_df_test.fillna(0)

    if train_matchers:
        # Predict on test Set
        test_predictions = {}
        for model in models:
            print(model.name)
            test_predictions[model.name] = model.predict(
                table=generated_df_test,
                exclude_attrs=[
                    'index', 'id_amzn', 'id_g', 'index_num_lhs',
                    'index_num_rhs', "y"
                ],
                return_probs=return_prob_estimates)

    # Create pre_blocked_all_sets_labels to store truth of candidate tuples after BLOCKING
    pre_blocked_all_sets_labels = {
        "train": y_train,
        "valid": y_valid,
        "test": y_test
    }
    post_blocked_all_sets_labels = {
        "train": generated_df_train[["id_amzn", "id_g", "y"]],
        "valid": generated_df_valid[["id_amzn", "id_g", "y"]],
        "test": generated_df_test[["id_amzn", "id_g", "y"]]
    }

    if (blocking == "lsh"):
        metadata = lsh_args
    else:
        metadata = sequential_args
    print(
        "-----------------------------------------------------------------------------"
    )
    print(
        f"Finished Experiment using {sampler} and {blocking} with params: {metadata} where train_matchers is: {train_matchers}"
    )
    print(
        "-----------------------------------------------------------------------------"
    )
    # Add in sample sizes
    metadata["n_train"] = n_train
    metadata["n_valid"] = n_valid
    metadata["n_test"] = n_test
    metadata["sampler"] = sampler
    metadata["blocking"] = blocking

    # return matcher predictions if train_matchers occurs otherwise return predictions via the blocker
    if train_matchers:
        return (training_predictions, validation_predictions, test_predictions,
                pre_blocked_all_sets_labels, post_blocked_all_sets_labels,
                metadata)
    else:
        training_predictions, validation_predictions, test_predictions = blocker_as_matcher(
            n_train, n_valid, n_test)
        return (training_predictions, validation_predictions, test_predictions,
                pre_blocked_all_sets_labels, post_blocked_all_sets_labels,
                metadata)
コード例 #5
0
# Save Set I
#em.to_csv_metadata(I, './TableI.csv')
# Save Set J
#em.to_csv_metadata(J, './TableJ.csv')

# Automatic feature generation
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
H = em.extract_feature_vecs(I, feature_table=F, attrs_after=['gold_labels'])
# Fill missing values
H.fillna(value='NaN', inplace=True)

# Create ML matchers
dt = em.DTMatcher(name='DecisionTree')
svm = em.SVMMatcher(name='SVM')
rf = em.RFMatcher(name='RandomForest')
lg = em.LogRegMatcher(name='LogisticRegression')
ln = em.LinRegMatcher(name='LinearRegression')
nb = em.NBMatcher(name='NaiveBayes')
# Select the best matcher
result = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
    k=5,
    target_attr='gold_labels',
    metric_to_select_matcher='f1')
print(result['cv_stats'])
best_matcher = result['selected_matcher']

# Evaluate the matcher
L = em.extract_feature_vecs(J, feature_table=F, attrs_after=['gold_labels'])
コード例 #6
0
# * Creating a set of learning-based matchers
# * Creating features
# * Converting the development set into feature vectors
# * Selecting the best learning-based matcher using k-fold cross validation
#

# Creating a set of learning-based matchers
# ------------------

# In[111]:

## Create a set of ML Matchers
dt = em.DTMatcher()
rf = em.RFMatcher()
nb = em.NBMatcher()
logreg = em.LogRegMatcher()
linreg = em.LinRegMatcher()
svm = em.SVMMatcher()

# Creating features
# ------------------

# Next, we need to create a set of features for the development set. py_entitymatching provides a way to automatically generate features based on the attributes in the input tables. We drop the unwanted features from the feature table

# In[113]:

## Generate features
match_f = em.get_features_for_matching(A, B)
match_f.drop([13, 14, 15, 16], inplace=True)

# In[114]:
コード例 #7
0
ファイル: merge.py プロジェクト: is4ac/cs839-data-science
def main():
    # Read in data files
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    G = em.read_csv_metadata(FOLDER + 'G.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # labeled data
    # Split G into I and J for CV
    IJ = em.split_train_test(G, train_proportion=0.5, random_state=0)
    I = IJ['train']
    # Generate features set F
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    # Convert I to a set of feature vectors using F
    H = em.extract_feature_vecs(I,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    # Fill in missing values with column's average
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')
    # Create and train a logistic regression - the best matcher from stage3.
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    # Read in the candidate tuple pairs.
    C = em.read_csv_metadata(FOLDER + 'C.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # labeled data
    # Convert C into a set of features using F
    L = em.extract_feature_vecs(C, feature_table=F, show_progress=False)
    # Fill in missing values with column's average
    L = em.impute_table(L,
                        exclude_attrs=['_id', 'l_id', 'r_id'],
                        strategy='mean')
    # Predict on L with trained matcher
    predictions = lg.predict(table=L,
                             exclude_attrs=['_id', 'l_id', 'r_id'],
                             append=True,
                             target_attr='predicted',
                             inplace=False,
                             return_probs=False,
                             probs_attr='proba')
    # Output the merged table (Basically what matches).
    # We start with rows from A that matches.
    # We then merge value from B into A.
    matched_pairs = predictions[predictions.predicted == 1]
    left_ids = matched_pairs['l_id'].to_frame()
    left_ids.columns = ['id']
    merged = pd.merge(A, left_ids, on='id')
    merged.set_index('id', inplace=True)
    B.set_index('id', inplace=True)
    black_list = {'a872', 'a987'}
    for pair in matched_pairs.itertuples():
        aid = pair.l_id
        bid = pair.r_id
        if (aid in black_list):
            continue
        # Title: keep title from A, if title from B is not an exact matched
        # from A, append B’s title to the alternative title field if B’s title
        # is not already in A’s alternative title.
        m_title = merged.loc[aid, 'title']
        a_title = merged.loc[aid, 'title']
        b_title = B.loc[bid, 'title']
        if (b_title != a_title):
            if pd.isnull(merged.loc[aid, 'alternative_titles']):
                merged.loc[aid, 'alternative_titles'] = b_title
            else:
                alt = set(merged.loc[aid, 'alternative_titles'].split(';'))
                if (b_title not in alt):
                    merged.loc[aid, 'alternative_titles'] += ';' + b_title
        for col in [
                'directors', 'writers', 'cast', 'genres', 'keywords',
                'languages', 'production_companies', 'production_countries'
        ]:
            merged.loc[aid, col] = merge_cell(merged.loc[aid, col], B.loc[bid,
                                                                          col])
        # Content rating: keep A
        # Release year: keep A
        # Opening_weekend_revenue: keep A
        # Run time
        m_runtime = int(
            (merged.loc[aid, 'run_time'] + B.loc[bid, 'run_time']) / 2)
        merged.loc[aid, 'run_time'] = m_runtime
        # Budget and Revenue
        for col in ['budget', 'revenue']:
            merged.loc[aid, col] = merge_money(merged.loc[aid, col],
                                               B.loc[bid, col])
        # Rating: take the average after converting B rating to scale 10.
        m_rating = (merged.loc[aid, 'rating'] + 0.1 * B.loc[bid, 'rating']) / 2
        merged.loc[aid, 'rating'] = m_rating
    merged.to_csv(FOLDER + 'E.csv', index=True)
コード例 #8
0
def main():
    # WELCOME TO MY MAGELLAN RUN SCRIPT
    print("\n-------------WELCOME TO MY MAGELLAN RUN SCRIPT-------------\n")

    # Get the datasets directory
    datasets_dir = 'B:\McMaster\CAS 764 - Advance Topics in Data Management\Project\Data\\'
    print("- Dataset directory: " + datasets_dir)
    print("- List of folders/files: ")
    print(os.listdir(datasets_dir))
    print("- Please enter new dataset folder name:")
    datasets_dir += input()
    print("- Dataset directory set to: " + datasets_dir)

    dateset_dir_files = os.listdir(datasets_dir)
    print("- List of files in dataset folder: ")
    print(dateset_dir_files)

    # Get the path of the input table A
    print("- Enter an index for Table A file (0-x):")
    file_index_A = input()
    filename_A = dateset_dir_files[int(file_index_A)]
    print("Table A file set to: " + filename_A)

    # Get the path of the input table
    path_A = datasets_dir + os.sep + filename_A

    # Get the path of the input table B
    print("- Enter an index for Table B file (0-x):")
    file_index_B = input()
    filename_B = dateset_dir_files[int(file_index_B)]
    print("Table B file set to: " + filename_B)

    # Get the path of the input table
    path_B = datasets_dir + os.sep + filename_B

    # Print Table A column names
    A = em.read_csv_metadata(path_A)
    print("- List of columns of Table A: ")
    print(list(A.columns))
    # Get the Table A id/primary key column name
    print('- Enter Table A primary key column index (ex. 0):')
    pk_A_index = input()
    pk_A = A.columns[int(pk_A_index)]

    # Print Table B column names
    B = em.read_csv_metadata(path_B)
    print("- List of columns of Table B: ")
    print(list(B.columns))
    # Get the Table B id/primary key column name
    print('- Enter Table B primary key column index (ex. 0):')
    pk_B_index = input()
    pk_B = A.columns[int(pk_A_index)]

    # READING TABLES AND SETTING METADATA
    print("\n-------------READING TABLES AND SETTING METADATA-------------\n")

    # Both read csv and set metadata id as ID column
    #A = em.read_csv_metadata(path_A, key=pk_A)
    #B = em.read_csv_metadata(path_B, key=pk_B)
    em.set_key(A, pk_A)
    em.set_key(B, pk_B)

    # Number of tables
    print('- Number of tuples in A: ' + str(len(A)))
    print('- Number of tuples in B: ' + str(len(B)))
    print('- Number of tuples in A X B (i.e the cartesian product): ' +
          str(len(A) * len(B)))

    # Print first 5 tuples of tables
    print(A.head())
    print(B.head())

    # Display the keys of the input tables
    print("- Table A primary key: " + em.get_key(A))
    print("- Table B primary key: " + em.get_key(B))

    # DOWNSAMPLING
    print("\n-------------DOWNSAMPING-------------\n")

    print("- Do you want to use downsampling? (y or n):")
    print("- Table A: " + str(len(A)) + ", Table B: " + str(len(B)))
    print("- NOTE: Recommended if both tables have 100K+ tuples.")
    is_downsample = input()
    if (is_downsample == 'y'):
        print("- Size of the downsampled tables (ex. 200):")
        downsample_size = input()
        # If the tables are large we can downsample the tables like this
        A1, B1 = em.down_sample(A, B, downsample_size, 1, show_progress=False)
        print("- Length of Table A1" + len(A1))
        print("- Length of Table B1" + len(B1))

    # BLOCKING
    print("\n-------------BLOCKING-------------\n")

    print("- Do you want to use blocking? (y or n):")
    is_blocking = input()
    if (is_blocking == 'y'):

        # Check if the 2 tables column names are the same
        if (list(A.columns) == list(B.columns)):
            C_attr_eq = []  # Attr Equ blocker result list
            C_overlap = []  # Overlap blocker result list
            C_blackbox = []  # BlackBox blocker result list

            # Left and right table attribute prefixes
            l_prefix = "ltable_"
            r_prefix = "rtable_"

            print("\n- List of columns: ")
            print(list(A.columns))
            # Labeling output table column selection
            print(
                "\n- Enter the indexes of columns that you want to see in labeling table (0-"
                + str(len(A.columns) - 1) + "):")
            out_attr = []
            for i in range(1, len(A.columns)):
                print("- Finish with empty character(enter+enter) " + str(i))
                add_to_attr = input()
                if (add_to_attr == ''):
                    break
                # Get indexes from user and add columns into out_attr list
                out_attr.append(A.columns[int(add_to_attr)])

            # Print output attributes
            print(out_attr)

            # Loop for adding/combining new blockers
            while (True):
                # Blocker selection
                print(
                    "\n- Do yo want to use Attribute Equivalence[ab] (same), Overlap[ob] (similar) or Blackbox[bb] blocker:"
                )
                blocker_selection = input()

                # ----- Attribute Equivalence Blocker -----
                if (blocker_selection == 'ab'):
                    # Create attribute equivalence blocker
                    ab = em.AttrEquivalenceBlocker()
                    # Counter for indexes
                    attr_eq_counter = 0
                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_overlap[-1])  # Add last output of ob
                            attr_eq_counter += 1  # For skipping block_table function in first time

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_blackbox[-1])  # Add last output of ob
                            attr_eq_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into Attr Equ blocker
                    while (True):
                        # List column names
                        print("\n- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "\n- Which column (w/ index) to use for equivalence blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        # First time using Attr Equ blocker, use A and B
                        if (attr_eq_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_attr_eq.append(
                                ab.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_attr_eq.append(
                                ab.block_candset(C_attr_eq[-1],
                                                 l_block_attr=blocking_col,
                                                 r_block_attr=blocking_col,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print(
                            "\n- Attribute Equivalence Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_attr_eq[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        attr_eq_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_attr_eq[-1])))
                        print(
                            "- Add another column into Attribute Equivalence Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ab_next_operation = input()
                        if (not ab_next_operation.islower()):
                            ab_next_operation = ab_next_operation.lower(
                            )  # Lower case
                        # Continue using Attribute Equivalence Blocker
                        if (ab_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ab_next_operation == 'r'):
                            del C_attr_eq[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Attribute Equivalence Blocker (y or n):"
                            )
                            ab_next_operation = input()
                            if (ab_next_operation == 'n'):
                                break
                        # Finish Attribute Equivalence Blocker
                        else:
                            break

                # ----- Overlap Blocker -----
                elif (blocker_selection == 'ob'):
                    # Create attribute equivalence blocker
                    ob = em.OverlapBlocker()
                    # Counter for indexes
                    overlap_counter = 0
                    # Check if Attribute Equivalence Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_attr_eq[-1])  # Add last output of ab
                            overlap_counter += 1  # For skipping block_table function in first time

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_blackbox[-1])  # Add last output of ob
                            overlap_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into Overlap blocker
                    while (True):
                        # List column names
                        print("- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "- Which column (w/ index) to use for overlap blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        print("\n- Use words as a token? (y or n):")
                        use_world_level = input()
                        if (use_world_level == 'y'):
                            use_world_level = True
                            q_gram_value = None
                        else:
                            use_world_level = False
                            print(
                                "\n- Q-gram q value (ex. 2 --> JO HN SM IT H):"
                            )
                            q_gram_value = input()
                            q_gram_value = int(q_gram_value)

                        print(
                            "\n- Enter the overlap size (# of tokens that overlap):"
                        )
                        overlap_size = input()
                        overlap_size = int(overlap_size)

                        print(
                            "\n- Do you want to remove (a, an, the) from token set? (y or n):"
                        )
                        use_stop_words = input()
                        if (use_stop_words == 'y'):
                            use_stop_words = True
                        else:
                            use_stop_words = False

                        # First time using Overlap blocker, use A and B
                        if (overlap_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_overlap.append(
                                ob.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                rem_stop_words=use_stop_words,
                                                q_val=q_gram_value,
                                                word_level=use_world_level,
                                                overlap_size=overlap_size,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_overlap.append(
                                ob.block_candset(C_overlap[-1],
                                                 l_overlap_attr=blocking_col,
                                                 r_overlap_attr=blocking_col,
                                                 rem_stop_words=use_stop_words,
                                                 q_val=q_gram_value,
                                                 word_level=use_world_level,
                                                 overlap_size=overlap_size,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- Overlap Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_overlap[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        overlap_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_overlap[-1])))
                        print(
                            "- Add another column into Overlap Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ob_next_operation = input()
                        if (not ob_next_operation.islower()):
                            ob_next_operation = ob_next_operation.lower(
                            )  # Lower case
                        # Continue using Overlap Blocker
                        if (ob_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ob_next_operation == 'r'):
                            del C_overlap[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Overlap Blocker (y or n):")
                            ob_next_operation = input()
                            if (ob_next_operation == 'n'):
                                break
                        # Finish Overlap Blocker
                        else:
                            break

                # ----- BlackBox Blocker -----
                elif (blocker_selection == 'bb'):
                    # Create attribute equivalence blocker
                    bb = em.BlackBoxBlocker()
                    # Counter for indexes
                    blackbox_counter = 0
                    # Check if Overlap Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_attr_eq[-1])  # Add last output of ob
                            blackbox_counter += 1  # For skipping block_table function in first time

                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_overlap[-1])  # Add last output of ob
                            blackbox_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into BlackBox blocker
                    while (True):
                        # Set function
                        bb.set_black_box_function(
                            number_10_percent_comparision)

                        # First time using Overlap blocker, use A and B
                        if (overlap_counter == 0):
                            # Block on A and B
                            C_blackbox.append(
                                bb.block_tables(A,
                                                B,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                n_jobs=-1,
                                                show_progress=False))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block on previous (last=-1) candidate set
                            C_blackbox.append(
                                bb.block_candset(C_blackbox[-1],
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- BlackBox Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_blackbox[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        blackbox_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_blackbox[-1])))
                        print(
                            "- Add another column into BlackBox Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        bb_next_operation = input()
                        if (not bb_next_operation.islower()):
                            bb_next_operation = bb_next_operation.lower(
                            )  # Lower case
                        # Continue using Overlap Blocker
                        if (bb_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (bb_next_operation == 'r'):
                            del C_blackbox[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use BlackBox Blocker (y or n):")
                            bb_next_operation = input()
                            if (bb_next_operation == 'n'):
                                break
                        # Finish BlackBox Blocker
                        else:
                            break

                print("\n- Do you want to add/use another blocker? (y or n):")
                blocker_decision = input()
                if (blocker_decision == 'n'):
                    break

            print(
                "\n- Which blocker output you want to use? (Attr Equ-ab, Overlap-ob, BlackBox-bb, Union-un)"
            )
            blocker_output_selection = input()
            # Attribute Equ
            if (blocker_output_selection == "ab"):
                C = C_attr_eq[-1]
            # Overlap
            elif (blocker_output_selection == "ob"):
                C = C_overlap[-1]
                # Overlap
            elif (blocker_output_selection == "bb"):
                C = C_blackbox[-1]
            # Union of blockers
            elif (blocker_output_selection == "un"):
                # Combine/union blockers candidate sets
                print("\n- TODO: Unions Attr Equ and Overlap only!")
                if (C_attr_eq and C_overlap and not C_attr_eq[-1].empty and
                        not C_overlap[-1].empty):  # Both blocker types used
                    C = em.combine_blocker_outputs_via_union(
                        [C_attr_eq[-1], C_overlap[-1]])
                    print(
                        "\n- Blockers candidate set outputs combined via union."
                    )
                else:  # Error
                    C = []
                    print(
                        "\n- ERROR: Candidate set C is empty! Check blockers' results."
                    )
            # Error
            else:
                C = []
                print(
                    "\n- ERROR: Candidate set C is empty! Check blockers' results."
                )
            print("\n- Length of C: " + str(len(C)))

        else:
            print(
                "\n- 2 Tables column names are different, they must be the same"
            )
            print(list(A.columns))
            print(list(B.columns))

    # SAMPLING&LABELING
    print("\n-------------SAMPLING&LABELING-------------\n")

    print("- Choose sampling size (eg. 450):")
    sampling_size = input()
    while (int(sampling_size) > len(C)):
        print("- Sampling size cannot be bigger than " + str(len(C)))
        sampling_size = input()

    # Sample  candidate set
    S = em.sample_table(C, int(sampling_size))

    print("- New window will pop-up for " + sampling_size + " sized table.")
    print("- If there is a match, change tuple's label value to 1")

    # Label S
    G = em.label_table(S, 'label')

    #DEVELOPMENT AND EVALUATION
    print("\n-------------DEVELOPMENT AND EVALUATION-------------\n")

    # Split S into development set (I) and evaluation set (J)
    IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
    I = IJ['train']
    J = IJ['test']

    #SELECTING THE BEST MATCHER
    print("\n-------------SELECTING THE BEST MATCHER-------------\n")

    # Create a set of ML-matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    print(
        "\n- 6 different ML-matchers created: DL, SVM, RF, LogReg, LinReg, NB")

    print("\n- Creating features...")
    # Generate features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    print("\n- Features list:")
    # List the names of the features generated
    print(feature_table['feature_name'])

    print("\n- Converting the development set to feature vectors...")
    # Convert the I into a set of feature vectors using feature_table
    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    print("\n- Feature table first rows:")
    # Display first few rows
    print(H.head())

    # Primary key of tables = prefix + pk = l_id, r_id
    ltable_pk = l_prefix + pk_A
    rtable_pk = r_prefix + pk_B

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(H))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean',
            val_all_nans=0.0)
        #print("\n- Feature table first rows:")
        # Display first few rows
        #print(H.head())
        print("- Impute table function used for missing values.")

    print("\n- Selecting the best matcher using cross-validation...")
    # Select the best ML matcher using CV
    result = em.select_matcher(
        matchers=[dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)
    print("\n- Results:")
    print(result['cv_stats'])

    #DEBUGGING THE MATCHER
    print("\n-------------DEBUGGING THE MATCHER-------------\n")

    #  Split feature vectors into train and test
    UV = em.split_train_test(H, train_proportion=0.5)
    U = UV['train']
    V = UV['test']

    # Debug decision tree using GUI
    em.vis_debug_rf(rf,
                    U,
                    V,
                    exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                    target_attr='label')

    print("\n- Do you want to add another feature?")

    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(H))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(H.head())

    # Select the best ML matcher using CV
    result = em.select_matcher(
        [dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)

    print("\n- Results:")
    print(result['cv_stats'])

    #EVALUATING THE MATCHING OUTPUT
    print("\n-------------EVALUATING THE MATCHING OUTPUT-------------\n")

    print("\n- Converting the evaluation set to feature vectors...")
    # Convert J into a set of feature vectors using feature table
    L = em.extract_feature_vecs(J,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(L))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        L = em.impute_table(
            L,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(L.head())

    print("\n- Training the selected matcher...")
    # Train using feature vectors from I
    rf.fit(table=H,
           exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
           target_attr='label')

    print("\n- Predicting the matches...")
    # Predict on L
    predictions = rf.predict(
        table=L,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        append=True,
        target_attr='predicted',
        inplace=False)

    print("\n- Evaluating the prediction...")
    # Evaluate the predictions
    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    print(em.print_eval_summary(eval_result))

    print("\n- Time elapsed:")
    print(datetime.now() - startTime)

    print("\n-------------END-------------\n")