Ejemplo n.º 1
0
def main():
    """Run a small IMDB/TMDB matching demo with a decision-tree matcher.

    The two tables are blocked on equal ``directors`` values, 100 candidate
    pairs are sampled and labelled through ``em.label_table``, and the
    labelled pairs are split 50/50.  A depth-5 decision tree is fitted on
    the first half and its predictions on the second half are evaluated
    against the ``gold_labels`` column.  Intermediate tables and the
    evaluation summary are printed; nothing is returned.
    """
    # Columns that are carried along in the feature-vector tables but must
    # not be used as features by the matcher.
    non_feature_cols = ('_id', 'ltable_id', 'rtable_id', 'ltable_title',
                        'rtable_title', 'gold_labels')

    imdb = em.read_csv_metadata('../Data/A_imdb.csv', key='id')
    tmdb = em.read_csv_metadata('../Data/B_tmdb.csv', key='id')

    # Candidate set: pairs whose 'directors' attribute is identical.
    blocker = em.AttrEquivalenceBlocker()
    output_attrs = ['title', 'directors', 'release_year', 'languages']
    candidates = blocker.block_tables(imdb, tmdb, 'directors', 'directors',
                                      l_output_attrs=output_attrs,
                                      r_output_attrs=output_attrs)

    # Sample 100 candidate pairs and label them.
    sample = em.sample_table(candidates, 100)
    print(sample)
    labelled = em.label_table(sample, label_column_name='gold_labels')
    halves = em.split_train_test(labelled, train_proportion=0.5)
    train_pairs, test_pairs = halves['train'], halves['test']

    matching_features = em.get_features_for_matching(imdb, tmdb)

    def vectorise(pairs):
        """Extract feature vectors for ``pairs``, zero-fill NaNs, print them."""
        vectors = em.extract_feature_vecs(
            pairs,
            attrs_before=['ltable_title', 'rtable_title'],
            feature_table=matching_features,
            attrs_after=['gold_labels'])
        # Filled in place so the same table object keeps being used.
        vectors.fillna(value=0, inplace=True)
        print(vectors)
        return vectors

    # Train a decision-tree matcher on the training half.
    train_vectors = vectorise(train_pairs)
    tree = em.DTMatcher(max_depth=5)
    tree.fit(table=train_vectors,
             exclude_attrs=list(non_feature_cols),
             target_attr='gold_labels')

    # Predict on the held-out half, appending labels and probabilities.
    test_vectors = vectorise(test_pairs)
    scored = tree.predict(table=test_vectors,
                          exclude_attrs=list(non_feature_cols),
                          target_attr='predicted_labels',
                          return_probs=True,
                          probs_attr='proba',
                          append=True,
                          inplace=True)
    print(scored)

    summary = em.eval_matches(scored, 'gold_labels', 'predicted_labels')
    em.print_eval_summary(summary)
Ejemplo n.º 2
0
def compute_accuracy_J(matcher, return_probs_arg, H, J):
    """Fit ``matcher`` on ``H`` and print its evaluation summary on ``J``.

    Args:
        matcher: Matcher object exposing ``fit`` and ``predict``.
        return_probs_arg: Forwarded to ``matcher.predict`` as ``return_probs``.
        H: Feature-vector table used for training; must contain ``label``.
        J: Labelled candidate pairs that are converted to feature vectors
            and used for evaluation.

    The feature table is read from the module-level name ``F``.  The
    function prints the summary and returns ``None``.
    """
    # Bookkeeping columns plus the target: never used as features.
    non_feature_cols = ('_id', 'ltable_ID', 'rtable_ID', 'label')

    matcher.fit(table=H,
                exclude_attrs=list(non_feature_cols),
                target_attr='label')

    # Feature vectors for J, with missing values replaced by column means.
    raw_vectors = em.extract_feature_vecs(J,
                                          feature_table=F,
                                          attrs_after='label',
                                          show_progress=False)
    eval_vectors = em.impute_table(raw_vectors,
                                   exclude_attrs=list(non_feature_cols),
                                   strategy='mean')

    predicted = matcher.predict(table=eval_vectors,
                                exclude_attrs=list(non_feature_cols),
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=return_probs_arg,
                                probs_attr='proba')

    # Compare the gold 'label' column with the 'predicted' column.
    summary = em.eval_matches(predicted, 'label', 'predicted')
    em.print_eval_summary(summary)
Ejemplo n.º 3
0
    def predict(self, dataset, impute_value=0):
        """Return the match score of every row of ``dataset``.

        A copy of ``dataset`` is turned into an ``em`` candidate set: each
        row receives the same running number as key and as left/right
        foreign key, and the ``self.lcolumns`` / ``self.rcolumns`` slices
        (renamed to ``self.columns``) act as left and right tables.
        Feature vectors are extracted with ``self.feature_table``, missing
        values are replaced by ``impute_value`` and ``self.model`` scores
        the result.

        Side effects: stores the feature table in
        ``self.exctracted_features`` and the prediction table in
        ``self.predictions``.

        Returns:
            The ``match_score`` column of the prediction table as an array.
        """
        dataset = dataset.copy()
        with io.capture_output() as captured:
            row_numbers = np.arange(dataset.shape[0])
            dataset['id'] = dataset['left_id'] = dataset['right_id'] = row_numbers

            left_table = dataset[self.lcolumns].copy()
            left_table.columns = self.columns
            right_table = dataset[self.rcolumns].copy()
            right_table.columns = self.columns

            # Register keys and table relationships in the em catalog.
            em.set_key(dataset, 'id')
            em.set_key(left_table, 'id')
            em.set_key(right_table, 'id')
            em.set_ltable(dataset, left_table)
            em.set_rtable(dataset, right_table)
            em.set_fk_ltable(dataset, 'left_id')
            em.set_fk_rtable(dataset, 'right_id')

            raw_features = em.extract_feature_vecs(dataset, feature_table=self.feature_table)
            self.exctracted_features = raw_features.fillna(impute_value)

            # Only exclude attributes that actually exist in the feature table.
            present_excludes = list(
                set(self.exclude_attrs).intersection(self.exctracted_features.columns))
            self.predictions = self.model.predict(
                table=self.exctracted_features,
                exclude_attrs=present_excludes,
                return_probs=True,
                target_attr='pred',
                probs_attr='match_score',
                append=True)

        # Drop the working copy and the captured output, then collect.
        del dataset
        del captured
        gc.collect()
        return self.predictions['match_score'].values
Ejemplo n.º 4
0
def extract_features_auto(ltable_df, rtable_df, candset_df):
    """Extract automatically generated feature vectors for a candidate set.

    Args:
        ltable_df: Left table, used to generate the matching features.
        rtable_df: Right table, used to generate the matching features.
        candset_df: Candidate pairs; its ``gold`` column is carried over
            into the result.

    Returns:
        The feature-vector table with missing values replaced by 0.
    """
    generated = em.get_features_for_matching(
        ltable_df, rtable_df, validate_inferred_attr_types=False)
    # Features computed on the 'id' attribute are dropped.
    non_id_features = generated[generated.left_attribute != 'id']

    print("\n\nExtracting the full set of features:")
    vectors = em.extract_feature_vecs(candset_df,
                                      feature_table=non_id_features,
                                      attrs_after='gold',
                                      show_progress=True)
    # Filled in place so the returned object is the one em produced.
    vectors.fillna(value=0, inplace=True)
    return vectors
Ejemplo n.º 5
0
def get_feature_vectors(C, feature_table, attrs_before=None, attrs_after=None):
    """Extract feature vectors for candidate set ``C`` and zero-fill NaNs.

    Args:
        C: Candidate set handed to ``em.extract_feature_vecs``.
        feature_table: Feature table describing the features to compute.
        attrs_before: Forwarded unchanged to ``em.extract_feature_vecs``.
        attrs_after: Forwarded unchanged to ``em.extract_feature_vecs``.

    Returns:
        The extracted feature-vector table with every NaN replaced by 0.
    """
    vectors = em.extract_feature_vecs(
        C,
        feature_table=feature_table,
        attrs_before=attrs_before,
        attrs_after=attrs_after,
        show_progress=True,
        n_jobs=-1,
    )
    # In-place fill keeps the very table object returned by em.
    vectors.fillna(0, inplace=True)
    return vectors
Ejemplo n.º 6
0
def predict_matching_tuples(A, B, C, G):
    """Predict matches in ``C`` with a logistic-regression matcher.

    Half of the labelled set ``G`` (fixed ``random_state=0``) is used to
    train the matcher on features generated from ``A`` and ``B``.  The
    matcher then labels every pair in ``C``, and the ``l_id`` / ``r_id``
    columns of the pairs predicted as 1 are written to
    ``FOLDER + 'predictedMatchedIDs.csv'``.  Nothing is returned.
    """
    # Training half of the labelled data.
    training_pairs = em.split_train_test(
        G, train_proportion=0.5, random_state=0)['train']

    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    # Training vectors, with missing values replaced by column means.
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    train_vectors = em.extract_feature_vecs(training_pairs,
                                            feature_table=feature_table,
                                            attrs_after='label',
                                            show_progress=False)
    train_vectors = em.impute_table(train_vectors,
                                    exclude_attrs=excluded_attributes,
                                    strategy='mean')

    log_reg = em.LogRegMatcher(name='LogReg', random_state=0)
    log_reg.fit(table=train_vectors,
                exclude_attrs=excluded_attributes,
                target_attr='label')

    # The candidate set has no 'label' column, hence the shorter exclude list.
    candidate_vectors = em.extract_feature_vecs(
        C, feature_table=feature_table, show_progress=False)
    candidate_vectors = em.impute_table(
        candidate_vectors,
        exclude_attrs=['_id', 'l_id', 'r_id'],
        strategy='mean')

    scored = log_reg.predict(table=candidate_vectors,
                             exclude_attrs=['_id', 'l_id', 'r_id'],
                             append=True,
                             target_attr='predicted',
                             inplace=False,
                             return_probs=False,
                             probs_attr='proba')

    # Persist the matched id pairs so training need not be repeated.
    is_match = scored.predicted == 1
    scored[is_match][['l_id', 'r_id']].to_csv(
        FOLDER + 'predictedMatchedIDs.csv', index=False)
Ejemplo n.º 7
0
def extract_features(ltable_df, rtable_df, candset_df):
    """Build a filtered feature table and extract vectors for a candidate set.

    Attribute types of corresponding columns are first reconciled: when
    the two sides disagree, both are set to the type with the higher rank
    in ``type_rank``.  Features on the ``id`` attribute and features
    whose name contains one of the excluded function tags are removed
    before extraction.

    Returns:
        The feature-vector table for ``candset_df`` (including its ``gold``
        column) with missing values replaced by 0.
    """
    tokenizers = em.get_tokenizers_for_matching()
    sim_functions = em.get_sim_funs_for_matching()
    left_attr_types = em.get_attr_types(ltable_df)
    right_attr_types = em.get_attr_types(rtable_df)
    correspondences = em.get_attr_corres(ltable_df, rtable_df)

    type_rank = {
        'boolean': 1,
        'numeric': 2,
        'str_eq_1w': 3,
        'str_bt_1w_5w': 4,
        'str_bt_5w_10w': 5,
        'str_gt_10w': 6,
        'un_determined': 7,
    }
    for pair in correspondences['corres']:
        l_attr, r_attr = pair[0], pair[1]
        l_type = left_attr_types[l_attr]
        r_type = right_attr_types[r_attr]
        if l_type == r_type:
            continue
        # Promote the lower-ranked side to the higher-ranked type.
        if type_rank[l_type] < type_rank[r_type]:
            left_attr_types[l_attr] = r_type
        else:
            right_attr_types[r_attr] = l_type

    feature_records = get_features(ltable_df, rtable_df, left_attr_types,
                                   right_attr_types, correspondences,
                                   tokenizers, sim_functions)
    # Features computed on the 'id' attribute are dropped.
    feature_records = feature_records[feature_records.left_attribute != 'id']
    feature_records.reset_index(inplace=True, drop=True)

    # Distance functions followed by non-normalised similarity functions;
    # a feature is dropped when its name contains any of these tags.
    excluded_tags = ["lev_dist", "rdf"] + ["aff", "sw", "swn", "nmw"]
    keep_mask = [
        not any(tag in feature_records.loc[row, "feature_name"]
                for tag in excluded_tags)
        for row in range(feature_records.shape[0])
    ]
    feature_records = feature_records.loc[keep_mask, :]

    print("\n\nExtracting the full set of features:")
    vectors = em.extract_feature_vecs(candset_df,
                                      feature_table=feature_records,
                                      attrs_after='gold',
                                      show_progress=True,
                                      n_jobs=-1)
    vectors.fillna(value=0, inplace=True)
    return vectors
Ejemplo n.º 8
0
                         key='_id',
                         ltable=A, rtable=B, 
                         fk_ltable='ltable_id', fk_rtable='rtable_id')
# Report how many labelled pairs were loaded into G.
print('Length is ' + str(len(G)))


# Random-forest matcher with a fixed seed.
rf = em.RFMatcher(name='RF', random_state=0)


# Automatically generated matching features for tables A and B.
feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# Remove the id comparisons
# NOTE(review): assumes the id-based features are exactly the rows labelled
# 0-3 of the generated table; confirm against the actual feature table.
feature_table = feature_table.drop([0, 1, 2, 3], axis=0)

# Feature vectors for the labelled pairs; the 'gold' column is carried over.
H = em.extract_feature_vecs(G, 
                            feature_table=feature_table, 
                            attrs_after='gold',
                            show_progress=False)

def RS(proba, batch_size):
    """Random sampling: draw ``batch_size`` distinct row indices of ``proba``.

    Only the number of rows of ``proba`` is used; indices are drawn
    without replacement from NumPy's global random state.
    """
    candidate_rows = np.arange(proba.shape[0])
    return np.random.choice(candidate_rows, size=batch_size, replace=False)

def LC(proba, batch_size):
    """Least confidence: indices of the rows with the smallest top probability.

    Rows of ``proba`` are ranked by their maximum value in ascending
    order and the first ``batch_size`` row indices are returned.
    """
    top_probability = np.max(proba, axis=1)
    ascending = np.argsort(top_probability)
    return ascending[:batch_size]

def BT(proba, batch_size):
    """Breaking ties: indices of the rows with the smallest top-two margin.

    For each row the difference between its largest and second-largest
    value is computed; the ``batch_size`` rows with the smallest
    difference are returned, smallest first.
    """
    ranked = np.sort(proba, axis=1)
    best, runner_up = ranked[:, -1], ranked[:, -2]
    margins = best - runner_up
    return np.argsort(margins)[:batch_size]

# Feature vectors for the labelled pairs in G, keeping the 'gold' column.
# NOTE(review): this call takes the same arguments as the extraction that
# produces `H` earlier in this script; confirm both tables are needed.
Features = em.extract_feature_vecs(G, feature_table=feature_table,
                            attrs_after='gold', show_progress=False)
Ejemplo n.º 9
0
import logging
logging.basicConfig(level=logging.INFO)

# Load the two (identical) input tables and the candidate set, reporting
# system memory in use before and after.
print("Mem. usage before reading:{0} (GB)".format(
    psutil.virtual_memory().used / 1e9))
A = em.read_csv_metadata('../datasets/sample_msd_100k.csv', key='id')
B = em.read_csv_metadata('../datasets/sample_msd_100k.csv', key='id')
C = em.read_csv_metadata('../datasets/candset_msd_300k.csv',
                         key='_id',
                         ltable=A,
                         rtable=B,
                         fk_ltable='l_id',
                         fk_rtable='r_id')
print("Mem. usage after reading:{0} (GB)".format(psutil.virtual_memory().used /
                                                 1e9))
len(C)

feature_table = em.get_features_for_matching(A, B)

# Baseline taken immediately before feature-vector extraction, so the
# reported differences cover only that call.  (An earlier baseline taken
# before feature generation was overwritten here without ever being read
# and has been removed.)
memUsageBefore = psutil.virtual_memory().used / 1e9
timeBefore = time.time()
feature_vecs = em.extract_feature_vecs(C, feature_table=feature_table)
timeAfter = time.time()
memUsageAfter = psutil.virtual_memory().used / 1e9

# NOTE: the first figure is labelled "after reading" but is measured after
# feature generation as well (see the baseline above).
print(
    'Mem.usage (after reading): {0} (GB), Mem.usage (after extract featvecs): {1} (GB), diff: {2} (GB)'
    .format(memUsageBefore, memUsageAfter, memUsageAfter - memUsageBefore))
print('Time. diff: {0} (secs)'.format(timeAfter - timeBefore))
Ejemplo n.º 10
0
def automatic_feature_gen(candidate_table, feature_cols, id_names,
                          id_names_phrase):
    '''
    Generate automatic pairwise matching features for a candidate table.

    The candidate table holds the left-hand and right-hand records side by
    side.  It is split into two sub-tables, whose feature columns are
    renamed to the SAME names by removing the ``id_names_phrase`` portion
    (suffix or prefix, applied as a regular expression) from each column
    name, so that the automatic feature generator can pair them up.  The id
    columns themselves are left untouched.

    Each candidate row is keyed by a running number rather than by its
    product id, because the same id can occur in several candidate pairs.

    NB! This function clears the py_entitymatching catalog
    (``em.del_catalog()``) before doing anything else.

    Inputs:
            candidate_table: single Pandas DataFrame (typically output of
                blocking_algorithms.py functions)
            feature_cols: pair of lists; ``feature_cols[0]`` / ``[1]`` are
                the left-hand / right-hand feature column names
            id_names: pair of id column names (left, right), e.g.
                ``["id_amzn", "id_g"]``
            id_names_phrase: pair of regex patterns (left, right) removed
                from the feature column names

    Outputs:
            DataFrame of feature vectors, one row per candidate pair, with
            missing feature values imputed by the column mean (remaining
            NaNs set to 0) and the two id columns named in ``id_names``
            added back.
    '''

    em.del_catalog()
    candidate_table = candidate_table.reset_index()

    lhs_id, rhs_id = id_names[0], id_names[1]

    # Explicit copies: the sub-tables are renamed and extended below, which
    # must not write through to (or warn about) a slice of candidate_table.
    lhs_table = candidate_table.loc[:, feature_cols[0] + [lhs_id]].copy()
    rhs_table = candidate_table.loc[:, feature_cols[1] + [rhs_id]].copy()

    # Strip the side-specific phrase from every column except the id column.
    lhs_table.columns = [
        name if name == lhs_id else re.sub(id_names_phrase[0], "", name)
        for name in lhs_table
    ]
    rhs_table.columns = [
        name if name == rhs_id else re.sub(id_names_phrase[1], "", name)
        for name in rhs_table
    ]

    # To circumvent the same product ID coming up again (due to it being in
    # multiple candidate comparisons), key each row by its position.
    lhs_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    rhs_table["index_num_rhs"] = np.arange(rhs_table.shape[0])

    em.set_key(lhs_table, "index_num_lhs")  # changed from id_names
    em.set_key(rhs_table, "index_num_rhs")

    # Generate the list of features; the id columns are excluded so that no
    # id-based features are created.
    matching_features = em.get_features_for_matching(
        lhs_table.drop(lhs_id, axis=1),
        rhs_table.drop(rhs_id, axis=1),
        validate_inferred_attr_types=False)

    # Set primary and foreign keys on the candidate table.
    candidate_table["index"] = np.arange(candidate_table.shape[0])
    candidate_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    candidate_table["index_num_rhs"] = np.arange(rhs_table.shape[0])

    em.set_key(candidate_table, "index")
    em.set_fk_ltable(candidate_table, "index_num_lhs")
    em.set_fk_rtable(candidate_table, "index_num_rhs")
    em.set_ltable(candidate_table, lhs_table)
    em.set_rtable(candidate_table, rhs_table)

    # Extract feature vectors and impute missing values with column means.
    matching_features_df = em.extract_feature_vecs(
        candidate_table, feature_table=matching_features, show_progress=False)

    matching_features_df = em.impute_table(
        matching_features_df,
        exclude_attrs=['index', "index_num_lhs", "index_num_rhs"],
        strategy='mean')

    # Add back the original id columns.  These were previously hard-coded
    # as "id_amzn" / "id_g"; using id_names gives the same result for those
    # names and also works for other datasets.
    matching_features_df[lhs_id] = candidate_table[lhs_id]
    matching_features_df[rhs_id] = candidate_table[rhs_id]

    # Any NaN still left after mean imputation becomes 0.
    matching_features_df = matching_features_df.fillna(value=0)

    return matching_features_df
Ejemplo n.º 11
0
# Drop the features at index labels 13-16 from the feature table, in place.
match_f.drop([13, 14, 15, 16], inplace=True)

# In[114]:

# List the names of the features that remain
match_f['feature_name']

# Converting the development set to feature vectors
# ------------------

# In[116]:

# Convert I into a set of feature vectors using the feature table match_f,
# carrying the 'gold_labels' column over into the result

H = em.extract_feature_vecs(I,
                            feature_table=match_f,
                            attrs_after=['gold_labels'])

# In[117]:

## Display first three rows
H.head(3)

# Selecting the best matcher using cross-validation
# ------------------

# Now, we select the best matcher using k-fold cross-validation.
# For the purposes of this guide, we use ten-fold cross-validation with the 'precision' and 'recall' metrics to select the best matcher.

# In[120]:
def workflow(path_A, path_B, path_labeled):
    """Block two tables, train a random forest and return predicted matches.

    Args:
        path_A: CSV path of the left table (key column ``ID``).
        path_B: CSV path of the right table (key column ``ID``).
        path_labeled: CSV path of the labelled pairs (key ``_id``, foreign
            keys ``ltable_ID`` / ``rtable_ID``, label column ``gold``).

    Returns:
        The rows of the prediction table, restricted to the columns
        ``_id``, ``ltable_ID``, ``rtable_ID`` and ``predicted``, whose
        ``predicted`` value equals 1.
    """
    table_a = em.read_csv_metadata(path_A, key='ID')
    table_b = em.read_csv_metadata(path_B, key='ID')

    # --- Blocking, step 1: keep pairs with an identical 'Brand'. ---------
    carried_attrs = [
        'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
        'Hard Drive Capacity', 'Processor Type',
        'Processor Speed', 'Operating System',
        'Clean Name'
    ]
    brand_blocker = em.AttrEquivalenceBlocker()
    same_brand = brand_blocker.block_tables(
        table_a,
        table_b,
        'Brand',
        'Brand',
        l_output_attrs=list(carried_attrs),
        r_output_attrs=list(carried_attrs))

    # --- Blocking, step 2: rule on the 3-gram Jaccard score of Clean Name.
    blocking_features = em.get_features_for_blocking(
        table_a, table_b, validate_inferred_attr_types=False)
    name_blocker = em.RuleBasedBlocker()
    name_blocker.add_rule(
        ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'],
        blocking_features)
    similar_name = name_blocker.block_candset(same_brand)

    # --- Blocking, step 3: black-box comparison of screen size, RAM and
    # hard drive capacity.
    spec_blocker = em.BlackBoxBlocker()
    spec_blocker.set_black_box_function(screen_ram_hd_equal)
    candidates = spec_blocker.block_candset(similar_name)

    # --- Labelled data and features. --------------------------------------
    labelled = em.read_csv_metadata(path_labeled,
                                    key='_id',
                                    ltable=table_a,
                                    rtable=table_b,
                                    fk_ltable='ltable_ID',
                                    fk_rtable='rtable_ID')

    all_features = em.get_features_for_matching(
        table_a, table_b, validate_inferred_attr_types=False)
    # Keep rows 4-9 and 40 onwards of the generated table, then append the
    # custom 'refurbished' feature.
    selected_rows = np.r_[4:10, 40:len(all_features)]
    feature_subset = all_features.iloc[selected_rows, :]
    em.add_blackbox_feature(feature_subset, 'refurbished', refurbished)

    # --- Training on the labelled pairs (mean-imputed vectors). -----------
    dev_vectors = em.extract_feature_vecs(labelled,
                                          feature_table=feature_subset,
                                          attrs_after='gold')
    dev_vectors = em.impute_table(
        dev_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
        strategy='mean')

    forest = em.RFMatcher(name='RF')
    forest.fit(table=dev_vectors,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
               target_attr='gold')

    # --- Prediction on the blocked candidate set. -------------------------
    candidate_vectors = em.extract_feature_vecs(candidates,
                                                feature_table=feature_subset)
    candidate_vectors = em.impute_table(
        candidate_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        strategy='mean')

    scored = forest.predict(
        table=candidate_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        append=True,
        target_attr='predicted',
        inplace=False)
    scored = scored.loc[:, ['_id', 'ltable_ID', 'rtable_ID', 'predicted']]

    return scored[scored['predicted'] == 1]
Ejemplo n.º 13
0
# Split the labelled set G 60/40 into a development set I and a test set J.
IJ = em.split_train_test(G, train_proportion=0.6, random_state=0)
I = IJ['train']
J = IJ['test']

# Candidate matchers compared by cross-validation below.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
rf = em.RFMatcher(name='Random Forest', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
nb = em.NBMatcher(name='Naive Bayes')
lg = em.LogRegMatcher(name='Logistic Reg', random_state=0)
ln = em.LinRegMatcher(name='Linear Reg')

# Automatically generated matching features for tables A and B.
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# Feature vectors for the development set, keeping the gold labels.
H = em.extract_feature_vecs(
    I,
    feature_table=F,
    attrs_after='gold_labels',
    show_progress=False,
)

# Replace missing feature values by the column mean.
H = em.impute_table(
    H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
    strategy='mean',
)

# print(any(pd.notnull(H)))

# 5-fold cross-validation; the best matcher is chosen by F1.
result = em.select_matcher(
    [dt, rf, svm, nb, lg, ln],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
    k=5,
    target_attr='gold_labels',
    metric_to_select_matcher='f1',
    random_state=0,
)

print(result['cv_stats'])
Ejemplo n.º 14
0
# Initialise the candidate ML matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')

# Generate the matching features for tables A and B
features = em.get_features_for_matching(A,
                                        B,
                                        validate_inferred_attr_types=False)

# Extract feature vectors for the training set, keeping the 'label' column
H = em.extract_feature_vecs(train_set,
                            feature_table=features,
                            attrs_after='label',
                            show_progress=False)

H.head()

# Check whether any value is missing.
# The previous expression, any(pd.notnull(H)), iterated over the column
# labels of the boolean frame rather than its values, and tested for
# non-null instead of null, so it could not detect missing values.
pd.isnull(H).values.any()

# Missing values are replaced by the column mean with impute_table; the key
# columns and the label are excluded from imputation.
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                    strategy='mean')

# Running select matcher step to run all possible algos and pick the best ML
result = em.select_matcher(
    [dt, rf, svm, ln, lg],
Ejemplo n.º 15
0
                             ltable=metacriticData,
                             rtable=wikiData,
                             fk_ltable="ltable_ID",
                             fk_rtable="rtable_ID")
    print("Reading I and J from files")
# Report the sizes of the development set I and the evaluation set J.
print(len(I))
print(len(J))

# Generate the set of matching features for the two source tables
F = em.get_features_for_matching(metacriticData,
                                 wikiData,
                                 validate_inferred_attr_types=False)

# Convert I into a set of feature vectors using F, keeping the 'label' column
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)

# Create the learners; the same seed is passed to every matcher that is
# given a random_state here
random_state = 0

dt = em.DTMatcher(name='DecisionTree', random_state=random_state)
rf = em.RFMatcher(name='RF', random_state=random_state)
svm = em.SVMMatcher(name='SVM', random_state=random_state)
ln = em.LinRegMatcher(name='LinReg')
lg = em.LogRegMatcher(name='LogReg', random_state=random_state)
nb = em.NBMatcher(name='NaiveBayes')

# Impute feature vectors with the mean of the column values.
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
# 
# First, we obtain all the features we could use for matching. Ft is our feature table

# In[260]:


# Ft: automatically generated matching features for tables A and B.
Ft = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)


# Generate feature vectors from set I using Ft; the result is called H

# In[261]:


H = em.extract_feature_vecs(I, 
                            feature_table=Ft, 
                            attrs_after='label',
                            show_progress=False)


# Replace missing feature values by the column mean before the
# cross-validation that follows; key columns and the label are excluded

# In[262]:


H = em.impute_table(H, 
                exclude_attrs=['_id', 'ltable_Id', 'rtable_Id', 'label'],
                strategy='mean')


# In[161]:
Ejemplo n.º 17
0
# Prepare the classifiers to compare
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', kernel='linear', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

# Generate the matching features; requires tables A and B to be loaded
feature_table = em.get_features_for_matching(
    A, B, validate_inferred_attr_types=False)

print(feature_table.feature_name)

# Feature vectors for set I, keeping the 'label' column; missing values
# are replaced by 0 in place
H = em.extract_feature_vecs(I,
                            feature_table=feature_table,
                            attrs_after='label',
                            show_progress=False)
H.fillna(value=0, inplace=True)

# Select the best matcher with 5-fold cross-validation,
# ranking the matchers by precision
result = em.select_matcher(
    [dt, svm, rf, lg, ln, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
    k=5,
    target_attr='label',
    metric_to_select_matcher='precision',
    random_state=0)

# recall
Ejemplo n.º 18
0
]

# Attribute types of the two tables, as inferred by em.
l_attr_types = em.get_attr_types(kaggle_data)
r_attr_types = em.get_attr_types(imdb_data)

# Tokenizers and similarity functions used to build the features.
tok = em.get_tokenizers_for_matching()
sim = em.get_sim_funs_for_matching()

# Feature table built from the attribute types and correspondences.
F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim)


# Given the set of desired features **F**, we can now calculate the feature values for our training data and impute its missing values. Here, missing values are replaced with the mean of the column.

# In[28]:

train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False) 
train_features = em.impute_table(train_features,  exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean')


# Using the calculated features, we can evaluate the performance of different machine learning algorithms and select the best one for our matching task.

# In[29]:

# NOTE(review): the selection metric is passed as `metric=` here; confirm
# that the installed py_entitymatching version accepts this keyword name.
result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features, 
                           exclude_attrs=['_id', 'l_id', 'r_id', 'label'], k=5,
                           target_attr='label', metric='f1', random_state=0)
result['cv_stats']


# Based on the cross-validation scores reported for the different techniques, the "random forest (RF)" algorithm achieves the best performance. Thus, it is the technique we use for the matching.
Ejemplo n.º 19
0
                                 B.iloc[:, 1:8],
                                 validate_inferred_attr_types=False)

# Create a custom feature: the Levenshtein similarity between the
# whitespace-tokenized value of (price + rating) on each side
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """lev_sim(wspace(float(ltuple['price']) + float(ltuple['rating'])), 
                            wspace(float(rtuple['price']) + float(rtuple['rating'])))"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Add the custom feature to the feature table F
em.add_feature(F, 'lev_ws_price+rating', feature)

# Convert the sample set into a set of feature vectors using F
# NOTE(review): the label column is named 'labe' throughout this script;
# confirm this matches the column name in G.
H = em.extract_feature_vecs(G,
                            feature_table=F,
                            attrs_after='labe',
                            show_progress=False)

# Impute missing values with the column mean
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'labe'],
                    strategy='mean')

# Fit a Naive Bayes matcher
matcher = em.NBMatcher(name='NaiveBayes')
matcher.fit(table=H,
            exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'labe'],
            target_attr='labe')

# Extract feature vectors for the whole candidate set C, to which the
# matcher is then applied
Ht = em.extract_feature_vecs(C, feature_table=F, show_progress=False)
                         fk_rtable='rtable_id')

print('Number of tuples in Labelled: ' + str(len(G)))

# Automatically generated matching features for tables A and B.
feature_table = em.get_features_for_matching(
    A, B, validate_inferred_attr_types=False)

# Raw attributes copied into the feature-vector table for reference.
attrs_from_table = [
    'ltable_name', 'ltable_addr', 'ltable_city', 'ltable_phone',
    'rtable_name', 'rtable_addr', 'rtable_city', 'rtable_phone',
]

# Feature vectors for the labelled pairs, keeping the 'gold' column.
H = em.extract_feature_vecs(
    G,
    feature_table=feature_table,
    attrs_before=attrs_from_table,
    attrs_after='gold',
    show_progress=False,
)

rf = em.RFMatcher()

# Everything that is not a feature: keys, the label and the raw attributes.
attrs_to_be_excluded = (
    ['_id', 'ltable_id', 'rtable_id', 'gold'] + attrs_from_table
)

rf.fit(table=H, exclude_attrs=attrs_to_be_excluded, target_attr='gold')

L = em.extract_feature_vecs(C,
                            feature_table=feature_table,
                            attrs_before=attrs_from_table,
                            show_progress=False,
Ejemplo n.º 21
0
def main():
    """Compare six learning-based matchers on the labelled pairs.

    The labelled pairs in ``G.csv`` are split in half. The training half is
    saved as ``I.csv`` and used for 10-fold matcher selection and for fitting
    every matcher; the other half is saved as ``J.csv`` and used to print an
    evaluation summary per matcher.
    """
    # Source tables and the labelled pairs that reference them.
    imdb = em.read_csv_metadata(FOLDER + 'A.csv', key='id')
    tmdb = em.read_csv_metadata(FOLDER + 'B.csv', key='id')
    labeled = em.read_csv_metadata(FOLDER + 'G.csv',
                                   key='_id',
                                   ltable=imdb,
                                   rtable=tmdb,
                                   fk_ltable='l_id',
                                   fk_rtable='r_id')

    # Deterministic 50/50 split; both halves are persisted for later stages.
    halves = em.split_train_test(labeled, train_proportion=0.5, random_state=0)
    train_pairs, test_pairs = halves['train'], halves['test']
    train_pairs.to_csv(FOLDER + 'I.csv', index=False)
    test_pairs.to_csv(FOLDER + 'J.csv', index=False)

    # Automatically generated feature set shared by training and testing.
    features = em.get_features_for_matching(
        imdb, tmdb, validate_inferred_attr_types=False)

    # Key, foreign-key and label columns never take part in learning.
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']

    # Training feature vectors, with gaps replaced by the column mean.
    train_vecs = em.extract_feature_vecs(train_pairs,
                                         feature_table=features,
                                         attrs_after='label',
                                         show_progress=False)
    train_vecs = em.impute_table(train_vecs,
                                 exclude_attrs=excluded_attributes,
                                 strategy='mean')

    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    # Cross-validated comparison, ranked by F1.
    CV_result = em.select_matcher([dt, rf, svm, ln, lg, nb],
                                  table=train_vecs,
                                  exclude_attrs=excluded_attributes,
                                  k=10,
                                  target_attr='label',
                                  metric_to_select_matcher='f1',
                                  random_state=0)
    print(CV_result['cv_stats'])  # RF is the best matcher

    # The same order is used for fitting, predicting and reporting.
    matchers = [dt, rf, svm, lg, ln, nb]
    for matcher in matchers:
        matcher.fit(table=train_vecs,
                    exclude_attrs=excluded_attributes,
                    target_attr='label')

    # Held-out feature vectors, imputed the same way as the training ones.
    test_vecs = em.extract_feature_vecs(test_pairs,
                                        feature_table=features,
                                        attrs_after='label',
                                        show_progress=False)
    test_vecs = em.impute_table(test_vecs,
                                exclude_attrs=excluded_attributes,
                                strategy='mean')

    # Each matcher appends its own 'predicted' column to a copy of the table.
    predicted_tables = [
        matcher.predict(table=test_vecs,
                        exclude_attrs=excluded_attributes,
                        append=True,
                        target_attr='predicted',
                        inplace=False,
                        return_probs=False,
                        probs_attr='proba') for matcher in matchers
    ]

    # Print one evaluation summary per matcher.
    for predicted in predicted_tables:
        summary = em.eval_matches(predicted, 'label', 'predicted')
        em.print_eval_summary(summary)
Ejemplo n.º 22
0
def main():
    """Match the candidate pairs and write the merged records to ``E.csv``.

    A logistic-regression matcher is trained on the training half of the
    labelled pairs (``G.csv``), applied to the candidate pairs (``C.csv``),
    and every predicted match is used to enrich the corresponding row of
    table A with values from table B. The enriched rows of A are written to
    ``E.csv``.
    """
    # Read in data files
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    G = em.read_csv_metadata(FOLDER + 'G.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # labeled data
    # Only the training half of the split is needed in this stage.
    I = em.split_train_test(G, train_proportion=0.5, random_state=0)['train']
    # Generate features set F
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    # Convert I to a set of feature vectors using F
    H = em.extract_feature_vecs(I,
                                feature_table=F,
                                attrs_after='label',
                                show_progress=False)
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    # Fill in missing values with column's average
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')
    # Create and train a logistic regression - the best matcher from stage3.
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')
    # Read in the candidate tuple pairs.
    C = em.read_csv_metadata(FOLDER + 'C.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # candidate pairs
    # The candidate vectors get no label column, so only the key and
    # foreign-key columns are excluded.
    candidate_excluded = ['_id', 'l_id', 'r_id']
    # Convert C into a set of features using F
    L = em.extract_feature_vecs(C, feature_table=F, show_progress=False)
    # Fill in missing values with column's average
    L = em.impute_table(L, exclude_attrs=candidate_excluded, strategy='mean')
    # Predict on L with trained matcher
    predictions = lg.predict(table=L,
                             exclude_attrs=candidate_excluded,
                             append=True,
                             target_attr='predicted',
                             inplace=False,
                             return_probs=False,
                             probs_attr='proba')
    # Output the merged table (Basically what matches).
    # We start with rows from A that matches.
    # We then merge value from B into A.
    matched_pairs = predictions[predictions.predicted == 1]
    left_ids = matched_pairs['l_id'].to_frame()
    left_ids.columns = ['id']
    merged = pd.merge(A, left_ids, on='id')
    merged.set_index('id', inplace=True)
    # Index B by id as well so its rows can be looked up with .loc below.
    B.set_index('id', inplace=True)
    # Ids of A that are never enriched from B.
    # NOTE(review): the reason these two ids are skipped is not visible here.
    black_list = {'a872', 'a987'}
    # Columns combined through merge_cell.
    list_columns = [
        'directors', 'writers', 'cast', 'genres', 'keywords', 'languages',
        'production_companies', 'production_countries'
    ]
    # Columns combined through merge_money.
    money_columns = ['budget', 'revenue']
    for pair in matched_pairs.itertuples():
        aid = pair.l_id
        bid = pair.r_id
        if aid in black_list:
            continue
        # Title: keep title from A, if title from B is not an exact matched
        # from A, append B's title to the alternative title field if B's title
        # is not already in A's alternative title.
        a_title = merged.loc[aid, 'title']
        b_title = B.loc[bid, 'title']
        if b_title != a_title:
            alt_titles = merged.loc[aid, 'alternative_titles']
            if pd.isnull(alt_titles):
                merged.loc[aid, 'alternative_titles'] = b_title
            elif b_title not in set(alt_titles.split(';')):
                merged.loc[aid, 'alternative_titles'] = (alt_titles + ';' +
                                                         b_title)
        for col in list_columns:
            merged.loc[aid, col] = merge_cell(merged.loc[aid, col],
                                              B.loc[bid, col])
        # Content rating: keep A
        # Release year: keep A
        # Opening_weekend_revenue: keep A
        # Run time: truncated average of both sources.
        m_runtime = int(
            (merged.loc[aid, 'run_time'] + B.loc[bid, 'run_time']) / 2)
        merged.loc[aid, 'run_time'] = m_runtime
        # Budget and Revenue
        for col in money_columns:
            merged.loc[aid, col] = merge_money(merged.loc[aid, col],
                                               B.loc[bid, col])
        # Rating: take the average after converting B rating to scale 10.
        m_rating = (merged.loc[aid, 'rating'] + 0.1 * B.loc[bid, 'rating']) / 2
        merged.loc[aid, 'rating'] = m_rating
    merged.to_csv(FOLDER + 'E.csv', index=True)
def run_magellan(train_set,
                 valid_set,
                 test_set,
                 feature_combinations,
                 classifiers,
                 experiment_name,
                 write_test_set_for_inspection=False):
    train_path = os.path.dirname(train_set)
    train_file = os.path.basename(train_set)
    test_path = os.path.dirname(test_set)
    test_file = os.path.basename(test_set)
    report_train_name = train_file.replace('.csv', '')
    report_test_name = test_file.replace('.csv', '')

    train_set_left = train_file.replace('pairs', 'left')
    train_set_right = train_file.replace('pairs', 'right')

    test_set_left = test_file.replace('pairs', 'left')
    test_set_right = test_file.replace('pairs', 'right')

    os.makedirs(os.path.dirname(
        '../../../reports/magellan/{}/'.format(experiment_name)),
                exist_ok=True)

    try:
        os.remove('../../../reports/magellan/{}/{}_{}.csv'.format(
            experiment_name, report_train_name, report_test_name))
    except OSError:
        pass

    with open(
            '../../../reports/magellan/{}/{}_{}.csv'.format(
                experiment_name, report_train_name, report_test_name),
            "w") as f:
        f.write(
            'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n'
        )

    for run in range(1, 4):
        for feature_combination in feature_combinations:

            A_t = em.read_csv_metadata(train_path + '/' + train_set_left,
                                       key='mag_id')
            B_t = em.read_csv_metadata(train_path + '/' + train_set_right,
                                       key='mag_id')
            # Load the pre-labeled data
            S_t = em.read_csv_metadata(train_set,
                                       key='_id',
                                       ltable=A_t,
                                       rtable=B_t,
                                       fk_ltable='ltable_mag_id',
                                       fk_rtable='rtable_mag_id')

            A_gs = em.read_csv_metadata(test_path + '/' + test_set_left,
                                        key='mag_id')
            B_gs = em.read_csv_metadata(test_path + '/' + test_set_right,
                                        key='mag_id')
            # Load the pre-labeled data
            S_gs = em.read_csv_metadata(test_set,
                                        key='_id',
                                        ltable=A_gs,
                                        rtable=B_gs,
                                        fk_ltable='ltable_mag_id',
                                        fk_rtable='rtable_mag_id')

            A_t.fillna('', inplace=True)
            A_gs.fillna('', inplace=True)

            B_t.fillna('', inplace=True)
            B_gs.fillna('', inplace=True)

            S_t.fillna('', inplace=True)
            S_gs.fillna('', inplace=True)

            ## DIRTY FIX, CLEAN UP!
            if 'name' in A_t.columns:
                A_t["price"] = A_t["price"].replace(r'^\s*$',
                                                    np.nan,
                                                    regex=True)
                A_t["price"] = A_t["price"].astype('float64')
                A_gs["price"] = A_gs["price"].replace(r'^\s*$',
                                                      np.nan,
                                                      regex=True)
                A_gs["price"] = A_gs["price"].astype('float64')
                B_t["price"] = B_t["price"].replace(r'^\s*$',
                                                    np.nan,
                                                    regex=True)
                B_t["price"] = B_t["price"].astype('float64')
                B_gs["price"] = B_gs["price"].replace(r'^\s*$',
                                                      np.nan,
                                                      regex=True)
                B_gs["price"] = B_gs["price"].astype('float64')

                S_t["ltable_price"] = S_t["ltable_price"].replace(r'^\s*$',
                                                                  np.nan,
                                                                  regex=True)
                S_t["ltable_price"] = S_t["ltable_price"].astype('float64')
                S_t["rtable_price"] = S_t["rtable_price"].replace(r'^\s*$',
                                                                  np.nan,
                                                                  regex=True)
                S_t["rtable_price"] = S_t["rtable_price"].astype('float64')

                S_gs["ltable_price"] = S_gs["ltable_price"].replace(r'^\s*$',
                                                                    np.nan,
                                                                    regex=True)
                S_gs["ltable_price"] = S_gs["ltable_price"].astype('float64')
                S_gs["rtable_price"] = S_gs["rtable_price"].replace(r'^\s*$',
                                                                    np.nan,
                                                                    regex=True)
                S_gs["rtable_price"] = S_gs["rtable_price"].astype('float64')

            atypes1 = em.get_attr_types(A_t)
            atypes2 = em.get_attr_types(B_t)

            match_c = em.get_attr_corres(A_t, B_t)

            match_c['corres'] = []

            # select attributes to compare
            for feature in feature_combination:
                match_c['corres'].append((feature, feature))

            tok = em.get_tokenizers_for_matching()
            sim = em.get_sim_funs_for_matching()

            F_t = em.get_features(A_t, B_t, atypes1, atypes2, match_c, tok,
                                  sim)

            H_t = em.extract_feature_vecs(S_t,
                                          feature_table=F_t,
                                          attrs_after=['label', 'pair_id'],
                                          show_progress=False)
            H_gs = em.extract_feature_vecs(S_gs,
                                           feature_table=F_t,
                                           attrs_after='label',
                                           show_progress=False)

            H_t = H_t.fillna(-1)
            H_gs = H_gs.fillna(-1)

            validation_ids_df = pd.read_csv(valid_set)
            val_df = H_t[H_t['pair_id'].isin(
                validation_ids_df['pair_id'].values)]
            train_only_df = H_t[~H_t['pair_id'].
                                isin(validation_ids_df['pair_id'].values)]

            train_only_df = train_only_df.drop(columns='pair_id')
            val_df = val_df.drop(columns='pair_id')

            train_only_df = train_only_df.sample(frac=1, random_state=42)

            pos_neg = H_t['label'].value_counts()
            pos_neg = round(pos_neg[0] / pos_neg[1])

            train_ind = []
            val_ind = []

            for i in range(len(train_only_df) - 1):
                train_ind.append(-1)

            for i in range(len(val_df) - 1):
                val_ind.append(0)

            ps = PredefinedSplit(test_fold=np.concatenate((train_ind,
                                                           val_ind)))

            train_df = pd.concat([train_only_df, val_df])

            for k, v in classifiers.items():

                classifier = v['clf']
                if 'random_state' in classifier.get_params().keys():
                    classifier = classifier.set_params(**{'random_state': run})

                # add pos_neg ratio to XGBoost params
                if k == 'XGBoost':
                    v['params']['scale_pos_weight']: [1, pos_neg]

                model = RandomizedSearchCV(cv=ps,
                                           estimator=classifier,
                                           param_distributions=v['params'],
                                           random_state=42,
                                           n_jobs=4,
                                           scoring='f1',
                                           n_iter=500,
                                           pre_dispatch=8,
                                           return_train_score=True)

                feats_train = train_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = train_df['label']
                feats_gs = H_gs.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_gs = H_gs['label']

                try:
                    model.fit(feats_train, labels_train)
                except ValueError:
                    set_trace()

                parameters = model.best_params_

                score_names = [
                    'mean_train_score', 'std_train_score', 'mean_test_score',
                    'std_test_score'
                ]
                scores = {}
                score_string = ''
                for name in score_names:
                    scores[name] = model.cv_results_[name][model.best_index_]
                    score_string = score_string + name + ': ' + str(
                        scores[name]) + ' '

                feature_names = list(feats_train.columns)

                if k == 'LogisticRegression' or k == 'LinearSVC':
                    most_important_features = model.best_estimator_.coef_
                    word_importance = zip(feature_names,
                                          most_important_features[0].tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                if k == 'RandomForest' or k == 'DecisionTree':
                    most_important_features = model.best_estimator_.feature_importances_
                    word_importance = zip(feature_names,
                                          most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                if k == 'NaiveBayes':
                    word_importance = ''
                if k == 'XGBoost':
                    most_important_features = model.best_estimator_.feature_importances_
                    word_importance = zip(feature_names,
                                          most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)

                if k == 'LogisticRegression':
                    learner = LogisticRegression(random_state=run,
                                                 solver='liblinear',
                                                 **parameters)
                elif k == 'NaiveBayes':
                    learner = GaussianNB()
                elif k == 'DecisionTree':
                    learner = DecisionTreeClassifier(random_state=run,
                                                     **parameters)
                elif k == 'LinearSVC':
                    learner = LinearSVC(random_state=run,
                                        dual=False,
                                        **parameters)
                elif k == 'RandomForest':
                    learner = RandomForestClassifier(random_state=run,
                                                     n_jobs=4,
                                                     **parameters)
                elif k == 'XGBoost':
                    learner = xgb.XGBClassifier(random_state=run,
                                                n_jobs=4,
                                                **parameters)
                else:
                    print('Learner is not a valid option')
                    break

                model = learner
                feats_train = train_only_df.sample(frac=1, random_state=42)
                feats_train = train_only_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = train_only_df['label']

                start = time.time()
                model.fit(feats_train, labels_train)
                end = time.time()

                train_time = end - start

                start = time.time()
                preds_gs = model.predict(feats_gs)

                end = time.time()

                pred_time = end - start

                gs_report = classification_report(labels_gs,
                                                  preds_gs,
                                                  output_dict=True)

                feature_report = '+'.join(feature_combination)

                if write_test_set_for_inspection:

                    out_path = '../../../data/processed/wdc-lspc/inspection/{}/magellan/'.format(
                        experiment_name)
                    os.makedirs(os.path.dirname(out_path), exist_ok=True)

                    file_name = '_'.join([
                        os.path.basename(train_set),
                        os.path.basename(test_set), k, feature_report
                    ])
                    file_name = file_name.replace('.csv', '')
                    file_name += f'_{run}.pkl.gz'

                    test_inspection_df = S_gs.copy()
                    if k == 'LinearSVC':
                        proba_gs = model.decision_function(feats_gs).tolist()
                    else:
                        proba_gs = model.predict_proba(feats_gs).tolist()
                    test_inspection_df['pred'] = preds_gs
                    test_inspection_df['Class Prob'] = proba_gs
                    test_inspection_df.to_pickle(out_path + file_name,
                                                 compression='gzip')

                with open(
                        '../../../reports/magellan/{}/{}_{}.csv'.format(
                            experiment_name, report_train_name,
                            report_test_name), "a") as f:
                    f.write(feature_report + '#####' + k + '#####' +
                            str(scores['mean_train_score']) + '#####' +
                            str(scores['std_train_score']) + '#####' +
                            str(scores['mean_test_score']) + '#####' +
                            str(scores['std_test_score']) + '#####' +
                            str(gs_report['1']['precision']) + '#####' +
                            str(gs_report['1']['recall']) + '#####' +
                            str(gs_report['1']['f1-score']) + '#####' +
                            str(parameters) + '#####' + str(train_time) +
                            '#####' + str(pred_time) + '#####' +
                            str(word_importance[0:100]) + '#####' +
                            experiment_name + '#####' + report_train_name +
                            '#####' + report_test_name + '\n')
Ejemplo n.º 24
0
# Blank out the 'pages' and 'edition' attributes on both sides of the
# training (I) and test (J) pairs.
I['ltable_pages'] = ''
I['rtable_pages'] = ''
J = train_test['test']
J['ltable_edition'] = ''
J['rtable_edition'] = ''
J['ltable_pages'] = ''
J['rtable_pages'] = ''

# Save Set I
#em.to_csv_metadata(I, './TableI.csv')
# Save Set J
#em.to_csv_metadata(J, './TableJ.csv')

# Automatic feature generation
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
H = em.extract_feature_vecs(I, feature_table=F, attrs_after=['gold_labels'])
# Fill missing values
# NOTE(review): this fills gaps with the string 'NaN', not a number - confirm
# the matchers below accept that; a numeric fill may have been intended.
H.fillna(value='NaN', inplace=True)

# Create ML matchers
dt = em.DTMatcher(name='DecisionTree')
svm = em.SVMMatcher(name='SVM')
rf = em.RFMatcher(name='RandomForest')
lg = em.LogRegMatcher(name='LogisticRegression')
ln = em.LinRegMatcher(name='LinearRegression')
nb = em.NBMatcher(name='NaiveBayes')
# Select the best matcher
result = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
Ejemplo n.º 25
0
# Candidate matchers; the ones that accept a seed are seeded with 0.
# (dt, used in select_matcher below, is created before this excerpt starts.)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

# The features for matching

F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# List the names of the features generated
# (bare expression: only shows output in an interactive session)
F['feature_name']

#Extract Feature Vectors.
# 'Match' is the label column carried over from I.
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='Match',
                            show_progress=False)

# compare stats to select best Matcher
#Logistic Regression in our case

# 5-fold cross-validation ranked by F1; key, foreign-key and label columns
# are excluded from the features.
result = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'Match'],
    k=5,
    target_attr='Match',
    metric_to_select_matcher='f1',
    random_state=0)
# Bare expression again: displays the per-matcher statistics interactively.
result['cv_stats']
Ejemplo n.º 26
0
# Persist the blocked candidate set and read it back with its metadata
# re-attached to the two sampled source tables.
em.to_csv_metadata(D, 'datasets/tbl_blocked_8.csv')
tbl_blocked = em.read_csv_metadata('datasets/tbl_blocked_8.csv',
                                   ltable=sample_movies,
                                   rtable=sample_tracks)

# Draw 400 candidate pairs and save them.
S = em.sample_table(tbl_blocked, 400)
em.to_csv_metadata(S, 'datasets/sampled_8.csv')

# Write the last four columns of every sampled pair.
# The csv module expects text-mode files opened with newline='' on Python 3.
with open('metadata_8.csv', 'w', newline='') as data1:
    writer = csv.writer(data1, delimiter=',', quotechar='|')
    for entry in S.values:
        item = entry[-4:]
        writer.writerow(item)

# Generate features for the sampled pairs and dump the feature vectors.
match_f = em.get_features_for_matching(sample_movies, sample_tracks)
H = em.extract_feature_vecs(S, feature_table=match_f)
with open('data_8.csv', 'w', newline='') as data:
    writer = csv.writer(data, delimiter=',', quotechar='|')
    flag = 0
    names = []
    for idx, row in H.iterrows():
        item = []
        print(row)
        for it in row.items():
            # NOTE(review): names are collected for every row after the
            # first, so each column name is repeated once per remaining row;
            # kept as in the original - confirm whether collecting them once
            # was intended.
            if flag:
                names.append(it[0])
            item.append(it[1])
        flag = 1
        writer.writerow(item)
    print(names)
        'left_ram_frequency', 'left_hdd_capacity', 'left_ssd_capacity',
        'left_weight', 'left_dimensions', 'left_title', 'right_brand',
        'right_cpu_brand', 'right_cpu_model', 'right_cpu_type',
        'right_cpu_frequency', 'right_ram_capacity', 'right_ram_type',
        'right_ram_frequency', 'right_hdd_capacity', 'right_ssd_capacity',
        'right_weight', 'right_dimensions', 'right_title'
    ]
    attrs_to_be_excluded = []
    attrs_to_be_excluded.extend(
        ['_id', 'left_instance_id', 'right_instance_id'])
    attrs_to_be_excluded.extend(attrs_from_table)

    # Convert the cancidate set to feature vectors using the feature table
    L = em.extract_feature_vecs(C,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                show_progress=True,
                                n_jobs=-1)

    loaded_rf = joblib.load("random_forest.joblib")

    # Predict the matches
    predictions = loaded_rf.predict(
        table=L,
        exclude_attrs=attrs_to_be_excluded,
        append=True,
        target_attr='predicted',
        inplace=False,
    )

    # Prepare the output
Ejemplo n.º 28
0
# L = K1.copy()
# print(L.columns)
print('Loading labels...')
# Every candidate pair starts as a non-match.
L['gold'] = 0
# Pairs labelled as matches in the ground truth; copied so that adding the
# helper column below does not write into a slice of `exact`.
trues = exact[exact['gold'] == 1][['ltable.id', 'rtable.id']].copy()
# Composite pair key. The separator keeps distinct id pairs apart: without it
# ('1', '23') and ('12', '3') would both become '123'.
L['temp'] = L['ltable_id'].astype(str) + '|' + L['rtable_id'].astype(str)
trues['temp'] = (trues['ltable.id'].astype(str) + '|' +
                 trues['rtable.id'].astype(str))
# Flag the candidate pairs that appear among the true matches.
L.loc[L['temp'].isin(trues['temp']), ['gold']] = 1

# Split the labelled candidates 50/50 into development and evaluation sets.
development_evaluation = em.split_train_test(L, train_proportion=0.5)
development = development_evaluation['train']
evaluation = development_evaluation['test']

print('Creating feature vectors...')
train_feature_vectors = em.extract_feature_vecs(development,
                                                attrs_after='gold',
                                                feature_table=features)
test_feature_vectors = em.extract_feature_vecs(evaluation,
                                               attrs_after='gold',
                                               feature_table=features)

# Missing feature values are treated as 0.0.
train_feature_vectors = train_feature_vectors.fillna(0.0)
test_feature_vectors = test_feature_vectors.fillna(0.0)

print("tagged pairs:" + str(exact['gold'].value_counts()))

# Empty results frame.
df = pd.DataFrame(
    columns=['instance', 'candName', 'targName', 'conf', 'realConf'])
epoch = 1
# cands = list(exact['ltable.id'].unique())
# targs = list(exact['rtable.id'].unique())
Ejemplo n.º 29
0
# J_set = myset['test']
# em.to_csv_metadata(I_set, 'datasets/I_set.csv')
# em.to_csv_metadata(J_set, 'datasets/J_set.csv')

# creating feature for matching
# (built step by step: tokenizers, similarity functions, attribute types of
# both tables and their attribute correspondences feed get_features)
match_t = em.get_tokenizers_for_matching()
match_s = em.get_sim_funs_for_matching()
atypes1 = em.get_attr_types(sampled_movies)
atypes2 = em.get_attr_types(sampled_tracks)
match_c = em.get_attr_corres(sampled_movies, sampled_tracks)
match_f = em.get_features(sampled_movies, sampled_tracks, atypes1, atypes2,
                          match_c, match_t, match_s)

# generating feature vectors
# ('label' is carried over from dev_set as the target column)
H = em.extract_feature_vecs(dev_set,
                            feature_table=match_f,
                            attrs_after='label',
                            show_progress=False)

# filling missing values in feature vectors
H.fillna(value=0, inplace=True)

# creating a set of learning-based matchers
# (the ones that accept a seed are seeded with 0)
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

# Selecting the best matcher using cross-validation
Ejemplo n.º 30
0
def main():
    """Block two product tables, train a random-forest matcher, export predictions.

    Pipeline:
      1. Read ``ltable.csv`` and ``rtable.csv`` (ISO-8859-1 encoded).
      2. Build the candidate set ``C`` with an overlap blocker on ``title``
         (``overlap_size=1``).
      3. Sample 450 candidate pairs and label them through
         ``em.label_table`` into a ``label`` column.
      4. Extract feature vectors for the labeled sample and fit an
         ``RFMatcher`` on them.
      5. Extract feature vectors for every candidate pair, predict into a
         ``predicted`` column, and write ``./prediction2.csv`` with the
         columns ``id`` and ``label``.
    """
    A = em.read_csv_metadata('ltable.csv',
                             key="ltable_id",
                             encoding='ISO-8859-1')
    B = em.read_csv_metadata('rtable.csv',
                             key="rtable_id",
                             encoding='ISO-8859-1')

    # The same attributes are carried over from both input tables.
    output_attrs = ['title', 'category', 'brand', 'modelno', 'price']

    ob = em.OverlapBlocker()
    C = ob.block_tables(
        A,
        B,
        'title',
        'title',
        l_output_attrs=list(output_attrs),
        r_output_attrs=list(output_attrs),
        overlap_size=1,
        show_progress=False)
    S = em.sample_table(C, 450)

    # NOTE(review): this table is overwritten by em.label_table() below
    # before it is ever used, so train.csv currently has no effect on the
    # result. Confirm whether the pre-labeled file or the labeling step is
    # the intended source of training labels and drop the other one.
    G = em.read_csv_metadata("train.csv",
                             key='id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_id',
                             fk_rtable='rtable_id')
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    G = em.label_table(S, 'label')

    # Raw attribute columns carried through feature extraction; they are
    # not features, so they are excluded from fitting and predicting.
    attrs_from_table = (['ltable_' + attr for attr in output_attrs] +
                        ['rtable_' + attr for attr in output_attrs])
    # Key / foreign-key columns of the candidate set, also never features.
    id_attrs = ['_id', 'ltable_ltable_id', 'rtable_rtable_id']

    H = em.extract_feature_vecs(G,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                attrs_after='label',
                                show_progress=False)
    # Fill with numeric 0 (not the string '0') so that the numeric feature
    # columns keep a numeric dtype.
    H.fillna(0, inplace=True)

    rf = em.RFMatcher()
    rf.fit(table=H,
           exclude_attrs=id_attrs + ['label'] + attrs_from_table,
           target_attr='label')

    L = em.extract_feature_vecs(C,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                show_progress=False,
                                n_jobs=-1)
    # Treat missing values exactly as during training; the matcher should
    # not be handed NaN feature values at prediction time.
    L.fillna(0, inplace=True)

    predictions = rf.predict(table=L,
                             exclude_attrs=id_attrs + attrs_from_table,
                             append=True,
                             target_attr='predicted',
                             inplace=False)

    # predict() was asked to write its output to the 'predicted' column, and
    # the rows of `predictions` are the candidate pairs of C, keyed by '_id'.
    # NOTE(review): '_id' is the candidate-set key; confirm it is the id the
    # consumer of prediction2.csv expects.
    dataset = pd.DataFrame({
        'id': predictions['_id'],
        'label': predictions['predicted'],
    })
    dataset.to_csv("./prediction2.csv", index=False)