Example #1
def prep_data_pair_mallegan(dataset, use_description):
    """
    Preprocessing for MAGELLAN framework
    :param dataset: train or test dataset
    :return: combined MAGELLAN-ready dataset
    """

    # Preprocess pairs (A is left side of the pair, B is right side)
    A = dataset[['content_1']]
    # A = A.reset_index()
    A = A.rename(columns={'content_1': "title"})
    A["l_id"] = A.index
    em.set_key(A, 'l_id')

    B = dataset[['content_2']]
    # B = B.reset_index()
    B = B.rename(columns={'content_2': "title"})
    B["r_id"] = len(A) + B.index
    em.set_key(B, 'r_id')

    # join A and B to retrieve complete pairwise dataset
    G = A.join(B, lsuffix='_left', rsuffix='_right')
    G["label"] = dataset[["label"]]
    G["id"] = G.index
    em.set_key(G, 'id')
    em.set_ltable(G, A)
    em.set_rtable(G, B)
    em.set_fk_ltable(G, 'l_id')
    em.set_fk_rtable(G, 'r_id')

    return A, B, G
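
# A minimal usage sketch (not part of the original snippet): `train_df` and the calls below
# are illustrative, showing how the returned tables would typically feed into py_entitymatching.
A_train, B_train, G_train = prep_data_pair_mallegan(train_df, use_description=False)
feature_table = em.get_features_for_matching(A_train, B_train, validate_inferred_attr_types=False)
H = em.extract_feature_vecs(G_train, feature_table=feature_table, attrs_after='label', show_progress=False)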
Example #2
    def predict(self, dataset, impute_value=0):
        dataset = dataset.copy()
        with io.capture_output() as captured:
            dataset['id'] = dataset['left_id'] = dataset['right_id'] = np.arange(dataset.shape[0])
            leftDF = dataset[self.lcolumns].copy()
            leftDF.columns = self.columns
            rightDF = dataset[self.rcolumns].copy()
            rightDF.columns = self.columns

            em.set_key(dataset, 'id')
            em.set_key(leftDF, 'id')
            em.set_key(rightDF, 'id')
            em.set_ltable(dataset, leftDF)
            em.set_rtable(dataset, rightDF)
            em.set_fk_ltable(dataset, 'left_id')
            em.set_fk_rtable(dataset, 'right_id')

            self.exctracted_features = em.extract_feature_vecs(dataset, feature_table=self.feature_table)
            self.exctracted_features = self.exctracted_features.fillna(impute_value)
            # keep only the exclude attrs that actually appear in the extracted feature columns
            exclude_tmp = list(set(self.exclude_attrs) & set(self.exctracted_features.columns))
            self.predictions = self.model.predict(table=self.exctracted_features, exclude_attrs=exclude_tmp,
                                                  return_probs=True,
                                                  target_attr='pred', probs_attr='match_score', append=True)
        del dataset
        del captured
        gc.collect()
        return self.predictions['match_score'].values
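
    # A rough, hypothetical sketch (not from the original class) of how the attributes this
    # predict() relies on -- self.feature_table, self.model, self.exclude_attrs -- might be
    # prepared at fit time with standard py_entitymatching calls; all variable names below
    # (left_train, right_train, train_pairs) are illustrative:
    #
    #     self.feature_table = em.get_features_for_matching(
    #         left_train, right_train, validate_inferred_attr_types=False)
    #     train_fv = em.extract_feature_vecs(train_pairs, feature_table=self.feature_table,
    #                                        attrs_after='label').fillna(impute_value)
    #     self.exclude_attrs = ['id', 'left_id', 'right_id']
    #     self.model = em.RFMatcher()
    #     self.model.fit(table=train_fv, exclude_attrs=self.exclude_attrs, target_attr='label')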
Example #3
def setup_keys(C, l, r):
    """Register the key, foreign keys and base tables for candidate set C in the em catalog."""
    em.set_key(C, 'id')
    em.set_key(l, 'a.index')
    em.set_key(r, 'b.index')
    em.set_fk_ltable(C, 'a.index')
    em.set_fk_rtable(C, 'b.index')
    em.set_ltable(C, l)
    em.set_rtable(C, r)
Example #4
def read_pair_csv_with_metadata(tuples, filename, key, dtype={}):
    # Is there a way to pass the dtype parameter through the native py_entitymatching reader?
    # For now, do it "by hand": read with pandas and set the metadata explicitly.
    pairs = pd.read_csv(filename, dtype=dtype)
    em.set_key(pairs, key)
    em.set_ltable(pairs, tuples)
    em.set_fk_ltable(pairs, f'ltable_{em.get_key(tuples)}')
    em.set_rtable(pairs, tuples)
    em.set_fk_rtable(pairs, f'rtable_{em.get_key(tuples)}')
    return pairs
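
# For comparison (a hedged sketch): py_entitymatching's native reader can register the same
# metadata at load time. Whether extra pandas kwargs such as dtype are forwarded may depend on
# the library version, which is why the function above reads with pandas and sets metadata by hand.
#
#     pairs = em.read_csv_metadata(filename, key=key,
#                                  ltable=tuples, rtable=tuples,
#                                  fk_ltable=f'ltable_{em.get_key(tuples)}',
#                                  fk_rtable=f'rtable_{em.get_key(tuples)}')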
Example #5
def automatic_feature_gen(candidate_table, feature_cols, id_names,
                          id_names_phrase):
    '''
    NB!
    The automatic feature generation creates pairwise features. Consequently, it internally
    converts the column names in the lhs and rhs portions of `feature_cols` to the SAME name,
    by trimming the `id_names_phrase` portion (suffix or prefix) from each column name.
    It assumes that the id names are of the form id_{id_names_phrase}, e.g. id_amzn.

    Replaces NaNs in the candidate table with empty strings.

    Takes in a single DataFrame (lhs_table and rhs_table concatenated), splits it into
    two tables and then generates features on each of the sub tables.

    Inputs:
            candidate_table: single pandas DataFrame (typically output of blocking_algorithms.py functions)
            feature_cols: pair of lists [lhs_cols, rhs_cols] of feature column names
            id_names: pair of id column names, e.g. ["id_amzn", "id_g"]
            id_names_phrase: pair of suffixes/prefixes to trim from the lhs/rhs column names

    Outputs:
            matching_features_df: DataFrame of generated feature vectors, one row per candidate pair
    '''

    em.del_catalog()
    candidate_table = candidate_table.reset_index()

    lhs_table = candidate_table.loc[:, feature_cols[0] + [id_names[0]]]
    rhs_table = candidate_table.loc[:, feature_cols[1] + [id_names[1]]]

    lhs_colnames = []
    for colname in lhs_table:
        if colname != id_names[0]:
            lhs_colnames.append(re.sub(id_names_phrase[0], "", colname))
        else:
            lhs_colnames.append(colname)
    rhs_colnames = []
    for colname in rhs_table:
        if colname != id_names[1]:
            rhs_colnames.append(re.sub(id_names_phrase[1], "", colname))
        else:
            rhs_colnames.append(colname)

    lhs_table.columns = lhs_colnames
    rhs_table.columns = rhs_colnames
    # Use fresh positional indices as keys, because the same product ID can appear in multiple candidate comparisons
    lhs_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    rhs_table["index_num_rhs"] = np.arange(rhs_table.shape[0])

    em.set_key(lhs_table, "index_num_lhs")  # changed from id_names
    em.set_key(rhs_table, "index_num_rhs")
    # Generate List Of Features
    matching_features = em.get_features_for_matching(
        lhs_table.drop(id_names[0], axis=1),
        rhs_table.drop(id_names[1], axis=1),
        validate_inferred_attr_types=False)
    # Set the primary key and foreign keys on the candidate table before extracting feature vectors
    candidate_table["index"] = np.arange(candidate_table.shape[0])
    # Add foreign keys to candidate table
    candidate_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    candidate_table["index_num_rhs"] = np.arange(rhs_table.shape[0])

    em.set_key(candidate_table, "index")
    em.set_fk_ltable(candidate_table, "index_num_lhs")
    em.set_fk_rtable(candidate_table, "index_num_rhs")
    em.set_ltable(candidate_table, lhs_table)
    em.set_rtable(candidate_table, rhs_table)

    matching_features_df = em.extract_feature_vecs(
        candidate_table, feature_table=matching_features, show_progress=False)

    matching_features_df = em.impute_table(
        matching_features_df,
        exclude_attrs=['index', "index_num_lhs", "index_num_rhs"],
        strategy='mean')
    # add back the amzn and google ids
    matching_features_df["id_amzn"] = candidate_table.id_amzn
    matching_features_df["id_g"] = candidate_table.id_g

    matching_features_df = matching_features_df.fillna(value=0)

    # print(matching_features_df.describe())
    # print(f"Number na {matching_features_df.isna().apply(sum)}")
    # print(f"Number null {matching_features_df.isnull().apply(sum)}")
    return matching_features_df
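
# A hypothetical call, with column names derived from the docstring conventions above
# (all values below are illustrative, not from the original project):
#
#     features_df = automatic_feature_gen(
#         candidates,
#         feature_cols=[["title_amzn", "description_amzn"], ["title_g", "description_g"]],
#         id_names=["id_amzn", "id_g"],
#         id_names_phrase=["amzn", "g"])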
Example #6
C.shape


# We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs.
# 
# #### Substep B: Specifying the keys 
# The next step is to tell the **py_entitymatching** package which columns correspond to the keys in each dataframe. We also need to specify which columns in the candidate set correspond to the foreign keys of the two dataframes.

# In[22]:

import py_entitymatching as em
em.set_key(kaggle_data, 'id')   # specifying the key column in the kaggle dataset
em.set_key(imdb_data, 'id')     # specifying the key column in the imdb dataset
em.set_key(C, '_id')            # specifying the key in the candidate set
em.set_ltable(C, kaggle_data)   # specifying the left table 
em.set_rtable(C, imdb_data)     # specifying the right table
em.set_fk_rtable(C, 'r_id')     # specifying the column that matches the key in the right table 
em.set_fk_ltable(C, 'l_id')     # specifying the column that matches the key in the left table 


# 
# #### Substep C: Debugging the blocker
# 
# Now, we need to make sure that the candidate set is loose enough to include pairs of movies that are not very close. If this is not the case, there is a chance that we have eliminated pairs that could potentially be matched. By looking at a few pairs from the candidate set, we can judge whether the blocking step has been too harsh.
# 
# *Note: The **py_entitymatching** package provides some tools for debugging the blocker as well.*

# In[23]:

C[['l_norm_movie_title', 'r_norm_title', 'l_norm_title_year', 'r_norm_year',
   'l_budget', 'r_budget', 'l_content_rating', 'r_mpaa']].head()
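
# As a hedged sketch of the debugging tools mentioned above: em.debug_blocker returns pairs
# that look similar but are *not* in C, which helps spot matches the join may have dropped
# (output_size below is an illustrative cap on how many such pairs are returned).

dbg = em.debug_blocker(C, kaggle_data, imdb_data, output_size=200)
dbg.head()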
Example #7
# Load labeled data from previous session

# In[105]:

G = em.load_object('./GoldenData.pkl')
len(G)

# In[106]:

## Loading G into em catalog

em.set_fk_ltable(G, 'ltable_ID')
em.set_fk_rtable(G, 'rtable_ID')
em.set_key(G, '_id')
em.set_ltable(G, A)
em.set_rtable(G, B)

# In[107]:

## Find number of positive and negative examples
G.groupby('gold_labels').count()

# <h1> Splitting the labeled data into development and evaluation set
#
#
#

# In this step, we split the labeled data into two sets: development (I) and evaluation (J). The development set is used to come up with the best learning-based matcher, and the evaluation set is used to evaluate the selected matcher on unseen data.
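
# A minimal sketch of that split with py_entitymatching's helper; the 70/30 proportion and
# random_state are illustrative choices, not values taken from the original session.
IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
I = IJ['train']
J = IJ['test']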

# In[109]:
hotels['mixture'] = hotels['norm_name'] + ' ' + hotels['norm_address'] 

C = ssj.overlap_coefficient_join(hotels, booking, 'id','id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), 
                                 l_out_attrs=['link','norm_name','name','norm_address','head_address','rooms', 'norm_star', 'benefits','norm_image','rating','destination'], 
                                 r_out_attrs=['link','norm_name','name','norm_address','head_address','rooms', 'norm_star', 'benefits','norm_image','rating','destination'], 
                                 threshold=0.7)
print(C.shape)


# Creating the Rule-Based Matcher
import py_entitymatching as em
em.set_key(hotels, 'id')
em.set_key(booking, 'id')
em.set_key(C, '_id')
em.set_ltable(C, hotels)
em.set_rtable(C, booking)
em.set_fk_rtable(C, 'r_id')
em.set_fk_ltable(C, 'l_id')

brm = em.BooleanRuleMatcher()

booking.dtypes
# Generate a set of features
F = em.get_features_for_matching(hotels, booking, validate_inferred_attr_types=False)
F.feature_name
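
# A hedged sketch of using the BooleanRuleMatcher created above: the feature name in the rule
# is an assumption (check F.feature_name above for the names actually generated for this schema)
# and the 0.8 threshold is illustrative.
brm.add_rule(['norm_name_norm_name_lev_sim(ltuple, rtuple) > 0.8'], F)
predictions = brm.predict(table=C, target_attr='pred', append=True)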


#row = 45
#lev.get_sim_score(C.iloc[[row]]['l_norm_name'][row],C.iloc[[row]]['r_norm_name'][row])

# execute sim score