def prep_data_pair_mallegan(dataset, use_description):
    """
    Preprocessing for the MAGELLAN framework.

    :param dataset: train or test dataset
    :param use_description: accepted for interface compatibility; not used in this function body
    :return: left table A, right table B, and the combined MAGELLAN-ready dataset G
    """
    # Preprocess pairs (A is the left side of each pair, B the right side)
    A = dataset[['content_1']]
    # A = A.reset_index()
    A = A.rename(columns={'content_1': "title"})
    A["l_id"] = A.index
    em.set_key(A, 'l_id')

    B = dataset[['content_2']]
    # B = B.reset_index()
    B = B.rename(columns={'content_2': "title"})
    B["r_id"] = len(A) + B.index
    em.set_key(B, 'r_id')

    # Join A and B to obtain the complete pairwise dataset
    G = A.join(B, lsuffix='_left', rsuffix='_right')
    G["label"] = dataset[["label"]]
    G["id"] = G.index
    em.set_key(G, 'id')
    em.set_ltable(G, A)
    em.set_rtable(G, B)
    em.set_fk_ltable(G, 'l_id')
    em.set_fk_rtable(G, 'r_id')
    return A, B, G
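# A minimal usage sketch (assumption: a toy `dataset` with content_1,
# content_2 and label columns; the values below are made up):
import pandas as pd
import py_entitymatching as em

toy = pd.DataFrame({
    'content_1': ['iphone 12 64gb', 'galaxy s21'],
    'content_2': ['apple iphone 12 (64 gb)', 'pixel 5'],
    'label': [1, 0],
})
A, B, G = prep_data_pair_mallegan(toy, use_description=False)
print(em.get_key(G), len(G))  # -> 'id', one row per input pair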
def predict(self, dataset, impute_value=0):
    dataset = dataset.copy()
    # io here is IPython's (from IPython.utils import io); capture_output()
    # silences py_entitymatching's console output during feature extraction.
    with io.capture_output() as captured:
        dataset['id'] = dataset['left_id'] = dataset['right_id'] = np.arange(dataset.shape[0])
        leftDF = dataset[self.lcolumns].copy()
        leftDF.columns = self.columns
        rightDF = dataset[self.rcolumns].copy()
        rightDF.columns = self.columns
        em.set_key(dataset, 'id')
        em.set_key(leftDF, 'id')
        em.set_key(rightDF, 'id')
        em.set_ltable(dataset, leftDF)
        em.set_rtable(dataset, rightDF)
        em.set_fk_ltable(dataset, 'left_id')
        em.set_fk_rtable(dataset, 'right_id')
        self.exctracted_features = em.extract_feature_vecs(dataset, feature_table=self.feature_table)
        self.exctracted_features = self.exctracted_features.fillna(impute_value)
        # Keep only the exclude attributes that actually occur in the feature table
        # (set(a) - (set(a) - set(b)) is simply the intersection)
        exclude_tmp = list(set(self.exclude_attrs) & set(self.exctracted_features.columns))
        self.predictions = self.model.predict(table=self.exctracted_features,
                                              exclude_attrs=exclude_tmp,
                                              return_probs=True,
                                              target_attr='pred',
                                              probs_attr='match_score',
                                              append=True)
    del dataset
    del captured
    gc.collect()
    return self.predictions['match_score'].values
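# For context, a hedged sketch of the wrapper state the predict() method above
# relies on; only the attribute names are taken from the method itself, while
# the class name and constructor are assumptions:
class MagellanModelWrapper:
    def __init__(self, model, feature_table, columns, lcolumns, rcolumns, exclude_attrs):
        self.model = model                  # e.g. a fitted em.RFMatcher()
        self.feature_table = feature_table  # output of em.get_features_for_matching
        self.columns = columns              # canonical column names shared by both sides
        self.lcolumns = lcolumns            # left-side columns in the pair table
        self.rcolumns = rcolumns            # right-side columns in the pair table
        self.exclude_attrs = exclude_attrs  # non-feature columns, e.g. ['id', 'left_id', 'right_id']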
def setup_keys(C, l, r):
    em.set_key(C, 'id')
    em.set_key(l, 'a.index')
    em.set_key(r, 'b.index')
    em.set_fk_ltable(C, 'a.index')
    em.set_fk_rtable(C, 'b.index')
    em.set_ltable(C, l)
    em.set_rtable(C, r)
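# A tiny, fully assumed example of the frames setup_keys() expects: a candidate
# table C whose 'a.index'/'b.index' columns point back into l and r:
import pandas as pd

l = pd.DataFrame({'a.index': [0, 1], 'name': ['acme', 'ajax']})
r = pd.DataFrame({'b.index': [0, 1], 'name': ['acme inc', 'zenith']})
C = pd.DataFrame({'id': [0, 1], 'a.index': [0, 1], 'b.index': [0, 0]})
setup_keys(C, l, r)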
def read_pair_csv_with_metadata(tuples, filename, key, dtype=None):
    # Is there a way to inject the dtype parameter into em's native reader?
    # For now, do it "by hand" with pandas and register the metadata afterwards.
    pairs = pd.read_csv(filename, dtype=dtype)
    em.set_key(pairs, key)
    em.set_ltable(pairs, tuples)
    em.set_fk_ltable(pairs, f'ltable_{em.get_key(tuples)}')
    em.set_rtable(pairs, tuples)
    em.set_fk_rtable(pairs, f'rtable_{em.get_key(tuples)}')
    return pairs
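# Usage sketch; the file names, key names, and dtype mapping are hypothetical,
# and `tuples` is assumed to have been loaded with em's native reader so that
# its key is already registered in the catalog:
import py_entitymatching as em

tuples = em.read_csv_metadata('tuples.csv', key='id')  # hypothetical file
pairs = read_pair_csv_with_metadata(tuples, 'pairs.csv', key='_id',
                                    dtype={'ltable_id': str, 'rtable_id': str})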
def automatic_feature_gen(candidate_table, feature_cols, id_names, id_names_phrase):
    '''
    NB! The automatic function creates pairwise features. Consequently, it will
    internally convert the column names in the lhs and rhs portions of
    feature_cols to the SAME name, by trimming the `id_names_phrase` portion
    (suffix or prefix) from each column name.

    It assumes that the id names are of the form id_{id_names_phrase}, e.g. id_amzn.

    Takes in a single DataFrame (lhs_table and rhs_table concatenated) and splits
    it into two tables, then generates features on each of the sub-tables.
    Remaining NaNs in the generated feature table are imputed with the column
    mean, and any still-missing values are filled with 0.

    Inputs:
        candidate_table: single pandas DataFrame (typically the output of
            blocking_algorithms.py functions)
    Outputs:
        matching_features_df: DataFrame of pairwise matching features, one row
            per candidate pair
    '''
    em.del_catalog()
    candidate_table = candidate_table.reset_index()
    lhs_table = candidate_table.loc[:, feature_cols[0] + [id_names[0]]].copy()
    rhs_table = candidate_table.loc[:, feature_cols[1] + [id_names[1]]].copy()

    # Strip the lhs/rhs id phrase from the column names so both sides align
    lhs_colnames = []
    for colname in lhs_table:
        if colname != id_names[0]:
            lhs_colnames.append(re.sub(id_names_phrase[0], "", colname))
        else:
            lhs_colnames.append(colname)
    rhs_colnames = []
    for colname in rhs_table:
        if colname != id_names[1]:
            rhs_colnames.append(re.sub(id_names_phrase[1], "", colname))
        else:
            rhs_colnames.append(colname)
    lhs_table.columns = lhs_colnames
    rhs_table.columns = rhs_colnames

    # Use fresh positional indices as keys, to circumvent the same product ID
    # coming up again (a record can appear in multiple candidate comparisons)
    lhs_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    rhs_table["index_num_rhs"] = np.arange(rhs_table.shape[0])
    em.set_key(lhs_table, "index_num_lhs")  # changed from id_names
    em.set_key(rhs_table, "index_num_rhs")

    # Generate the list of features
    matching_features = em.get_features_for_matching(
        lhs_table.drop(id_names[0], axis=1),
        rhs_table.drop(id_names[1], axis=1),
        validate_inferred_attr_types=False)

    # Set the primary key and foreign keys for the candidate table, then
    # extract feature vectors as a DataFrame
    candidate_table["index"] = np.arange(candidate_table.shape[0])
    candidate_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    candidate_table["index_num_rhs"] = np.arange(rhs_table.shape[0])
    em.set_key(candidate_table, "index")
    em.set_fk_ltable(candidate_table, "index_num_lhs")
    em.set_fk_rtable(candidate_table, "index_num_rhs")
    em.set_ltable(candidate_table, lhs_table)
    em.set_rtable(candidate_table, rhs_table)

    matching_features_df = em.extract_feature_vecs(
        candidate_table, feature_table=matching_features, show_progress=False)
    matching_features_df = em.impute_table(
        matching_features_df,
        exclude_attrs=['index', "index_num_lhs", "index_num_rhs"],
        strategy='mean')

    # Add back the Amazon and Google ids
    matching_features_df["id_amzn"] = candidate_table.id_amzn
    matching_features_df["id_g"] = candidate_table.id_g
    matching_features_df = matching_features_df.fillna(value=0)
    # print(matching_features_df.describe())
    # print(f"Number na {matching_features_df.isna().apply(sum)}")
    # print(f"Number null {matching_features_df.isnull().apply(sum)}")
    return matching_features_df
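# A hedged call sketch; the feature columns are assumptions, but the id columns
# and suffixes follow the Amazon-Google schema the function itself references
# (id_amzn / id_g):
feature_cols = [['title_amzn', 'description_amzn'],  # lhs feature columns (assumed)
                ['title_g', 'description_g']]        # rhs feature columns (assumed)
features = automatic_feature_gen(candidate_table,
                                 feature_cols=feature_cols,
                                 id_names=['id_amzn', 'id_g'],
                                 id_names_phrase=['_amzn', '_g'])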
# We can see that by doing a similarity join, we have already reduced the candidate set to 18,317 pairs.
#
# #### Substep B: Specifying the keys
# The next step is to tell the **py_entitymatching** package which columns correspond to the keys in each dataframe. We also need to specify which columns correspond to the foreign keys of the two dataframes in the candidate set.

# In[22]:

import py_entitymatching as em

em.set_key(kaggle_data, 'id')  # specify the key column in the kaggle dataset
em.set_key(imdb_data, 'id')    # specify the key column in the imdb dataset
em.set_key(C, '_id')           # specify the key in the candidate set
em.set_ltable(C, kaggle_data)  # specify the left table
em.set_rtable(C, imdb_data)    # specify the right table
em.set_fk_rtable(C, 'r_id')    # specify the column that matches the key in the right table
em.set_fk_ltable(C, 'l_id')    # specify the column that matches the key in the left table

# #### Substep C: Debugging the blocker
#
# Now we need to make sure that the candidate set is loose enough to include pairs of movies that are not very close. If it is too tight, there is a chance that we have eliminated pairs that could potentially have been matched. By looking at a few pairs from the candidate set, we can judge whether the blocking step was too harsh.
#
# *Note: the **py_entitymatching** package provides some tools for debugging the blocker as well; see the sketch below.*

# In[23]:

C[['l_norm_movie_title', 'r_norm_title', 'l_norm_title_year', 'r_norm_year',
   'l_budget', 'r_budget', 'l_content_rating', 'r_mpaa']].head()
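# One such tool is `em.debug_blocker`, which surfaces likely matches that the
# blocker dropped; a minimal sketch (the output size is arbitrary, and
# `attr_corres` may need to be passed explicitly since the two tables use
# different column names):

dbg = em.debug_blocker(C, kaggle_data, imdb_data, output_size=200)
dbg.head()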
## Label S
G = em.label_table(S, 'gold_labels')

# Load labeled data from a previous session

# In[105]:

G = em.load_object('./GoldenData.pkl')
len(G)

# In[106]:

## Load G into the em catalog
em.set_fk_ltable(G, 'ltable_ID')
em.set_fk_rtable(G, 'rtable_ID')
em.set_key(G, '_id')
em.set_ltable(G, A)
em.set_rtable(G, B)

# In[107]:

## Count the positive and negative examples
G.groupby('gold_labels').count()

# <h1> Splitting the labeled data into development and evaluation sets
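# The split itself can be done with py_entitymatching's helper; a sketch
# assuming a 70/30 split:

train_test = em.split_train_test(G, train_proportion=0.7, random_state=0)
dev_set = train_test['train']   # development set
eval_set = train_test['test']   # evaluation set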
C = ssj.overlap_coefficient_join(
    hotels, booking, 'id', 'id', 'mixture', 'mixture',
    sm.WhitespaceTokenizer(),
    l_out_attrs=['link', 'norm_name', 'name', 'norm_address', 'head_address',
                 'rooms', 'norm_star', 'benefits', 'norm_image', 'rating', 'destination'],
    r_out_attrs=['link', 'norm_name', 'name', 'norm_address', 'head_address',
                 'rooms', 'norm_star', 'benefits', 'norm_image', 'rating', 'destination'],
    threshold=0.7)
print(C.shape)

# Creating the rule-based matcher
import py_entitymatching as em

em.set_key(hotels, 'id')
em.set_key(booking, 'id')
em.set_key(C, '_id')
em.set_ltable(C, hotels)
em.set_rtable(C, booking)
em.set_fk_rtable(C, 'r_id')
em.set_fk_ltable(C, 'l_id')

brm = em.BooleanRuleMatcher()
booking.dtypes

# Generate a set of features
F = em.get_features_for_matching(hotels, booking, validate_inferred_attr_types=False)
F.feature_name

# row = 45
# lev.get_sim_score(C.iloc[[row]]['l_norm_name'][row], C.iloc[[row]]['r_norm_name'][row])  # execute sim score
# C['pred_label'] = 0
# for row in range(C.shape[0]):
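# From here, a rule can be added to brm and applied to C; the feature name
# below is hypothetical and must match an entry in F.feature_name:
brm.add_rule(['norm_name_norm_name_lev_sim(ltuple, rtuple) > 0.8'], F)
brm.predict(table=C, target_attr='pred_label', append=True)
C['pred_label'].value_counts()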