import numpy as np
import pandas as pd
from pathlib import Path
from dirty_cat import SimilarityEncoder  # renamed "skrub" in newer releases


def preprocess(df, encode, categorize, preran):
    """Split df into features X and target y, optionally similarity-encoding
    selected string columns and converting object columns to categoricals."""
    y = df["area"]
    X = df.drop(["area"], axis=1)
    X.info()  # quick dtype/shape summary for logging

    if encode:
        # Columns to similarity-encode; left empty here on purpose --
        # fill in the relevant string columns for your dataset.
        encode_columns = []
        n_prototypes = 5
        if not preran:
            enc = SimilarityEncoder(
                similarity="ngram",
                categories="k-means",
                n_prototypes=n_prototypes,
            )
            enc.fit(X[encode_columns].values)
            Path("encoders").mkdir(exist_ok=True)
            pd.to_pickle(enc, "encoders/similarity_encoder.pickle")
        else:
            enc = pd.read_pickle("encoders/similarity_encoder.pickle")
        transformed_values = enc.transform(X[encode_columns].values)
        transformed_values = pd.DataFrame(transformed_values, index=X.index)
        # One output column per encoded column and per prototype.
        transformed_columns = []
        for col in encode_columns:
            for i in range(n_prototypes):
                transformed_columns.append(col + "_" + str(i))
        transformed_values.columns = transformed_columns
        X = pd.concat([X, transformed_values], axis=1)
        X = X.drop(encode_columns, axis=1)

    if categorize:
        # Convert remaining object columns to pandas "category" dtype.
        obj_cols = X.select_dtypes("object").columns
        X[obj_cols] = X[obj_cols].astype("category")

    return X, y
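# Usage sketch (hypothetical): a minimal round trip through preprocess,
# assuming a DataFrame with an "area" target column; the feature names and
# values below are made up. encode=True requires filling in encode_columns
# inside preprocess first, so this sketch only exercises the categorize path.
df_demo = pd.DataFrame({
    "area": [0.0, 6.4, 12.2],
    "month": ["mar", "aug", "sep"],
    "wind": [6.7, 0.9, 4.0],
})
X_demo, y_demo = preprocess(df_demo, encode=False, categorize=True,
                            preran=False)
print(X_demo.dtypes)  # "month" is now a pandas "category" dtype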
class FitSimilarityEncoder:
    """Thin wrapper that fits a SimilarityEncoder on a single column."""

    def __init__(self, col_name):
        self.col_name = col_name

    def fit(self, df, similarity="ngram", categories="most_frequent",
            n_prototypes=100):
        # Initialize the similarity encoder
        self.similarity_encoder = SimilarityEncoder(
            similarity=similarity,
            dtype=np.float32,
            categories=categories,
            n_prototypes=n_prototypes,
            random_state=1006,
        )
        # Fit the similarity encoder on the wrapped column
        self.similarity_encoder.fit(df[self.col_name].values.reshape(-1, 1))

    def transform(self, df):
        return self.similarity_encoder.transform(
            df[self.col_name].values.reshape(-1, 1))
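# Usage sketch (hypothetical): fit the wrapper on a training frame and reuse
# it on new data. The column name and values are made up; this assumes a
# dirty_cat version that supports these SimilarityEncoder parameters.
train_demo = pd.DataFrame({"city": ["London", "Londn", "Paris", "Berlin"]})
test_demo = pd.DataFrame({"city": ["Lonndon", "Madrid"]})

city_enc = FitSimilarityEncoder("city")
city_enc.fit(train_demo, n_prototypes=3)  # prototypes drawn from train_demo
emb = city_enc.transform(test_demo)       # shape: (len(test_demo), 3)
print(np.round(emb, 2))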
def similarity_encode(X, encode_columns, n_prototypes, train, drop_original):
    """Similarity-encode the given columns, fitting a new encoder when
    train=True and reusing the pickled one otherwise."""
    X = X.copy()
    if train:
        enc = SimilarityEncoder(
            similarity="ngram",
            categories="k-means",
            n_prototypes=n_prototypes,
        )
        enc.fit(X[encode_columns].values)
        Path("encoders").mkdir(exist_ok=True)
        pd.to_pickle(enc, "encoders/similarity_encoder.pickle")
    else:
        enc = pd.read_pickle("encoders/similarity_encoder.pickle")

    transformed_values = enc.transform(X[encode_columns].values)
    transformed_values = pd.DataFrame(transformed_values, index=X.index)
    # One output column per encoded column and per prototype.
    transformed_columns = []
    for col in encode_columns:
        for i in range(n_prototypes):
            transformed_columns.append(col + "_" + str(i))
    transformed_values.columns = transformed_columns

    X = pd.concat([X, transformed_values], axis=1)
    if drop_original:
        X = X.drop(encode_columns, axis=1)
    return X
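# Usage sketch (hypothetical) of the intended train/inference round trip:
# train=True fits and pickles the encoder, train=False reloads that same
# pickle so serving-time columns match training-time columns. The frame and
# column name are made up.
X_tr = pd.DataFrame({"city": ["London", "Londn", "Paris", "Berlin"]})
X_new = pd.DataFrame({"city": ["Lonndon"]})

X_tr_enc = similarity_encode(X_tr, ["city"], n_prototypes=3,
                             train=True, drop_original=True)
X_new_enc = similarity_encode(X_new, ["city"], n_prototypes=3,
                              train=False, drop_original=True)
print(X_new_enc.columns.tolist())  # ['city_0', 'city_1', 'city_2']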
import distance as dist  # PyPI "distance" package: jaccard() / sorensen()
import category_encoders as ce
from scipy import sparse
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfTransformer, TfidfVectorizer)
from dirty_cat import TargetEncoder

# one_hot_encoding, one_hot_encoding_sparse, ngram_similarity,
# ngrams_hashing_vectorizer and mdv_encoding are project-local helpers
# defined elsewhere in this repository.


def categorical_encoding(A, B, y_train, encoder, clf_type, n_jobs):
    '''Build the encoded matrix.

    Given two arrays of strings to compare and an encoder name, return the
    corresponding encoded matrix of size len(A) x len(B).'''
    if encoder == 'levenshtein-ratio_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='levenshtein-ratio')
        encoder.fit(B)
        return encoder.transform(A.reshape(-1, 1))
    if encoder == 'one-hot_encoding':
        return one_hot_encoding(A, B)
    if encoder == 'one-hot_encoding_sparse':
        return sparse.csr_matrix(one_hot_encoding_sparse(A, B))
    if encoder == 'jaccard_similarity':
        B = np.unique(B)
        warning = (('Warning: %s is not a well-defined similarity '
                    'metric because two different values can have a '
                    'similarity of 1') % encoder)
        print(warning)
        unqA = np.unique(A)
        vlev = np.vectorize(dist.jaccard)
        # dvec = Parallel(n_jobs=n_jobs)(delayed(vlev)(a, B.reshape(1, -1))
        #                                for a in unqA)
        dvec = [vlev(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dms = (ddict[a] for a in A)
        dm = np.vstack(dms)
        return 1 - dm  # convert distances to similarities
    if encoder == 'sorensen_similarity':
        B = np.unique(B)
        unqA = np.unique(A)
        vlev = np.vectorize(dist.sorensen)
        # dvec = Parallel(n_jobs=n_jobs)(delayed(vlev)(a, B.reshape(1, -1))
        #                                for a in unqA)
        dvec = [vlev(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dms = (ddict[a] for a in A)
        dm = np.vstack(dms)
        return 1 - dm  # convert distances to similarities
    if encoder == 'jaro-winkler_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='jaro-winkler')
        encoder.fit(B)
        return encoder.transform(A.reshape(-1, 1))
    if encoder[1:] == 'gram_SimilarityEncoder':
        # Note: n is parsed here but the encoder is built with its default
        # n-gram range, as in the original code.
        n = int(encoder[0])
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder()
        encoder.fit(B)
        return encoder.transform(A.reshape(-1, 1))
    if encoder[1:] == 'gram_presence_fisher_kernel':
        # B is not deduplicated here, unlike the other n-gram variants.
        n = int(encoder[0])
        return ngram_similarity(A, B, n, sim_type='fisher_kernel')
    # The remaining 'Xgram_similarity*' names all follow the same pattern:
    # parse n, deduplicate B, and delegate to ngram_similarity with the
    # matching sim_type.
    ngram_sim_types = {
        'gram_similarity2': 'sim2',
        'gram_similarity2_1': 'sim2_1',
        'gram_similarity2_2': 'sim2_2',
        'gram_similarity3': 'sim3',
        'gram_similarity3_2': 'sim3_2',
        'gram_similarity4': 'sim4',
        'gram_similarity5': 'sim5',
        'gram_similarity6': 'sim6',
        'gram_similarity7': 'sim7',
    }
    if encoder[1:] in ngram_sim_types:
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n,
                                sim_type=ngram_sim_types[encoder[1:]])
    if encoder[1:] == 'grams_count_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tfidf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(n, n),
                                     smooth_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(n, n),
                                     smooth_idf=False, use_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_hot_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        count_matrix = vectorizer.transform(A)
        return (count_matrix > 0).astype('float64')
    if encoder[1:] == 'grams_hot_vectorizer_tfidf':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        presenceB = (vectorizer.fit_transform(B) > 0).astype('float64')
        presenceA = (vectorizer.transform(A) > 0).astype('float64')
        transformer = TfidfTransformer(smooth_idf=True)
        transformer.fit(presenceB)
        return transformer.transform(presenceA)
    if encoder[1:] == 'grams_hashing':
        n = int(encoder[0])
        return ngrams_hashing_vectorizer(A, n, 10000)
    if encoder == 'TargetEncoder':
        encoder = TargetEncoder(clf_type=clf_type, handle_unknown='ignore')
        encoder.fit(B.reshape(-1, 1), y_train)
        return encoder.transform(A.reshape(-1, 1))
    if encoder == 'MDVEncoder':
        return mdv_encoding(A, B, y_train, clf_type)
    # category_encoders-based encoders share the same fit-on-B,
    # transform-A pattern.
    ce_encoders = {
        'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder,
        'BinaryEncoder': ce.BinaryEncoder,
        'HashingEncoder': ce.HashingEncoder,
        'HelmertEncoder': ce.HelmertEncoder,
        'OneHotEncoder': ce.OneHotEncoder,
        'OrdinalEncoder': ce.OrdinalEncoder,
        'SumEncoder': ce.SumEncoder,
        'PolynomialEncoder': ce.PolynomialEncoder,
        'BaseNEncoder': ce.BaseNEncoder,
    }
    if encoder in ce_encoders:
        enc = ce_encoders[encoder]()
        enc.fit(B)
        return enc.transform(A)
    if encoder == 'LeaveOneOutEncoder':
        enc = ce.LeaveOneOutEncoder()
        enc.fit(B, y_train)  # target-aware: needs y_train at fit time
        return enc.transform(A)
    # Raise instead of returning a message string, so callers fail loudly.
    raise ValueError('Encoder %s has not been implemented yet.' % encoder)
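# Usage sketch (hypothetical): encoder names that start with a digit, e.g.
# '3grams_tfidf_vectorizer', are parsed as n = 3 plus a suffix and dispatched
# to the matching branch. The inputs below are made up.
A_demo = np.array(["London", "Londn", "Paris"])
B_demo = np.array(["London", "Paris", "Paris", "Berlin"])

m = categorical_encoding(A_demo, B_demo, y_train=None,
                         encoder="3grams_tfidf_vectorizer",
                         clf_type=None, n_jobs=1)
print(m.shape)  # (3, number of distinct 3-grams in B_demo)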