def preprocess(df, encode, categorize, preran):
    y = df["area"]
    X = df.drop(["area"], axis=1)
    X.info()
    if encode:
        encode_columns = []
        n_prototypes = 5
        if not preran:
            # Fit a fresh encoder and persist it for later runs
            enc = SimilarityEncoder(similarity="ngram", categories="k-means",
                                    n_prototypes=n_prototypes)
            enc.fit(X[encode_columns].values)
            pd.to_pickle(enc, "encoders/similarity_encoder.pickle")
        else:
            # Reuse the encoder fitted on a previous run
            enc = pd.read_pickle("encoders/similarity_encoder.pickle")
        transformed_values = enc.transform(X[encode_columns].values)
        transformed_values = pd.DataFrame(transformed_values, index=X.index)
        # One output column per (source column, prototype) pair
        transformed_columns = []
        for col in encode_columns:
            for i in range(n_prototypes):
                transformed_columns.append(col + "_" + str(i))
        transformed_values.columns = transformed_columns
        X = pd.concat([X, transformed_values], axis=1)
        X = X.drop(encode_columns, axis=1)
    if categorize:
        obj_cols = X.select_dtypes("object").columns
        X[obj_cols] = X[obj_cols].astype("category")
    return X, y
def prune_characters(char_occ_dict, threshold=0.1):
    import numpy as np
    from dirty_cat import SimilarityEncoder
    from sklearn.preprocessing import minmax_scale
    from scipy.cluster.hierarchy import linkage
    from scipy.spatial.distance import squareform

    simenc = SimilarityEncoder(similarity='jaro-winkler')
    transf = simenc.fit_transform(
        np.array(sorted(char_occ_dict.keys())).reshape(-1, 1))
    # Similarities lie in [0, 1]; negate and rescale to turn them into distances
    corr_dist = minmax_scale(-transf)
    # Condense the square distance matrix into the form scipy's linkage expects
    dense_distance = squareform(corr_dist, checks=False)
    Z = linkage(dense_distance, 'average', optimal_ordering=True)
    return get_merged_characters(Z, char_occ_dict, threshold=threshold)
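# A minimal usage sketch with hypothetical data: character counts from a
# noisy text dump, where near-identical glyphs should be merged
# (`get_merged_characters` is assumed to be defined alongside this helper).
char_occ = {'a': 120, 'á': 3, 'o': 95, 'ó': 4, 'x': 7}
merged = prune_characters(char_occ, threshold=0.15)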
def benchmark(strat='k-means', limit=50000, n_proto=100, hash_dim=None,
              ngram_range=(3, 3)):
    df = dfr[:limit].copy()
    df = df.dropna(axis=0)
    df = df.reset_index()
    y = df['Violation Type']
    # Choose how the similarity prototypes are picked
    if strat == 'k-means':
        sim_enc = SimilarityEncoder(similarity='ngram', ngram_range=ngram_range,
                                    categories='k-means', hashing_dim=hash_dim,
                                    n_prototypes=n_proto, random_state=3498)
    else:
        sim_enc = SimilarityEncoder(similarity='ngram', ngram_range=ngram_range,
                                    categories='most_frequent',
                                    hashing_dim=hash_dim,
                                    n_prototypes=n_proto, random_state=3498)
    column_trans = ColumnTransformer(
        transformers=transformers + [('sim_enc', sim_enc, ['Description'])],
        remainder='drop')
    # Time the vectorization step
    t0 = time()
    X = column_trans.fit_transform(df)
    t1 = time()
    t_score_1 = t1 - t0
    model = pipeline.Pipeline([('logistic', linear_model.LogisticRegression())])
    # Time the cross-validation step
    t0 = time()
    m_score = model_selection.cross_val_score(model, X, y, cv=20)
    t1 = time()
    t_score_2 = t1 - t0
    return t_score_1, m_score, t_score_2
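# A quick comparison sketch (assumes `dfr` and `transformers` are defined as
# above; the limits here are arbitrary): time both prototype strategies.
for strategy in ('k-means', 'most_frequent'):
    t_vec, scores, t_cv = benchmark(strat=strategy, limit=10000, n_proto=50)
    print('%s: vectorized in %.1fs, mean CV score %.3f (CV took %.1fs)'
          % (strategy, t_vec, scores.mean(), t_cv))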
class FitSimilarityEncoder:
    def __init__(self, col_name):
        self.col_name = col_name

    def fit(self, df, similarity="ngram", categories="most_frequent",
            n_prototypes=100):
        # Initialize the similarity encoder
        self.similarity_encoder = SimilarityEncoder(similarity=similarity,
                                                    dtype=np.float32,
                                                    categories=categories,
                                                    n_prototypes=n_prototypes,
                                                    random_state=1006)
        # Fit the similarity encoder on the configured column
        self.similarity_encoder.fit(df[self.col_name].values.reshape(-1, 1))

    def transform(self, df):
        return self.similarity_encoder.transform(
            df[self.col_name].values.reshape(-1, 1))
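# A minimal usage sketch on a hypothetical DataFrame: fit on the training
# frame, then reuse the fitted encoder on held-out rows.
import pandas as pd

train_df = pd.DataFrame({'city': ['new york', 'new-york', 'boston', 'chicago']})
test_df = pd.DataFrame({'city': ['newyork', 'bostn']})
city_enc = FitSimilarityEncoder(col_name='city')
city_enc.fit(train_df, n_prototypes=3)
X_test = city_enc.transform(test_df)  # (2, 3) array of string similarities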
def similarity_encode(X, encode_columns, n_prototypes, train, drop_original):
    X = X.copy()
    if train:
        # Fit a fresh encoder and persist it for inference runs
        enc = SimilarityEncoder(similarity="ngram", categories="k-means",
                                n_prototypes=n_prototypes)
        enc.fit(X[encode_columns].values)
        Path("encoders").mkdir(exist_ok=True)
        pd.to_pickle(enc, "encoders/similarity_encoder.pickle")
    else:
        enc = pd.read_pickle("encoders/similarity_encoder.pickle")
    transformed_values = enc.transform(X[encode_columns].values)
    transformed_values = pd.DataFrame(transformed_values, index=X.index)
    # One output column per (source column, prototype) pair
    transformed_columns = []
    for col in encode_columns:
        for i in range(n_prototypes):
            transformed_columns.append(col + "_" + str(i))
    transformed_values.columns = transformed_columns
    X = pd.concat([X, transformed_values], axis=1)
    if drop_original:
        X = X.drop(encode_columns, axis=1)
    return X
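# Usage sketch (hypothetical column names): fit and pickle the encoder on
# the training split, then reuse the pickled encoder at inference time.
X_train = similarity_encode(X_train, ["Position", "ManagerName"],
                            n_prototypes=4, train=True, drop_original=True)
X_valid = similarity_encode(X_valid, ["Position", "ManagerName"],
                            n_prototypes=4, train=False, drop_original=True)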
###############################################################################
# As we will see, SimilarityEncoder takes a while on such data.

###############################################################################
# SimilarityEncoder with default options
# --------------------------------------
#
# Let us build our vectorizer, using a ColumnTransformer to combine
# one-hot encoding and similarity encoding

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from dirty_cat import SimilarityEncoder

sim_enc = SimilarityEncoder(similarity='ngram')

transformers = [
    ('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore'),
     clean_columns),
]
column_trans = ColumnTransformer(
    transformers=transformers + [('sim_enc', sim_enc, dirty_columns)],
    remainder='drop')

t0 = time()
X = column_trans.fit_transform(df)
t1 = time()
print('Time to vectorize: %s' % (t1 - t0))

###############################################################################
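# On large data, one way to keep the SimilarityEncoder tractable (a sketch,
# reusing options exercised elsewhere in this document, with a hypothetical
# variable name) is to restrict it to a fixed number of prototype
# categories, which caps the output width per encoded feature:

sim_enc_fast = SimilarityEncoder(similarity='ngram',
                                 categories='most_frequent', n_prototypes=100)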
"EmploymentStatus", "DateofTermination", "LastPerformanceReview_Date", "EmpStatusID", "TermReason", ], axis=1, ) X.info() date_cols = X.select_dtypes("datetime") for col in date_cols: X = encode_dates(X, col) encode_columns = ["Employee_Name", "Position", "ManagerName"] enc = SimilarityEncoder(similarity="ngram", categories="k-means", n_prototypes=4) for col in encode_columns: transformed_values = enc.fit_transform(X[col].values.reshape(-1, 1)) transformed_values = pd.DataFrame(transformed_values, index=X.index) transformed_values.columns = [f"{col}_" + str(num) for num in transformed_values] X = pd.concat([X, transformed_values], axis=1) X = X.drop(col, axis=1) obj_cols = X.select_dtypes("object").columns X[obj_cols] = X[obj_cols].astype("category") SEED = 0 SAMPLE_SIZE = 5000 Xt, Xv, yt, yv = train_test_split(
columns_names = df.columns

###############################################################################
# Estimators construction
# -----------------------
# Our input is categorical, thus needs to be encoded. As observations often
# consist of variations around a few concepts (for instance,
# :code:`'Amlodipine Besylate'` and
# :code:`'Amlodipine besylate and atorvastatin calcium'`
# have one ingredient in common), we need an encoding able to
# capture similarities between observations.

from dirty_cat import SimilarityEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

similarity_encoder = SimilarityEncoder(similarity='ngram')

###############################################################################
# Two other columns are used to predict the output: ``DOSAGEFORMNAME`` and
# ``ROUTENAME``. They are both categorical and can be encoded with a
# |OneHotEncoder|. We use a |ColumnTransformer| to stack the |OneHotEncoder|
# and the |SE|. We can now choose a kernel method, for instance a |SVC|, to
# fit the encoded inputs.

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

column_transformer = make_column_transformer(
    (similarity_encoder, ['NONPROPRIETARYNAME']),
    (OneHotEncoder(handle_unknown='ignore'), ['DOSAGEFORMNAME', 'ROUTENAME']),
    sparse_threshold=1)
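###############################################################################
# A sketch of the estimator the imports above prepare for (the SVC settings
# are illustrative assumptions, not tuned values):

classifier = SVC(kernel='rbf', random_state=42)
model = Pipeline([('transformer', column_transformer),
                  ('classifier', classifier)])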
import pandas as pd

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

from dirty_cat import datasets
from dirty_cat import SimilarityEncoder, TargetEncoder

# encoding methods
encoder_dict = {
    'one-hot': OneHotEncoder(handle_unknown='ignore'),
    'similarity': SimilarityEncoder(similarity='ngram',
                                    handle_unknown='ignore'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'num': FunctionTransformer(None),
}

data_file = datasets.fetch_employee_salaries()

for method in ['one-hot', 'target', 'similarity']:
    # Load the data
    df = pd.read_csv(data_file).astype(str)
    # Salaries are stored as strings with a leading '$': strip it and parse
    df['Current Annual Salary'] = [float(s[1:])
                                   for s in df['Current Annual Salary']]
    # Keep only the hire year from the 'Date First Hired' strings
    df['Year First Hired'] = [int(s.split('/')[-1])
                              for s in df['Date First Hired']]

    target_column = 'Current Annual Salary'
# -----------------------
#
# The one-hot encoder is actually not well suited to the 'Employee
# Position Title' column, as this column contains 400 different entries:

import numpy as np

np.unique(y)

# %%
# We will now experiment with encoders specially made for handling
# dirty columns

from dirty_cat import (SimilarityEncoder, TargetEncoder, MinHashEncoder,
                       GapEncoder)

encoders = {
    'one-hot': one_hot,
    'similarity': SimilarityEncoder(similarity='ngram'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'minhash': MinHashEncoder(n_components=100),
    'gap': GapEncoder(n_components=100),
}

# %%
# We now loop over the different encoding methods,
# instantiate a new |Pipeline| each time, fit it
# and store the returned cross-validation score:

from sklearn.model_selection import cross_val_score

all_scores = dict()

for name, method in encoders.items():
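    # A hypothetical sketch of what each iteration does, per the description
    # above (the `make_pipeline` assembly and the regressor choice are
    # illustrative assumptions, not the original code):
    #
    #     pipe = make_pipeline(method, HistGradientBoostingRegressor())
    #     all_scores[name] = cross_val_score(pipe, X_dirty, y)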
# employee's position title

# data
values = data[['Employee Position Title', 'Gender', 'Current Annual Salary']]

#########################################################################
# String similarity between entries
# -------------------------------------------------
#
# That's where our encoders come into play. In order to robustly
# embed dirty semantic data, the SimilarityEncoder creates a similarity
# matrix based on the 3-gram structure of the data.

sorted_values = values['Employee Position Title'].sort_values().unique()

from dirty_cat import SimilarityEncoder

similarity_encoder = SimilarityEncoder(similarity='ngram')
transformed_values = similarity_encoder.fit_transform(
    sorted_values.reshape(-1, 1))

#########################################################################
# Plotting the new representation using multi-dimensional scaling
# ................................................................
#
# Let's now plot a couple of points at random using a low-dimensional
# representation to get an intuition of what the similarity encoder is doing:

from sklearn.manifold import MDS

mds = MDS(dissimilarity='precomputed', n_init=10, random_state=42)
two_dim_data = mds.fit_transform(1 - transformed_values)
# transformed values lie in the 0-1 range,
# so 1 - transformed_value yields a positive dissimilarity matrix
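#########################################################################
# A minimal scatter sketch for the 2-D embedding above (the subset size
# is an arbitrary choice):

import numpy as np
import matplotlib.pyplot as plt

random_idx = np.random.RandomState(42).choice(
    len(two_dim_data), size=20, replace=False)
f, ax = plt.subplots()
ax.scatter(two_dim_data[random_idx, 0], two_dim_data[random_idx, 1])
for i in random_idx:
    ax.annotate(sorted_values[i], two_dim_data[i], fontsize=8)
plt.show()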
def __init__(
    self,
    encoder_name,
    reduction_method=None,
    ngram_range=(2, 4),
    categories="auto",
    dtype=np.float64,
    handle_unknown="ignore",
    clf_type=None,
    n_components=None,
):
    self.ngram_range = ngram_range
    self.encoder_name = encoder_name
    self.categories = categories
    self.dtype = dtype
    self.clf_type = clf_type
    self.handle_unknown = handle_unknown
    self.reduction_method = reduction_method
    self.n_components = n_components
    # Registry mapping encoder names to ready-to-use estimator instances
    self.encoders_dict = {
        "OneHotEncoder": OneHotEncoder(handle_unknown="ignore"),
        "OneHotEncoder-1": OneHotEncoderRemoveOne(handle_unknown="ignore"),
        "Categorical": None,
        "OneHotEncoderDense": OneHotEncoder(handle_unknown="ignore",
                                            sparse=False),
        "OneHotEncoderDense-1": OneHotEncoderRemoveOne(
            handle_unknown="ignore", sparse=False),
        "SimilarityEncoder": SimilarityEncoder(ngram_range=self.ngram_range,
                                               random_state=10),
        "NgramNaiveFisherKernel": NgramNaiveFisherKernel(
            ngram_range=self.ngram_range, random_state=10),
        "ngrams_hot_vectorizer": [],
        "NgramsCountVectorizer": CountVectorizer(
            analyzer="char", ngram_range=self.ngram_range),
        "NgramsTfIdfVectorizer": TfidfVectorizer(
            analyzer="char", ngram_range=self.ngram_range, smooth_idf=False),
        "WordNgramsTfIdfVectorizer": TfidfVectorizer(
            analyzer="word", ngram_range=(1, 1), smooth_idf=False),
        "TargetEncoder": TargetEncoder(clf_type=self.clf_type,
                                       handle_unknown="ignore"),
        "MDVEncoder": MDVEncoder(self.clf_type),
        "BackwardDifferenceEncoder": cat_enc.BackwardDifferenceEncoder(),
        "BinaryEncoder": cat_enc.BinaryEncoder(),
        "HashingEncoder": cat_enc.HashingEncoder(),
        "HelmertEncoder": cat_enc.HelmertEncoder(),
        "SumEncoder": cat_enc.SumEncoder(),
        "PolynomialEncoder": cat_enc.PolynomialEncoder(),
        "BaseNEncoder": cat_enc.BaseNEncoder(),
        "LeaveOneOutEncoder": cat_enc.LeaveOneOutEncoder(),
        "NgramsLDA": Pipeline([
            ("ngrams_count", CountVectorizer(analyzer="char",
                                             ngram_range=self.ngram_range)),
            ("LDA", LatentDirichletAllocation(n_components=self.n_components,
                                              learning_method="batch")),
        ]),
        "NMF": Pipeline([
            ("ngrams_count", CountVectorizer(analyzer="char",
                                             ngram_range=self.ngram_range)),
            ("NMF", NMF(n_components=self.n_components)),
        ]),
        "WordNMF": Pipeline([
            ("ngrams_count", CountVectorizer(analyzer="word",
                                             ngram_range=(1, 1))),
            ("NMF", NMF(n_components=self.n_components)),
        ]),
        "NgramsMultinomialMixture": NgramsMultinomialMixture(
            n_topics=self.n_components, max_iters=10),
        "AdHocNgramsMultinomialMixture": AdHocNgramsMultinomialMixture(
            n_iters=0),
        "AdHocIndependentPDF": AdHocIndependentPDF(),
        "OnlineGammaPoissonFactorization":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, rho=0.99, r=None, tol=1e-4,
                random_state=18, init="k-means++",
                ngram_range=self.ngram_range, rescale_W=True,
                max_iter_e_step=10),
        "OnlineGammaPoissonFactorization2":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=0.3, rho=None, batch_size=256,
                tol=1e-4, random_state=18, init="k-means++",
                ngram_range=self.ngram_range, rescale_W=True,
                max_iter_e_step=20),
        "OnlineGammaPoissonFactorization3":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=0.3, rho=None, batch_size=256,
                tol=1e-4, random_state=18, init="k-means",
                ngram_range=self.ngram_range, rescale_W=True,
                max_iter_e_step=20),
        "OnlineGammaPoissonFactorization4":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=None, rho=0.95, batch_size=256,
                tol=1e-4, random_state=18, init="k-means",
                ngram_range=self.ngram_range, rescale_W=True,
                max_iter_e_step=20),
        "WordOnlineGammaPoissonFactorization":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=0.3, tol=1e-4, random_state=18,
                init="k-means++", ngram_range=(1, 1), analizer="word",
                rescale_W=True, max_iter_e_step=10),
        "OnlineGammaPoissonFactorization_fast":
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=0.3, ngram_range=(3, 3),
                max_iter=1, min_iter=1, tol=1e-4, random_state=18,
                init="k-means++", rescale_W=False),
        "MinHashEncoder": MinHashEncoder(n_components=self.n_components),
        "PretrainedFastText": PretrainedFastText(
            n_components=self.n_components),
        "PretrainedFastText_fr": PretrainedFastText(
            n_components=self.n_components, language="french"),
        "PretrainedFastText_hu": PretrainedFastText(
            n_components=self.n_components, language="hungarian"),
        None: FunctionTransformer(None, validate=True),
        "Passthrough": PasstroughEncoder(),
    }
    # Encoders that expect a 1-D array of strings rather than a 2-D column
    self.list_1D_array_methods = [
        "NgramsCountVectorizer",
        "NgramsTfIdfVectorizer",
        "WordNgramsTfIdfVectorizer",
        "ngrams_hot_vectorizer",
        "NgramsLDA",
        "NMF",
        "WordNMF",
        "NgramsMultinomialMixture",
        "NgramsMultinomialMixtureKMeans2",
        "AdHocNgramsMultinomialMixture",
        "AdHocIndependentPDF",
        "GammaPoissonFactorization",
        "OnlineGammaPoissonFactorization",
        "WordOnlineGammaPoissonFactorization",
        "OnlineGammaPoissonFactorization2",
        "OnlineGammaPoissonFactorization3",
        "OnlineGammaPoissonFactorization4",
        "OnlineGammaPoissonFactorization_fast",
        "MinHashEncoder",
        "MinMeanMinHashEncoder",
    ]
string_columns = metadata_df.select_dtypes(include=['object']).columns
# [(col, metadata_df[col].map(type).unique()) for col in string_columns]
string_columns = string_columns.tolist()
string_columns.remove('data')

# Concatenate all string columns into a single text field per row
metadata_df['all'] = metadata_df[string_columns].apply(
    lambda row: row.str.cat(sep=' '), axis=1)

vectorizer = CountVectorizer()
corpus = [metadata_df.iloc[ii]['all'] for ii in range(metadata_df.shape[0])]
bag_of_words = vectorizer.fit_transform(corpus)
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx])
              for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
frequent_words = [w[0] for w in words_freq]
print(' '.join(frequent_words[:100]))

se = SimilarityEncoder(similarity='ngram', handle_unknown='ignore')
# Select the column as a 2-D frame, as the encoder expects
y = se.fit_transform(metadata_df[['name']])
# XXX: need more features than 1 ...

# look at the metadata
if True:
    metadata_df.to_csv('metadata.csv', encoding='utf-8')
def categorical_encoding(A, B, y_train, encoder, clf_type, n_jobs):
    '''Build the encoding matrix.

    Given two arrays of strings to compare and an encoder, return the
    corresponding encoding matrix of size len(A) x len(B).'''
    if encoder == 'levenshtein-ratio_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='levenshtein-ratio')
        encoder.fit(B)
        se = encoder.transform(A.reshape(-1, 1))
        return se
    if encoder == 'one-hot_encoding':
        return one_hot_encoding(A, B)
    if encoder == 'one-hot_encoding_sparse':
        return sparse.csr_matrix(one_hot_encoding_sparse(A, B))
    if encoder == 'jaccard_similarity':
        B = np.unique(B)
        warning = (('Warning: %s is not a well defined similarity '
                    'metric because two different values can have a '
                    'similarity of 1') % encoder)
        print(warning)
        unqA = np.unique(A)
        vlev = np.vectorize(dist.jaccard)
        # dvec = Parallel(n_jobs=n_jobs)(delayed(vlev)(a, B.reshape(1, -1))
        #                                for a in unqA)
        dvec = [vlev(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dms = (ddict[a] for a in A)
        dm = np.vstack(dms)
        return 1 - dm
    if encoder == 'sorensen_similarity':
        B = np.unique(B)
        unqA = np.unique(A)
        vlev = np.vectorize(dist.sorensen)
        # dvec = Parallel(n_jobs=n_jobs)(delayed(vlev)(a, B.reshape(1, -1))
        #                                for a in unqA)
        dvec = [vlev(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dms = (ddict[a] for a in A)
        dm = np.vstack(dms)
        return 1 - dm
    if encoder == 'jaro-winkler_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='jaro-winkler')
        encoder.fit(B)
        se = encoder.transform(A.reshape(-1, 1))
        return se
    if encoder[1:] == 'gram_SimilarityEncoder':
        n = int(encoder[0])
        B = np.unique(B).reshape(-1, 1)
        # Pass the requested n-gram size on to the encoder
        encoder = SimilarityEncoder(similarity='ngram', ngram_range=(n, n))
        encoder.fit(B)
        return encoder.transform(A.reshape(-1, 1))
    if encoder[1:] == 'gram_similarity2':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim2')
    if encoder[1:] == 'gram_presence_fisher_kernel':
        n = int(encoder[0])
        return ngram_similarity(A, B, n, sim_type='fisher_kernel')
    if encoder[1:] == 'gram_similarity2_1':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim2_1')
    if encoder[1:] == 'gram_similarity2_2':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim2_2')
    if encoder[1:] == 'gram_similarity3':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim3')
    if encoder[1:] == 'gram_similarity3_2':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim3_2')
    if encoder[1:] == 'gram_similarity4':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim4')
    if encoder[1:] == 'gram_similarity5':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim5')
    if encoder[1:] == 'gram_similarity6':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim6')
    if encoder[1:] == 'gram_similarity7':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim7')
    if encoder[1:] == 'grams_count_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tfidf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(n, n),
                                     smooth_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(n, n),
                                     smooth_idf=False, use_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_hot_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        count_matrix1 = vectorizer.transform(A)
        # Binarize the counts into presence/absence features
        return (count_matrix1 > 0).astype('float64')
    if encoder[1:] == 'grams_hot_vectorizer_tfidf':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        presenceB = (vectorizer.fit_transform(B) > 0).astype('float64')
        presenceA = (vectorizer.transform(A) > 0).astype('float64')
        transformer = TfidfTransformer(smooth_idf=True)
        transformer.fit(presenceB)
        return transformer.transform(presenceA)
    if encoder[1:] == 'grams_hashing':
        n = int(encoder[0])
        return ngrams_hashing_vectorizer(A, n, 10000)
    if encoder == 'TargetEncoder':
        encoder = TargetEncoder(clf_type=clf_type, handle_unknown='ignore')
        encoder.fit(B.reshape(-1, 1), y_train)
        return encoder.transform(A.reshape(-1, 1))
    if encoder == 'MDVEncoder':
        return mdv_encoding(A, B, y_train, clf_type)
    if encoder == 'BackwardDifferenceEncoder':
        encoder = ce.BackwardDifferenceEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'BinaryEncoder':
        encoder = ce.BinaryEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'HashingEncoder':
        encoder = ce.HashingEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'HelmertEncoder':
        encoder = ce.HelmertEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'OneHotEncoder':
        encoder = ce.OneHotEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'OrdinalEncoder':
        encoder = ce.OrdinalEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'SumEncoder':
        encoder = ce.SumEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'PolynomialEncoder':
        encoder = ce.PolynomialEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'BaseNEncoder':
        encoder = ce.BaseNEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'LeaveOneOutEncoder':
        encoder = ce.LeaveOneOutEncoder()
        encoder.fit(B, y_train)
        return encoder.transform(A)
    # No branch matched: fail loudly instead of returning a message string
    raise ValueError('Encoder %s has not been implemented yet.' % encoder)
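# A minimal usage sketch (hypothetical arrays; y_train is only required by
# the target-based encoders):
A = np.array(['midwest', 'mid west', 'south'])
B = np.array(['midwest', 'south', 'northeast'])
sim_matrix = categorical_encoding(A, B, y_train=None,
                                  encoder='3gram_SimilarityEncoder',
                                  clf_type=None, n_jobs=1)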