Ejemplo n.º 1
0
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

from dirty_cat import datasets
from dirty_cat import SimilarityEncoder, TargetEncoder


# Encoding methods, keyed by the short name used to select them below.
# NOTE(review): `FunctionTransformer` and `pd` are used in this script but are
# not imported in the visible part of the file -- presumably imported above
# this excerpt; confirm.
encoder_dict = {
    'one-hot': OneHotEncoder(handle_unknown='ignore'),
    'similarity': SimilarityEncoder(similarity='ngram',
                                    handle_unknown='ignore'),
    'target': TargetEncoder(handle_unknown='ignore'),
    # FunctionTransformer with func=None is scikit-learn's identity transform.
    # The 'num' key is not among the methods iterated over below.
    'num': FunctionTransformer(None)
    }

# The returned value is handed straight to pd.read_csv below, so it is
# expected to be a path or file-like object -- TODO confirm for the installed
# dirty_cat version.
data_file = datasets.fetch_employee_salaries()

# NOTE(review): `method` is not used in the visible part of the loop body; the
# loop appears to continue beyond this excerpt.
for method in ['one-hot', 'target', 'similarity']:
    # Load the data; it is re-read from `data_file` on every iteration and
    # every column is cast to str.
    df = pd.read_csv(data_file).astype(str)
    # Drop the first character of each salary string before converting it to
    # float (presumably a leading currency symbol -- verify against the data).
    df['Current Annual Salary'] = [float(s[1:]) for s
                                   in df['Current Annual Salary']]
    # Keep only the last '/'-separated field of the hiring date, as an int.
    df['Year First Hired'] = [int(s.split('/')[-1])
                              for s in df['Date First Hired']]

    # Regression target: the salary column as a flat array.
    target_column = 'Current Annual Salary'
    y = df[target_column].values.ravel()
Ejemplo n.º 2
0
 def __init__(
     self,
     encoder_name,
     reduction_method=None,
     ngram_range=(2, 4),
     categories="auto",
     dtype=np.float64,
     handle_unknown="ignore",
     clf_type=None,
     n_components=None,
 ):
     """Record the settings and pre-build one object per encoder name.

     Every argument is stored unchanged on the instance under its own
     name. ``encoders_dict`` then maps each supported encoder name (and
     ``None``) to a newly constructed object; a few entries are plain
     placeholders (``None`` or an empty list). ``list_1D_array_methods``
     is a fixed list of encoder names -- presumably the ones that consume
     a 1-D array of strings; confirm against the code that reads it.
     """
     self.encoder_name = encoder_name
     self.reduction_method = reduction_method
     self.ngram_range = ngram_range
     self.n_components = n_components
     self.categories = categories
     self.dtype = dtype
     self.handle_unknown = handle_unknown
     self.clf_type = clf_type

     ngrams = self.ngram_range
     n_comp = self.n_components
     gpf_cls = gamma_poisson_factorization.OnlineGammaPoissonFactorization

     def char_ngram_counts():
         # Each caller gets its own, independent vectorizer instance.
         return CountVectorizer(analyzer="char", ngram_range=ngrams)

     def gap(**settings):
         # Arguments common to every gamma-Poisson variant below.
         return gpf_cls(n_topics=n_comp,
                        tol=1e-4,
                        random_state=18,
                        **settings)

     # One-hot, similarity and text-vectorizer based encoders.
     encoders = {
         "OneHotEncoder": OneHotEncoder(handle_unknown="ignore"),
         "OneHotEncoder-1": OneHotEncoderRemoveOne(handle_unknown="ignore"),
         "Categorical": None,
         "OneHotEncoderDense": OneHotEncoder(handle_unknown="ignore",
                                             sparse=False),
         "OneHotEncoderDense-1": OneHotEncoderRemoveOne(
             handle_unknown="ignore", sparse=False),
         "SimilarityEncoder": SimilarityEncoder(ngram_range=ngrams,
                                                random_state=10),
         "NgramNaiveFisherKernel": NgramNaiveFisherKernel(
             ngram_range=ngrams, random_state=10),
         "ngrams_hot_vectorizer": [],
         "NgramsCountVectorizer": char_ngram_counts(),
         "NgramsTfIdfVectorizer": TfidfVectorizer(analyzer="char",
                                                  ngram_range=ngrams,
                                                  smooth_idf=False),
         "WordNgramsTfIdfVectorizer": TfidfVectorizer(analyzer="word",
                                                      ngram_range=(1, 1),
                                                      smooth_idf=False),
         "TargetEncoder": TargetEncoder(clf_type=self.clf_type,
                                        handle_unknown="ignore"),
         "MDVEncoder": MDVEncoder(self.clf_type),
     }

     # Encoders taken from ``cat_enc`` with their default arguments; the
     # dictionary key is the class name itself.
     for class_name in (
             "BackwardDifferenceEncoder",
             "BinaryEncoder",
             "HashingEncoder",
             "HelmertEncoder",
             "SumEncoder",
             "PolynomialEncoder",
             "BaseNEncoder",
             "LeaveOneOutEncoder",
     ):
         encoders[class_name] = getattr(cat_enc, class_name)()

     # Count-then-factorize pipelines and mixture models.
     encoders.update({
         "NgramsLDA": Pipeline([
             ("ngrams_count", char_ngram_counts()),
             ("LDA", LatentDirichletAllocation(n_components=n_comp,
                                               learning_method="batch")),
         ]),
         "NMF": Pipeline([
             ("ngrams_count", char_ngram_counts()),
             ("NMF", NMF(n_components=n_comp)),
         ]),
         "WordNMF": Pipeline([
             ("ngrams_count",
              CountVectorizer(analyzer="word", ngram_range=(1, 1))),
             ("NMF", NMF(n_components=n_comp)),
         ]),
         "NgramsMultinomialMixture": NgramsMultinomialMixture(
             n_topics=n_comp, max_iters=10),
         "AdHocNgramsMultinomialMixture": AdHocNgramsMultinomialMixture(
             n_iters=0),
         "AdHocIndependentPDF": AdHocIndependentPDF(),
     })

     # Online gamma-Poisson factorization variants.
     encoders.update({
         "OnlineGammaPoissonFactorization": gap(
             rho=0.99,
             r=None,
             init="k-means++",
             ngram_range=ngrams,
             rescale_W=True,
             max_iter_e_step=10,
         ),
         "OnlineGammaPoissonFactorization2": gap(
             r=0.3,
             rho=None,
             batch_size=256,
             init="k-means++",
             ngram_range=ngrams,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "OnlineGammaPoissonFactorization3": gap(
             r=0.3,
             rho=None,
             batch_size=256,
             init="k-means",
             ngram_range=ngrams,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "OnlineGammaPoissonFactorization4": gap(
             r=None,
             rho=0.95,
             batch_size=256,
             init="k-means",
             ngram_range=ngrams,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "WordOnlineGammaPoissonFactorization": gap(
             r=0.3,
             init="k-means++",
             ngram_range=(1, 1),
             analizer="word",
             rescale_W=True,
             max_iter_e_step=10,
         ),
         "OnlineGammaPoissonFactorization_fast": gap(
             r=0.3,
             ngram_range=(3, 3),
             max_iter=1,
             min_iter=1,
             init="k-means++",
             rescale_W=False,
         ),
     })

     # Hashing / pretrained embeddings and the pass-through options.
     encoders.update({
         "MinHashEncoder": MinHashEncoder(n_components=n_comp),
         "PretrainedFastText": PretrainedFastText(n_components=n_comp),
         "PretrainedFastText_fr": PretrainedFastText(n_components=n_comp,
                                                     language="french"),
         "PretrainedFastText_hu": PretrainedFastText(n_components=n_comp,
                                                     language="hungarian"),
         None: FunctionTransformer(None, validate=True),
         "Passthrough": PasstroughEncoder(),
     })
     self.encoders_dict = encoders

     self.list_1D_array_methods = [
         "NgramsCountVectorizer",
         "NgramsTfIdfVectorizer",
         "WordNgramsTfIdfVectorizer",
         "ngrams_hot_vectorizer",
         "NgramsLDA",
         "NMF",
         "WordNMF",
         "NgramsMultinomialMixture",
         "NgramsMultinomialMixtureKMeans2",
         "AdHocNgramsMultinomialMixture",
         "AdHocIndependentPDF",
         "GammaPoissonFactorization",
         "OnlineGammaPoissonFactorization",
         "WordOnlineGammaPoissonFactorization",
         "OnlineGammaPoissonFactorization2",
         "OnlineGammaPoissonFactorization3",
         "OnlineGammaPoissonFactorization4",
         "OnlineGammaPoissonFactorization_fast",
         "MinHashEncoder",
         "MinMeanMinHashEncoder",
     ]
 def __init__(self,
              encoder_name,
              reduction_method=None,
              ngram_range=(2, 4),
              categories='auto',
              dtype=np.float64,
              handle_unknown='ignore',
              clf_type=None,
              n_components=None):
     """Save the configuration and instantiate every available encoder.

     All arguments are kept as same-named attributes. ``encoders_dict``
     associates each supported encoder name (and ``None``) with a freshly
     built object, some entries being simple placeholders (``None`` or an
     empty list). ``list_1D_array_methods`` is a constant list of encoder
     names -- presumably those fed with a 1-D array of strings; verify
     against the code that consumes it.
     """
     self.encoder_name = encoder_name
     self.reduction_method = reduction_method
     self.ngram_range = ngram_range
     self.n_components = n_components
     self.categories = categories
     self.dtype = dtype
     self.handle_unknown = handle_unknown
     self.clf_type = clf_type

     ngrams = self.ngram_range
     n_comp = self.n_components
     gpf_cls = gamma_poisson_factorization.OnlineGammaPoissonFactorization

     def char_ngram_counts():
         # A new vectorizer per call so no two entries share an instance.
         return CountVectorizer(analyzer='char', ngram_range=ngrams)

     def gap(**settings):
         # Arguments shared by all gamma-Poisson factorization variants.
         return gpf_cls(n_topics=n_comp,
                        tol=1e-4,
                        random_state=18,
                        **settings)

     encoders = {}

     # One-hot, similarity and text-vectorizer based encoders.
     encoders['OneHotEncoder'] = OneHotEncoder(handle_unknown='ignore')
     encoders['OneHotEncoder-1'] = OneHotEncoderRemoveOne(
         handle_unknown='ignore')
     encoders['Categorical'] = None
     encoders['OneHotEncoderDense'] = OneHotEncoder(
         handle_unknown='ignore', sparse=False)
     encoders['OneHotEncoderDense-1'] = OneHotEncoderRemoveOne(
         handle_unknown='ignore', sparse=False)
     encoders['SimilarityEncoder'] = SimilarityEncoder(
         ngram_range=ngrams, random_state=10)
     encoders['NgramNaiveFisherKernel'] = NgramNaiveFisherKernel(
         ngram_range=ngrams, random_state=10)
     encoders['ngrams_hot_vectorizer'] = []
     encoders['NgramsCountVectorizer'] = char_ngram_counts()
     encoders['NgramsTfIdfVectorizer'] = TfidfVectorizer(
         analyzer='char', ngram_range=ngrams, smooth_idf=False)
     encoders['WordNgramsTfIdfVectorizer'] = TfidfVectorizer(
         analyzer='word', ngram_range=(1, 1), smooth_idf=False)
     encoders['TargetEncoder'] = TargetEncoder(
         clf_type=self.clf_type, handle_unknown='ignore')
     encoders['MDVEncoder'] = MDVEncoder(self.clf_type)

     # Encoders from ``cat_enc`` built with default arguments; each key
     # equals the class name.
     for class_name in ('BackwardDifferenceEncoder',
                        'BinaryEncoder',
                        'HashingEncoder',
                        'HelmertEncoder',
                        'SumEncoder',
                        'PolynomialEncoder',
                        'BaseNEncoder',
                        'LeaveOneOutEncoder'):
         encoders[class_name] = getattr(cat_enc, class_name)()

     # Count-then-factorize pipelines and mixture models.
     encoders['NgramsLDA'] = Pipeline([
         ('ngrams_count', char_ngram_counts()),
         ('LDA', LatentDirichletAllocation(n_components=n_comp,
                                           learning_method='batch')),
     ])
     encoders['NMF'] = Pipeline([
         ('ngrams_count', char_ngram_counts()),
         ('NMF', NMF(n_components=n_comp)),
     ])
     encoders['WordNMF'] = Pipeline([
         ('ngrams_count',
          CountVectorizer(analyzer='word', ngram_range=(1, 1))),
         ('NMF', NMF(n_components=n_comp)),
     ])
     encoders['NgramsMultinomialMixture'] = NgramsMultinomialMixture(
         n_topics=n_comp, max_iters=10)
     encoders['AdHocNgramsMultinomialMixture'] = (
         AdHocNgramsMultinomialMixture(n_iters=0))
     encoders['AdHocIndependentPDF'] = AdHocIndependentPDF()

     # Online gamma-Poisson factorization variants.
     encoders['OnlineGammaPoissonFactorization'] = gap(
         rho=.99,
         r=None,
         init='k-means++',
         ngram_range=ngrams,
         rescale_W=True,
         max_iter_e_step=10)
     encoders['OnlineGammaPoissonFactorization2'] = gap(
         r=.3,
         rho=None,
         batch_size=256,
         init='k-means++',
         ngram_range=ngrams,
         rescale_W=True,
         max_iter_e_step=20)
     encoders['OnlineGammaPoissonFactorization3'] = gap(
         r=.3,
         rho=None,
         batch_size=256,
         init='k-means',
         ngram_range=ngrams,
         rescale_W=True,
         max_iter_e_step=20)
     encoders['OnlineGammaPoissonFactorization4'] = gap(
         r=None,
         rho=.95,
         batch_size=256,
         init='k-means',
         ngram_range=ngrams,
         rescale_W=True,
         max_iter_e_step=20)
     encoders['WordOnlineGammaPoissonFactorization'] = gap(
         r=.3,
         init='k-means++',
         ngram_range=(1, 1),
         analizer='word',
         rescale_W=True,
         max_iter_e_step=10)
     encoders['OnlineGammaPoissonFactorization_fast'] = gap(
         r=.3,
         ngram_range=(3, 3),
         max_iter=1,
         min_iter=1,
         init='k-means++',
         rescale_W=False)

     # Hashing / pretrained embeddings and the pass-through options.
     encoders['MinHashEncoder'] = MinHashEncoder(n_components=n_comp)
     encoders['PretrainedFastText'] = PretrainedFastText(
         n_components=n_comp)
     encoders['PretrainedFastText_fr'] = PretrainedFastText(
         n_components=n_comp, language='french')
     encoders['PretrainedFastText_hu'] = PretrainedFastText(
         n_components=n_comp, language='hungarian')
     encoders[None] = FunctionTransformer(None, validate=True)
     encoders['Passthrough'] = PasstroughEncoder()

     self.encoders_dict = encoders

     self.list_1D_array_methods = [
         'NgramsCountVectorizer',
         'NgramsTfIdfVectorizer',
         'WordNgramsTfIdfVectorizer',
         'ngrams_hot_vectorizer',
         'NgramsLDA',
         'NMF',
         'WordNMF',
         'NgramsMultinomialMixture',
         'NgramsMultinomialMixtureKMeans2',
         'AdHocNgramsMultinomialMixture',
         'AdHocIndependentPDF',
         'GammaPoissonFactorization',
         'OnlineGammaPoissonFactorization',
         'WordOnlineGammaPoissonFactorization',
         'OnlineGammaPoissonFactorization2',
         'OnlineGammaPoissonFactorization3',
         'OnlineGammaPoissonFactorization4',
         'OnlineGammaPoissonFactorization_fast',
         'MinHashEncoder',
         'MinMeanMinHashEncoder',
     ]
Ejemplo n.º 4
0
# Exact encoder names served by SimilarityEncoder, mapped to the string
# metric passed as its ``similarity`` argument.
_STRING_METRIC_ENCODERS = {
    'levenshtein-ratio_SimilarityEncoder': 'levenshtein-ratio',
    'jaro-winkler_SimilarityEncoder': 'jaro-winkler',
}

# '<n>gram_similarity*' suffixes mapped to ngram_similarity's ``sim_type``.
_NGRAM_SIM_TYPES = {
    'gram_similarity2': 'sim2',
    'gram_similarity2_1': 'sim2_1',
    'gram_similarity2_2': 'sim2_2',
    'gram_similarity3': 'sim3',
    'gram_similarity3_2': 'sim3_2',
    'gram_similarity4': 'sim4',
    'gram_similarity5': 'sim5',
    'gram_similarity6': 'sim6',
    'gram_similarity7': 'sim7',
}

# '<n>grams_*' suffixes that are built on a text vectorizer.
_NGRAM_VECTORIZER_FAMILIES = (
    'grams_count_vectorizer',
    'grams_tfidf_vectorizer',
    'grams_tf_vectorizer',
    'grams_hot_vectorizer',
    'grams_hot_vectorizer_tfidf',
)

# Every suffix that may follow the leading n-gram digit of an encoder name.
_NGRAM_FAMILIES = (
    frozenset(_NGRAM_SIM_TYPES)
    | frozenset(_NGRAM_VECTORIZER_FAMILIES)
    | frozenset(['gram_SimilarityEncoder',
                 'gram_presence_fisher_kernel',
                 'grams_hashing'])
)

# ``ce`` encoder classes that are fitted on B alone (no target passed).
_CE_UNSUPERVISED_ENCODERS = frozenset([
    'BackwardDifferenceEncoder',
    'BinaryEncoder',
    'HashingEncoder',
    'HelmertEncoder',
    'OneHotEncoder',
    'OrdinalEncoder',
    'SumEncoder',
    'PolynomialEncoder',
    'BaseNEncoder',
])


def _unknown_encoder_message(encoder):
    """Return the 'not implemented' message reported for ``encoder``."""
    return 'Encoder %s has not been implemented yet.' % encoder


def _similarity_from_distance(A, unique_B, distance):
    """Return ``1 - distance(a, b)`` for each a in A and b in unique_B."""
    targets = unique_B.reshape(1, -1)
    pairwise = np.vectorize(distance)
    # Compute one row per distinct value of A and reuse it for repeats.
    rows = {a: pairwise(a, targets) for a in np.unique(A)}
    # np.vstack needs a real sequence: handing it a generator has been
    # deprecated since NumPy 1.16 and is rejected by recent releases.
    return 1 - np.vstack([rows[a] for a in A])


def _ngram_encoding(A, B, n, family):
    """Encode A for one of the '<n>gram...' encoder families."""
    if family == 'gram_SimilarityEncoder':
        # NOTE(review): n is parsed from the name but not forwarded; the
        # encoder runs with its default settings, as it always did here.
        model = SimilarityEncoder()
        model.fit(np.unique(B).reshape(-1, 1))
        return model.transform(A.reshape(-1, 1))
    if family == 'gram_presence_fisher_kernel':
        # Unlike the other similarities, B is passed without deduplication.
        return ngram_similarity(A, B, n, sim_type='fisher_kernel')
    if family in _NGRAM_SIM_TYPES:
        return ngram_similarity(A, np.unique(B), n,
                                sim_type=_NGRAM_SIM_TYPES[family])
    if family == 'grams_hashing':
        return ngrams_hashing_vectorizer(A, n, 10000)

    # Only the vectorizer families remain; they are fitted on unique B.
    B = np.unique(B)
    if family == 'grams_tfidf_vectorizer':
        vectorizer = TfidfVectorizer(analyzer='char',
                                     ngram_range=(n, n),
                                     smooth_idf=False)
    elif family == 'grams_tf_vectorizer':
        vectorizer = TfidfVectorizer(analyzer='char',
                                     ngram_range=(n, n),
                                     smooth_idf=False,
                                     use_idf=False)
    else:
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))

    if family == 'grams_hot_vectorizer_tfidf':
        # tf-idf weighting applied to n-gram presence (0/1), not counts.
        presence_b = (vectorizer.fit_transform(B) > 0).astype('float64')
        presence_a = (vectorizer.transform(A) > 0).astype('float64')
        weighting = TfidfTransformer(smooth_idf=True)
        weighting.fit(presence_b)
        return weighting.transform(presence_a)

    vectorizer.fit(B)
    encoded = vectorizer.transform(A)
    if family == 'grams_hot_vectorizer':
        return (encoded > 0).astype('float64')
    return encoded


def categorical_encoding(A, B, y_train, encoder, clf_type, n_jobs):
    """Encode the values of ``A`` with the named encoder fitted on ``B``.

    Parameters
    ----------
    A : array of str
        Values to encode. Several branches call ``A.reshape``, so a NumPy
        array is expected.
    B : array of str
        Reference values the encoder is fitted on; most branches first
        reduce it with ``np.unique``.
    y_train : array-like
        Target passed to ``fit`` together with B. Only used by
        'TargetEncoder', 'MDVEncoder' and 'LeaveOneOutEncoder'.
    encoder : str
        Encoder name. N-gram based encoders are written as a single digit
        (the n-gram size) followed by the family name, e.g.
        '3grams_count_vectorizer'.
    clf_type
        Forwarded to ``TargetEncoder`` and ``mdv_encoding``.
    n_jobs
        Unused; kept so existing callers keep working.

    Returns
    -------
    The encoded representation of A; its type depends on the encoder. If
    ``encoder`` is not a recognised name (including non-string values such
    as ``None``), the string 'Encoder <name> has not been implemented yet.'
    is returned instead.

    Raises
    ------
    ValueError
        If the name ends with an n-gram family but does not start with a
        digit.
    """
    if not isinstance(encoder, str):
        # Previously e.g. None crashed on ``encoder[1:]``; report it like
        # any other unknown encoder.
        return _unknown_encoder_message(encoder)

    metric = _STRING_METRIC_ENCODERS.get(encoder)
    if metric is not None:
        model = SimilarityEncoder(similarity=metric)
        model.fit(np.unique(B).reshape(-1, 1))
        return model.transform(A.reshape(-1, 1))
    if encoder == 'one-hot_encoding':
        return one_hot_encoding(A, B)
    if encoder == 'one-hot_encoding_sparse':
        return sparse.csr_matrix(one_hot_encoding_sparse(A, B))
    if encoder == 'jaccard_similarity':
        B = np.unique(B)
        print(('Warning: %s is not a well defined similarity ' +
               'metric because two different values can have a ' +
               'similarity of 1') % encoder)
        return _similarity_from_distance(A, B, dist.jaccard)
    if encoder == 'sorensen_similarity':
        return _similarity_from_distance(A, np.unique(B), dist.sorensen)
    if encoder == 'TargetEncoder':
        model = TargetEncoder(clf_type=clf_type, handle_unknown='ignore')
        model.fit(B.reshape(-1, 1), y_train)
        return model.transform(A.reshape(-1, 1))
    if encoder == 'MDVEncoder':
        return mdv_encoding(A, B, y_train, clf_type)
    if encoder in _CE_UNSUPERVISED_ENCODERS:
        # The encoder name is also the class name inside ``ce``.
        model = getattr(ce, encoder)()
        model.fit(B)
        return model.transform(A)
    if encoder == 'LeaveOneOutEncoder':
        model = ce.LeaveOneOutEncoder()
        model.fit(B, y_train)
        return model.transform(A)

    family = encoder[1:]
    if family in _NGRAM_FAMILIES:
        n = int(encoder[0])
        return _ngram_encoding(A, B, n, family)

    # NOTE(review): unknown encoders yield a message string rather than an
    # exception; kept as is because callers may rely on it.
    return _unknown_encoder_message(encoder)