Example #1
def preprocess(df, encode, categorize, preran):
    y = df["area"]
    X = df.drop(
        ["area"],
        axis=1,
    )

    X.info()

    if encode:
        encode_columns = []  # names of the string columns to similarity-encode (left empty in this example)
        n_prototypes = 5
        if not preran:
            enc = SimilarityEncoder(similarity="ngram",
                                    categories="k-means",
                                    n_prototypes=n_prototypes)
            enc.fit(X[encode_columns].values)
            pd.to_pickle(enc, "encoders/similarity_encoder.pickle")
        else:
            enc = pd.read_pickle("encoders/similarity_encoder.pickle")
        transformed_values = enc.transform(X[encode_columns].values)

        transformed_values = pd.DataFrame(transformed_values, index=X.index)
        transformed_columns = []
        for col in encode_columns:
            for i in range(0, n_prototypes):
                transformed_columns.append(col + "_" + str(i))
        transformed_values.columns = transformed_columns
        X = pd.concat([X, transformed_values], axis=1)
        X = X.drop(encode_columns, axis=1)

    if categorize:
        obj_cols = X.select_dtypes("object").columns
        X[obj_cols] = X[obj_cols].astype("category")
    return X, y
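# Hedged usage sketch (not part of the original snippet): a tiny DataFrame with an
# "area" target column; the other column names are illustrative assumptions only.
import pandas as pd

demo_df = pd.DataFrame({
    "area": [0.0, 1.2, 3.4],
    "month": ["mar", "aug", "sep"],   # object column, later cast to category
    "temp": [8.2, 22.1, 18.0],
})
X_demo, y_demo = preprocess(demo_df, encode=False, categorize=True, preran=False)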
Example #2
def prune_characters(char_occ_dict, threshold=0.1):
    import numpy as np
    from dirty_cat import SimilarityEncoder
    from sklearn.preprocessing import minmax_scale
    from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
    from scipy.spatial.distance import squareform
    simenc = SimilarityEncoder(similarity='jaro-winkler')
    transf = simenc.fit_transform(np.array(sorted(char_occ_dict.keys())).reshape(-1, 1))
    corr_dist = minmax_scale(-transf)
    dense_distance = squareform(corr_dist, checks=False)
    Z = linkage(dense_distance, 'average', optimal_ordering=True)
    return get_merged_characters(Z, char_occ_dict, threshold=threshold)
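# Hedged usage sketch (not in the original snippet): char_occ_dict maps character
# names to occurrence counts; names with similar spellings end up in the same
# cluster and can be merged by the (external) get_merged_characters helper.
# merged = prune_characters({'Gandalf': 120, 'Gandolf': 3, 'Frodo': 200}, threshold=0.1)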
Example #3
    def fit(self,
            df,
            similarity="ngram",
            categories="most_frequent",
            n_prototypes=100):
        # Initialize the similarity encoder
        self.similarity_encoder = SimilarityEncoder(similarity=similarity,
                                                    dtype=np.float32,
                                                    categories=categories,
                                                    n_prototypes=n_prototypes,
                                                    random_state=1006)

        # Fit the similarity encoder
        self.similarity_encoder.fit(df[self.col_name].values.reshape(-1, 1))
Example #4
def benchmark(strat='k-means',
              limit=50000,
              n_proto=100,
              hash_dim=None,
              ngram_range=(3, 3)):
    df = dfr[:limit].copy()
    df = df.dropna(axis=0)
    df = df.reset_index()

    y = df['Violation Type']

    if strat == 'k-means':
        sim_enc = SimilarityEncoder(similarity='ngram',
                                    ngram_range=ngram_range,
                                    categories='k-means',
                                    hashing_dim=hash_dim,
                                    n_prototypes=n_proto,
                                    random_state=3498)
    else:
        sim_enc = SimilarityEncoder(similarity='ngram',
                                    ngram_range=ngram_range,
                                    categories='most_frequent',
                                    hashing_dim=hash_dim,
                                    n_prototypes=n_proto,
                                    random_state=3498)

    column_trans = ColumnTransformer(transformers=transformers +
                                     [('sim_enc', sim_enc, ['Description'])],
                                     remainder='drop')

    t0 = time()
    X = column_trans.fit_transform(df)
    t1 = time()
    t_score_1 = t1 - t0

    model = pipeline.Pipeline([('logistic', linear_model.LogisticRegression())])

    t0 = time()
    m_score = model_selection.cross_val_score(model, X, y, cv=20)
    t1 = time()
    t_score_2 = t1 - t0
    return t_score_1, m_score, t_score_2
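# Hedged usage sketch (not in the original snippet): compare the two prototype-
# selection strategies; results depend on the globals (dfr, transformers) defined
# elsewhere in the original script.
# t_vec, scores, t_cv = benchmark(strat='k-means', limit=10000, n_proto=50)
# print('vectorize: %.1fs, mean CV accuracy: %.3f' % (t_vec, scores.mean()))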
Example #5
class FitSimilarityEncoder:
    def __init__(self, col_name):
        self.col_name = col_name

    def fit(self,
            df,
            similarity="ngram",
            categories="most_frequent",
            n_prototypes=100):
        # Initialize the similarity encoder
        self.similarity_encoder = SimilarityEncoder(similarity=similarity,
                                                    dtype=np.float32,
                                                    categories=categories,
                                                    n_prototypes=n_prototypes,
                                                    random_state=1006)

        # Fit the similarity encoder
        self.similarity_encoder.fit(df[self.col_name].values.reshape(-1, 1))

    def transform(self, df):
        return self.similarity_encoder.transform(
            df[self.col_name].values.reshape(-1, 1))
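# Hedged usage sketch (not in the original snippet): fit the wrapper on a training
# frame and reuse it on a validation frame; the DataFrame and column names are
# assumptions for illustration.
# fse = FitSimilarityEncoder(col_name="Description")
# fse.fit(train_df, similarity="ngram", categories="most_frequent", n_prototypes=50)
# X_train_desc = fse.transform(train_df)
# X_valid_desc = fse.transform(valid_df)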
Example #6
def similarity_encode(X, encode_columns, n_prototypes, train, drop_original):
    X = X.copy()
    if train:
        enc = SimilarityEncoder(similarity="ngram",
                                categories="k-means",
                                n_prototypes=n_prototypes)
        enc.fit(X[encode_columns].values)
        Path("encoders").mkdir(exist_ok=True)
        pd.to_pickle(enc, "encoders/similarity_encoder.pickle")
    else:
        enc = pd.read_pickle("encoders/similarity_encoder.pickle")
    transformed_values = enc.transform(X[encode_columns].values)

    transformed_values = pd.DataFrame(transformed_values, index=X.index)
    transformed_columns = []
    for col in encode_columns:
        for i in range(0, n_prototypes):
            transformed_columns.append(col + "_" + str(i))
    transformed_values.columns = transformed_columns
    X = pd.concat([X, transformed_values], axis=1)
    if drop_original:
        X = X.drop(encode_columns, axis=1)
    return X
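# Hedged usage sketch (not in the original snippet): encode the same text columns
# consistently on train and test data, re-using the pickled encoder for the test
# split; the column names are assumptions for illustration.
# X_train = similarity_encode(X_train, ["Position", "ManagerName"], n_prototypes=4,
#                             train=True, drop_original=True)
# X_test = similarity_encode(X_test, ["Position", "ManagerName"], n_prototypes=4,
#                            train=False, drop_original=True)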
Example #7
###############################################################################
# As we will see, SimilarityEncoder takes a while on such data.


###############################################################################
# SimilarityEncoder with default options
# --------------------------------------
#
# Let us build our vectorizer, using a ColumnTransformer to combine
# one-hot encoding and similarity encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from dirty_cat import SimilarityEncoder

sim_enc = SimilarityEncoder(similarity='ngram')

transformers = [
    ('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore'), clean_columns),
]

column_trans = ColumnTransformer(
    transformers=transformers + [('sim_enc', sim_enc, dirty_columns)],
    remainder='drop')

t0 = time()
X = column_trans.fit_transform(df)
t1 = time()
print('Time to vectorize: %s' % (t1 - t0))

###############################################################################
Example #8
        "EmploymentStatus",
        "DateofTermination",
        "LastPerformanceReview_Date",
        "EmpStatusID",
        "TermReason",
    ],
    axis=1,
)

X.info()
date_cols = X.select_dtypes("datetime")
for col in date_cols:
    X = encode_dates(X, col)

encode_columns = ["Employee_Name", "Position", "ManagerName"]
enc = SimilarityEncoder(similarity="ngram", categories="k-means", n_prototypes=4)
for col in encode_columns:
    transformed_values = enc.fit_transform(X[col].values.reshape(-1, 1))
    transformed_values = pd.DataFrame(transformed_values, index=X.index)
    transformed_values.columns = [f"{col}_" + str(num) for num in transformed_values]
    X = pd.concat([X, transformed_values], axis=1)
    X = X.drop(col, axis=1)

obj_cols = X.select_dtypes("object").columns
X[obj_cols] = X[obj_cols].astype("category")


SEED = 0
SAMPLE_SIZE = 5000

Xt, Xv, yt, yv = train_test_split(
Example #9
columns_names = df.columns

###############################################################################
# Estimators construction
# -----------------------
# Our input is categorical, thus needs to be encoded. As observations often
# consist of variations around a few concepts (for instance,
# :code:`'Amlodipine Besylate'` and
# :code:`'Amlodipine besylate and atorvastatin calcium'`
# have one ingredient in common), we need an encoding able to
# capture similarities between observations.

from dirty_cat import SimilarityEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
similarity_encoder = SimilarityEncoder(similarity='ngram')

###############################################################################
# Two other columns are used to predict the output: ``DOSAGEFORMNAME`` and
# ``ROUTENAME``. They are both categorical and can be encoded with a
# |OneHotEncoder|. We use a |ColumnTransformer| to stack the |OneHotEncoder|
# and the |SE|.  We can now choose a kernel method, for instance a |SVC|, to
# fit the encoded inputs.
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

column_transformer = make_column_transformer(
    (similarity_encoder, ['NONPROPRIETARYNAME']),
    (OneHotEncoder(handle_unknown='ignore'), ['DOSAGEFORMNAME', 'ROUTENAME']),
    sparse_threshold=1)
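###############################################################################
# Hedged continuation sketch (the original snippet is truncated here): chain the
# column transformer with an SVC, as described above. The kernel and parameters
# are assumptions, not taken from the original example.
model = Pipeline([
    ('transformer', column_transformer),
    ('classifier', SVC(kernel='rbf', gamma='scale')),
])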
Example #10
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

from dirty_cat import datasets
from dirty_cat import SimilarityEncoder, TargetEncoder


# encoding methods
encoder_dict = {
    'one-hot': OneHotEncoder(handle_unknown='ignore'),
    'similarity': SimilarityEncoder(similarity='ngram',
                                    handle_unknown='ignore'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'num': FunctionTransformer(None)
    }

data_file = datasets.fetch_employee_salaries()

for method in ['one-hot', 'target', 'similarity']:
    # Load the data
    df = pd.read_csv(data_file).astype(str)
    df['Current Annual Salary'] = [float(s[1:]) for s
                                   in df['Current Annual Salary']]
    df['Year First Hired'] = [int(s.split('/')[-1])
                              for s in df['Date First Hired']]

    target_column = 'Current Annual Salary'
Example #11
# -----------------------
#
# The one-hot encoder is actually not well suited to the 'Employee
# Position Title' column, as this column contains 400 different entries:
import numpy as np
np.unique(y)

# %%
# We will now experiment with encoders specially made for handling
# dirty columns
from dirty_cat import SimilarityEncoder, TargetEncoder, MinHashEncoder,\
    GapEncoder

encoders = {
    'one-hot': one_hot,
    'similarity': SimilarityEncoder(similarity='ngram'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'minhash': MinHashEncoder(n_components=100),
    'gap': GapEncoder(n_components=100),
}

# %%
# We now loop over the different encoding methods,
# instantiate a new |Pipeline| each time, fit it
# and store the returned cross-validation score:

from sklearn.model_selection import cross_val_score

all_scores = dict()

for name, method in encoders.items():
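    # Hedged sketch of the loop body (the original snippet is truncated here); the
    # estimator choice and the X / y variables are assumptions, not from the original.
    # pipe = make_pipeline(method, RidgeCV())
    # all_scores[name] = cross_val_score(pipe, X, y)
    pass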
Example #12
# employee's position title
# data
values = data[['Employee Position Title', 'Gender', 'Current Annual Salary']]

#########################################################################
# String similarity between entries
# -------------------------------------------------
#
# That's where our encoders come into play. In order to robustly
# embed dirty semantic data, the SimilarityEncoder creates a similarity
# matrix based on the 3-gram structure of the data.
sorted_values = values['Employee Position Title'].sort_values().unique()

from dirty_cat import SimilarityEncoder

similarity_encoder = SimilarityEncoder(similarity='ngram')
transformed_values = similarity_encoder.fit_transform(
    sorted_values.reshape(-1, 1))

#########################################################################
# Plotting the new representation using multi-dimensional scaling
# ................................................................
#
# Let's now plot a few points at random using a low-dimensional representation
# to get an intuition of what the similarity encoder is doing:
from sklearn.manifold import MDS

mds = MDS(dissimilarity='precomputed', n_init=10, random_state=42)
# Transformed values lie in the 0-1 range, so 1 - transformed_values yields a
# positive dissimilarity matrix.
two_dim_data = mds.fit_transform(1 - transformed_values)
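
###############################################################################
# Hedged plotting sketch (the original snippet stops before the plot): scatter a
# random subset of the MDS embedding and label each point with its category.
import matplotlib.pyplot as plt
import numpy as np

rng = np.random.RandomState(42)
indices = rng.choice(len(sorted_values), size=min(50, len(sorted_values)), replace=False)
plt.figure(figsize=(8, 6))
plt.scatter(two_dim_data[indices, 0], two_dim_data[indices, 1], s=10)
for i in indices:
    plt.annotate(sorted_values[i], two_dim_data[i], fontsize=6)
plt.show()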
Example #13
 def __init__(self,
              encoder_name,
              reduction_method=None,
              ngram_range=(2, 4),
              categories='auto',
              dtype=np.float64,
              handle_unknown='ignore',
              clf_type=None,
              n_components=None):
     self.ngram_range = ngram_range
     self.encoder_name = encoder_name
     self.categories = categories
     self.dtype = dtype
     self.clf_type = clf_type
     self.handle_unknown = handle_unknown
     self.reduction_method = reduction_method
     self.n_components = n_components
     self.encoders_dict = {
         'OneHotEncoder':
         OneHotEncoder(handle_unknown='ignore'),
         'OneHotEncoder-1':
         OneHotEncoderRemoveOne(handle_unknown='ignore'),
         'Categorical':
         None,
         'OneHotEncoderDense':
         OneHotEncoder(handle_unknown='ignore', sparse=False),
         'OneHotEncoderDense-1':
         OneHotEncoderRemoveOne(handle_unknown='ignore', sparse=False),
         'SimilarityEncoder':
         SimilarityEncoder(ngram_range=self.ngram_range, random_state=10),
         'NgramNaiveFisherKernel':
         NgramNaiveFisherKernel(ngram_range=self.ngram_range,
                                random_state=10),
         'ngrams_hot_vectorizer': [],
         'NgramsCountVectorizer':
         CountVectorizer(analyzer='char', ngram_range=self.ngram_range),
         'NgramsTfIdfVectorizer':
         TfidfVectorizer(analyzer='char',
                         ngram_range=self.ngram_range,
                         smooth_idf=False),
         'WordNgramsTfIdfVectorizer':
         TfidfVectorizer(analyzer='word',
                         ngram_range=(1, 1),
                         smooth_idf=False),
         'TargetEncoder':
         TargetEncoder(clf_type=self.clf_type, handle_unknown='ignore'),
         'MDVEncoder':
         MDVEncoder(self.clf_type),
         'BackwardDifferenceEncoder':
         cat_enc.BackwardDifferenceEncoder(),
         'BinaryEncoder':
         cat_enc.BinaryEncoder(),
         'HashingEncoder':
         cat_enc.HashingEncoder(),
         'HelmertEncoder':
         cat_enc.HelmertEncoder(),
         'SumEncoder':
         cat_enc.SumEncoder(),
         'PolynomialEncoder':
         cat_enc.PolynomialEncoder(),
         'BaseNEncoder':
         cat_enc.BaseNEncoder(),
         'LeaveOneOutEncoder':
         cat_enc.LeaveOneOutEncoder(),
         'NgramsLDA':
         Pipeline([
             ('ngrams_count',
              CountVectorizer(analyzer='char',
                              ngram_range=self.ngram_range)),
             (
                 'LDA',
                 LatentDirichletAllocation(n_components=self.n_components,
                                           learning_method='batch'),
             )
         ]),
         'NMF':
         Pipeline([('ngrams_count',
                    CountVectorizer(analyzer='char',
                                    ngram_range=self.ngram_range)),
                   ('NMF', NMF(n_components=self.n_components))]),
         'WordNMF':
         Pipeline([('ngrams_count',
                    CountVectorizer(analyzer='word', ngram_range=(1, 1))),
                   ('NMF', NMF(n_components=self.n_components))]),
         'NgramsMultinomialMixture':
         NgramsMultinomialMixture(n_topics=self.n_components, max_iters=10),
         'AdHocNgramsMultinomialMixture':
         AdHocNgramsMultinomialMixture(n_iters=0),
         'AdHocIndependentPDF':
         AdHocIndependentPDF(),
         'OnlineGammaPoissonFactorization':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             rho=.99,
             r=None,
             tol=1e-4,
             random_state=18,
             init='k-means++',
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=10),
         'OnlineGammaPoissonFactorization2':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init='k-means++',
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20),
         'OnlineGammaPoissonFactorization3':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init='k-means',
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20),
         'OnlineGammaPoissonFactorization4':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=None,
             rho=.95,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init='k-means',
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20),
         'WordOnlineGammaPoissonFactorization':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=.3,
             tol=1e-4,
             random_state=18,
             init='k-means++',
             ngram_range=(1, 1),
             analizer='word',
             rescale_W=True,
             max_iter_e_step=10),
         'OnlineGammaPoissonFactorization_fast':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=.3,
             ngram_range=(3, 3),
             max_iter=1,
             min_iter=1,
             tol=1e-4,
             random_state=18,
             init='k-means++',
             rescale_W=False),
         'MinHashEncoder':
         MinHashEncoder(n_components=self.n_components),
         'PretrainedFastText':
         PretrainedFastText(n_components=self.n_components),
         'PretrainedFastText_fr':
         PretrainedFastText(n_components=self.n_components,
                            language='french'),
         'PretrainedFastText_hu':
         PretrainedFastText(n_components=self.n_components,
                            language='hungarian'),
         None:
         FunctionTransformer(None, validate=True),
         'Passthrough':
         PasstroughEncoder(),
     }
     self.list_1D_array_methods = [
         'NgramsCountVectorizer',
         'NgramsTfIdfVectorizer',
         'WordNgramsTfIdfVectorizer',
         'ngrams_hot_vectorizer',
         'NgramsLDA',
         'NMF',
         'WordNMF',
         'NgramsMultinomialMixture',
         'NgramsMultinomialMixtureKMeans2',
         'AdHocNgramsMultinomialMixture',
         'AdHocIndependentPDF',
         'GammaPoissonFactorization',
         'OnlineGammaPoissonFactorization',
         'WordOnlineGammaPoissonFactorization',
         'OnlineGammaPoissonFactorization2',
         'OnlineGammaPoissonFactorization3',
         'OnlineGammaPoissonFactorization4',
         'OnlineGammaPoissonFactorization_fast',
         'MinHashEncoder',
         'MinMeanMinHashEncoder',
     ]
Example #14
 def __init__(
     self,
     encoder_name,
     reduction_method=None,
     ngram_range=(2, 4),
     categories="auto",
     dtype=np.float64,
     handle_unknown="ignore",
     clf_type=None,
     n_components=None,
 ):
     self.ngram_range = ngram_range
     self.encoder_name = encoder_name
     self.categories = categories
     self.dtype = dtype
     self.clf_type = clf_type
     self.handle_unknown = handle_unknown
     self.reduction_method = reduction_method
     self.n_components = n_components
     self.encoders_dict = {
         "OneHotEncoder":
         OneHotEncoder(handle_unknown="ignore"),
         "OneHotEncoder-1":
         OneHotEncoderRemoveOne(handle_unknown="ignore"),
         "Categorical":
         None,
         "OneHotEncoderDense":
         OneHotEncoder(handle_unknown="ignore", sparse=False),
         "OneHotEncoderDense-1":
         OneHotEncoderRemoveOne(handle_unknown="ignore", sparse=False),
         "SimilarityEncoder":
         SimilarityEncoder(ngram_range=self.ngram_range, random_state=10),
         "NgramNaiveFisherKernel":
         NgramNaiveFisherKernel(ngram_range=self.ngram_range,
                                random_state=10),
         "ngrams_hot_vectorizer": [],
         "NgramsCountVectorizer":
         CountVectorizer(analyzer="char", ngram_range=self.ngram_range),
         "NgramsTfIdfVectorizer":
         TfidfVectorizer(analyzer="char",
                         ngram_range=self.ngram_range,
                         smooth_idf=False),
         "WordNgramsTfIdfVectorizer":
         TfidfVectorizer(analyzer="word",
                         ngram_range=(1, 1),
                         smooth_idf=False),
         "TargetEncoder":
         TargetEncoder(clf_type=self.clf_type, handle_unknown="ignore"),
         "MDVEncoder":
         MDVEncoder(self.clf_type),
         "BackwardDifferenceEncoder":
         cat_enc.BackwardDifferenceEncoder(),
         "BinaryEncoder":
         cat_enc.BinaryEncoder(),
         "HashingEncoder":
         cat_enc.HashingEncoder(),
         "HelmertEncoder":
         cat_enc.HelmertEncoder(),
         "SumEncoder":
         cat_enc.SumEncoder(),
         "PolynomialEncoder":
         cat_enc.PolynomialEncoder(),
         "BaseNEncoder":
         cat_enc.BaseNEncoder(),
         "LeaveOneOutEncoder":
         cat_enc.LeaveOneOutEncoder(),
         "NgramsLDA":
         Pipeline([
             (
                 "ngrams_count",
                 CountVectorizer(analyzer="char",
                                 ngram_range=self.ngram_range),
             ),
             (
                 "LDA",
                 LatentDirichletAllocation(n_components=self.n_components,
                                           learning_method="batch"),
             ),
         ]),
         "NMF":
         Pipeline([
             (
                 "ngrams_count",
                 CountVectorizer(analyzer="char",
                                 ngram_range=self.ngram_range),
             ),
             ("NMF", NMF(n_components=self.n_components)),
         ]),
         "WordNMF":
         Pipeline([
             ("ngrams_count",
              CountVectorizer(analyzer="word", ngram_range=(1, 1))),
             ("NMF", NMF(n_components=self.n_components)),
         ]),
         "NgramsMultinomialMixture":
         NgramsMultinomialMixture(n_topics=self.n_components, max_iters=10),
         "AdHocNgramsMultinomialMixture":
         AdHocNgramsMultinomialMixture(n_iters=0),
         "AdHocIndependentPDF":
         AdHocIndependentPDF(),
         "OnlineGammaPoissonFactorization":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             rho=0.99,
             r=None,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=10,
         ),
         "OnlineGammaPoissonFactorization2":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "OnlineGammaPoissonFactorization3":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init="k-means",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "OnlineGammaPoissonFactorization4":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=None,
             rho=0.95,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init="k-means",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "WordOnlineGammaPoissonFactorization":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             ngram_range=(1, 1),
             analizer="word",
             rescale_W=True,
             max_iter_e_step=10,
         ),
         "OnlineGammaPoissonFactorization_fast":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             ngram_range=(3, 3),
             max_iter=1,
             min_iter=1,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             rescale_W=False,
         ),
         "MinHashEncoder":
         MinHashEncoder(n_components=self.n_components),
         "PretrainedFastText":
         PretrainedFastText(n_components=self.n_components),
         "PretrainedFastText_fr":
         PretrainedFastText(n_components=self.n_components,
                            language="french"),
         "PretrainedFastText_hu":
         PretrainedFastText(n_components=self.n_components,
                            language="hungarian"),
         None:
         FunctionTransformer(None, validate=True),
         "Passthrough":
         PasstroughEncoder(),
     }
     self.list_1D_array_methods = [
         "NgramsCountVectorizer",
         "NgramsTfIdfVectorizer",
         "WordNgramsTfIdfVectorizer",
         "ngrams_hot_vectorizer",
         "NgramsLDA",
         "NMF",
         "WordNMF",
         "NgramsMultinomialMixture",
         "NgramsMultinomialMixtureKMeans2",
         "AdHocNgramsMultinomialMixture",
         "AdHocIndependentPDF",
         "GammaPoissonFactorization",
         "OnlineGammaPoissonFactorization",
         "WordOnlineGammaPoissonFactorization",
         "OnlineGammaPoissonFactorization2",
         "OnlineGammaPoissonFactorization3",
         "OnlineGammaPoissonFactorization4",
         "OnlineGammaPoissonFactorization_fast",
         "MinHashEncoder",
         "MinMeanMinHashEncoder",
     ]
Example #15
string_columns = metadata_df.select_dtypes(include=['object']).columns
# [(col, metadata_df[col].map(type).unique()) for col in string_columns]
string_columns = string_columns.tolist()
string_columns.remove('data')
metadata_df['all'] = metadata_df[string_columns].apply(
    lambda row: row.str.cat(sep=' '), axis=1)

vectorizer = CountVectorizer()

corpus = [metadata_df.iloc[ii]['all'] for ii in range(metadata_df.shape[0])]
bag_of_words = vectorizer.fit_transform(corpus)
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx])
              for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
frequent_words = [w[0] for w in words_freq]

print(' '.join(frequent_words[:100]))


se = SimilarityEncoder(similarity='ngram', handle_unknown='ignore')
y = se.fit_transform(metadata_df[['name']])  # XXX: need more features than 1 ...


# look at the metadata
if True:
    metadata_df.to_csv('metadata.csv', encoding='utf-8')
Example #16
def categorical_encoding(A, B, y_train, encoder, clf_type, n_jobs):
    '''Build the encoding matrix.
    Given two arrays of strings to compare and an encoder, return the
    corresponding encoded matrix of size len(A) x len(B).'''

    if encoder == 'levenshtein-ratio_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='levenshtein-ratio')
        encoder.fit(B)
        se = encoder.transform(A.reshape(-1, 1))
        return se
    if encoder == 'one-hot_encoding':
        return one_hot_encoding(A, B)
    if encoder == 'one-hot_encoding_sparse':
        return sparse.csr_matrix(one_hot_encoding_sparse(A, B))
    if encoder == 'jaccard_similarity':
        B = np.unique(B)
        warning = (('Warning: %s is not a well defined similarity ' +
                    'metric because two different values can have a ' +
                    'similarity of 1') % encoder)
        print(warning)
        unqA = np.unique(A)
        vlev = np.vectorize(dist.jaccard)
        # dvec = Parallel(n_jobs=n_jobs)(delayed(vlev)(a, B.reshape(1, -1))
        #                           for a in unqA)
        dvec = [vlev(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dms = (ddict[a] for a in A)
        dm = np.vstack(dms)
        return 1 - dm
    if encoder == 'sorensen_similarity':
        B = np.unique(B)
        unqA = np.unique(A)
        vlev = np.vectorize(dist.sorensen)
        # dvec = Parallel(n_jobs=n_jobs)(delayed(vlev)(a, B.reshape(1, -1))
        #                           for a in unqA)
        dvec = [vlev(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dms = (ddict[a] for a in A)
        dm = np.vstack(dms)
        return 1 - dm
    if encoder == 'jaro-winkler_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='jaro-winkler')
        encoder.fit(B)
        se = encoder.transform(A.reshape(-1, 1))
        return se
    if encoder[1:] == 'gram_SimilarityEncoder':
        n = int(encoder[0])
        B = np.unique(B).reshape(-1, 1)
        # use the requested n-gram size instead of discarding it
        encoder = SimilarityEncoder(similarity='ngram', ngram_range=(n, n))
        encoder.fit(B)
        return encoder.transform(A.reshape(-1, 1))
    if encoder[1:] == 'gram_similarity2':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim2')
    if encoder[1:] == 'gram_presence_fisher_kernel':
        n = int(encoder[0])
        return ngram_similarity(A, B, n, sim_type='fisher_kernel')
    if encoder[1:] == 'gram_similarity2_1':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim2_1')
        return sm
    if encoder[1:] == 'gram_similarity2_2':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim2_2')
        return sm
    if encoder[1:] == 'gram_similarity3':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim3')
        return sm
    if encoder[1:] == 'gram_similarity3_2':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim3_2')
        return sm
    if encoder[1:] == 'gram_similarity4':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim4')
        return sm
    if encoder[1:] == 'gram_similarity5':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim5')
        return sm
    if encoder[1:] == 'gram_similarity6':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim6')
        return sm
    if encoder[1:] == 'gram_similarity7':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim7')
        return sm
    if encoder[1:] == 'grams_count_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tfidf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char',
                                     ngram_range=(n, n),
                                     smooth_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char',
                                     ngram_range=(n, n),
                                     smooth_idf=False,
                                     use_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_hot_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        count_matrix1 = vectorizer.transform(A)
        return (count_matrix1 > 0).astype('float64')
    if encoder[1:] == 'grams_hot_vectorizer_tfidf':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        presenceB = (vectorizer.fit_transform(B) > 0).astype('float64')
        presenceA = (vectorizer.transform(A) > 0).astype('float64')
        transformer = TfidfTransformer(smooth_idf=True)
        transformer.fit(presenceB)
        tfidfA = transformer.transform(presenceA)
        return tfidfA
    if encoder[1:] == 'grams_hashing':
        n = int(encoder[0])
        hashingA = ngrams_hashing_vectorizer(A, n, 10000)
        return hashingA
    if encoder == 'TargetEncoder':
        encoder = TargetEncoder(clf_type=clf_type, handle_unknown='ignore')
        encoder.fit(B.reshape(-1, 1), y_train)
        return encoder.transform(A.reshape(-1, 1))
    if encoder == 'MDVEncoder':
        return mdv_encoding(A, B, y_train, clf_type)
    if encoder == 'BackwardDifferenceEncoder':
        encoder = ce.BackwardDifferenceEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'BinaryEncoder':
        encoder = ce.BinaryEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'HashingEncoder':
        encoder = ce.HashingEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'HelmertEncoder':
        encoder = ce.HelmertEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'OneHotEncoder':
        encoder = ce.OneHotEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'OrdinalEncoder':
        encoder = ce.OrdinalEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'SumEncoder':
        encoder = ce.SumEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'PolynomialEncoder':
        encoder = ce.PolynomialEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'BaseNEncoder':
        encoder = ce.BaseNEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'LeaveOneOutEncoder':
        encoder = ce.LeaveOneOutEncoder()
        encoder.fit(B, y_train)
        return encoder.transform(A)
    else:
        raise ValueError('Encoder %s has not been implemented yet.' % encoder)
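# Hedged usage sketch (not part of the original function): encode a column of
# training strings A against reference values B with a 3-gram SimilarityEncoder;
# the variable values are assumptions for illustration.
# A = np.array(['police aide', 'police officer ii', 'bus operator'])
# encoded = categorical_encoding(A, A, y_train=None,
#                                encoder='3gram_SimilarityEncoder',
#                                clf_type=None, n_jobs=1)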