Example #1
def profile_encoder(encoder, hashing='fast', minmax_hash=False):
    # not a unit test

    import time

    from dirty_cat.datasets import fetch_employee_salaries
    employee_salaries = fetch_employee_salaries()
    df = employee_salaries.X
    X = df[["employee_position_title"]]
    t0 = time.time()
    enc = encoder(n_components=50, hashing=hashing, minmax_hash=minmax_hash)
    enc.fit(X)
    y = enc.transform(X)
    assert y.shape == (len(X), 50)
    eta = time.time() - t0
    return eta
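# Hedged usage sketch, not part of the original snippet: dirty_cat's
# MinHashEncoder accepts the same ``hashing`` and ``minmax_hash`` keyword
# arguments as this helper, so it is one encoder the function can be
# pointed at; both hashing schemes can then be timed side by side.
from dirty_cat import MinHashEncoder

for scheme in ('fast', 'murmur'):
    print(scheme, profile_encoder(MinHashEncoder, hashing=scheme))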
Example #2
def profile_encoder(Encoder, init):
    # not a unit test

    import time

    from dirty_cat import datasets
    employee_salaries = datasets.fetch_employee_salaries()
    data = employee_salaries['data']
    X = data['employee_position_title'].tolist()
    t0 = time.time()
    encoder = Encoder(n_components=50, init=init)
    encoder.fit(X)
    y = encoder.transform(X)
    assert y.shape == (len(X), 50)
    eta = time.time() - t0
    return eta
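# Hedged usage note, not part of the original snippet: the ``Encoder`` argument
# is expected to expose ``n_components`` and ``init`` keywords; dirty_cat's
# GapEncoder (e.g. init='k-means++' or init='random') is one encoder with
# such a signature.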
Example #3
def profile_encoder(Encoder, init):
    # not a unit test

    import time

    import numpy as np
    import pandas as pd

    from dirty_cat.datasets import fetch_employee_salaries
    info = fetch_employee_salaries()
    data = pd.read_csv(info['path'], **info['read_csv_kwargs'])
    X = np.array(data['employee_position_title'])[:, None]
    t0 = time.time()
    encoder = Encoder(n_components=50, init=init)
    encoder.fit(X)
    y = encoder.transform(X)
    assert y.shape == (len(X), 50)
    eta = time.time() - t0
    return eta
Example #4
def profile_encoder(Encoder, hashing='fast', minmax_hash=False):
    # not a unit test

    import time

    import pandas as pd

    from dirty_cat import datasets
    employee_salaries = datasets.fetch_employee_salaries()
    data = pd.read_csv(employee_salaries['path'])
    X = data['Employee Position Title'].tolist()
    X = X * 10
    t0 = time.time()
    encoder = Encoder(n_components=50, hashing=hashing, minmax_hash=minmax_hash)
    encoder.fit(X)
    y = encoder.transform(X)
    assert y.shape == (len(X), 50)
    eta = time.time() - t0
    return eta
Example #5
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

from dirty_cat import datasets
from dirty_cat import SimilarityEncoder, TargetEncoder


# encoding methods
encoder_dict = {
    'one-hot': OneHotEncoder(handle_unknown='ignore'),
    'similarity': SimilarityEncoder(similarity='ngram',
                                    handle_unknown='ignore'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'num': FunctionTransformer(None)
    }

data_file = datasets.fetch_employee_salaries()

for method in ['one-hot', 'target', 'similarity']:
    # Load the data
    df = pd.read_csv(data_file).astype(str)
    df['Current Annual Salary'] = [float(s[1:]) for s
                                   in df['Current Annual Salary']]
    df['Year First Hired'] = [int(s.split('/')[-1])
                              for s in df['Date First Hired']]

    target_column = 'Current Annual Salary'
    y = df[target_column].values.ravel()

    # Transform the data into a numerical matrix
    encoder_type = {
        'one-hot': ['Gender', 'Department Name', 'Assignment Category'],
Example #6
     :class:`~dirty_cat.GapEncoder`

 .. |SE| replace:: :class:`~dirty_cat.SimilarityEncoder`

 .. |permutation importances| replace::
     :func:`~sklearn.inspection.permutation_importance`
"""

# %%
#
# The data
# ========
#
# We first retrieve the dataset:
from dirty_cat.datasets import fetch_employee_salaries
employee_salaries = fetch_employee_salaries()

# %%
# X, the input data (descriptions of employees):
X = employee_salaries.X
X

# %%
# and y, our target column (the annual salary)
y = employee_salaries.y
y.name

# %%
# Now, let's carry out some basic preprocessing:
import pandas as pd
X['date_first_hired'] = pd.to_datetime(X['date_first_hired'])
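
# %%
# A hedged sketch, not part of this excerpt: a plausible next step is to derive
# a numerical feature from the parsed dates, for instance the hiring year
# (the column name ``hiring_year`` below is purely illustrative):
X['hiring_year'] = X['date_first_hired'].dt.year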
Example #7
Investigating dirty categories
=================================

What are dirty categorical variables, and how can a good encoding help
with statistical learning?
"""

#########################################################################
# What do we mean by dirty categories?
# -------------------------------------------------
#
# Let's look at a dataset called employee salaries:
import pandas as pd
from dirty_cat import datasets

employee_salaries = datasets.fetch_employee_salaries()
print(employee_salaries['description'])
data = pd.read_csv(employee_salaries['path'])
print(data.head(n=5))

#########################################################################
# Here is how many unique entries there are per column:
print(data.nunique())

#########################################################################
# As we can see, some columns have many different unique values:
print(data['Employee Position Title'].value_counts().sort_index())

#########################################################################
# These different entries are often variations on the same entities:
# there are 3 kinds of Accountant/Auditor.
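
#########################################################################
# A hedged illustration, not part of the original example: listing the
# position titles that contain "Accountant" shows these closely related
# variants side by side.
print(sorted(t for t in data['Employee Position Title'].unique()
             if 'Accountant' in t))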