# Shared imports for the test snippets below (reconstructed; the original
# snippets assume them from the surrounding test module):
import random
from string import ascii_lowercase

import numpy as np
import pandas as pd
import pytest
from sklearn.datasets import fetch_20newsgroups

from dirty_cat import MinHashEncoder


def test_get_unique_ngrams():
    # The encoder pads strings with a space on each side before extracting
    # n-grams, hence the tuples containing ' ' below.
    string = 'test'
    true_ngrams = {(' ', 't'), ('t', 'e'), ('e', 's'), ('s', 't'), ('t', ' '),
                   (' ', 't', 'e'), ('t', 'e', 's'), ('e', 's', 't'),
                   ('s', 't', ' '), (' ', 't', 'e', 's'), ('t', 'e', 's', 't'),
                   ('e', 's', 't', ' ')}
    ngram_range = (2, 4)
    enc = MinHashEncoder(n_components=2)
    ngrams = enc.get_unique_ngrams(string, ngram_range)
    assert ngrams == true_ngrams
def test_cache_overflow():
    # Regression test for cache overflow resulting in -1s in encoding
    def get_random_string(length):
        letters = ascii_lowercase
        result_str = ''.join(random.choice(letters) for i in range(length))
        return result_str

    encoder = MinHashEncoder(n_components=3)
    capacity = encoder._capacity  # size of the encoder's internal hash cache
    raw_data = [get_random_string(10) for _ in range(capacity + 1)]
    y = encoder.fit_transform(raw_data)

    assert len(y[y == -1.0]) == 0
def test_multiple_columns():
    """ This test is intented to verify that fitting multiple columns
        with the MinHashEncoder will not produce an error, and will 
        encode the column independently """
    X = pd.DataFrame([('bird', 'parrot'), ('bird', 'nightingale'),
                      ('mammal', 'monkey'), ('mammal', np.nan)],
                     columns=('class', 'type'))
    X1 = X[['class']]
    X2 = X[['type']]
    fit1 = MinHashEncoder(n_components=30).fit_transform(X1)
    fit2 = MinHashEncoder(n_components=30).fit_transform(X2)
    fit = MinHashEncoder(n_components=30).fit_transform(X)
    assert np.array_equal(np.array([fit[:, :30], fit[:, 30:60]]),
                          np.array([fit1, fit2]))
def test_input_type():
    # Numpy array
    X = np.array(['alice', 'bob'])
    enc = MinHashEncoder(n_components=2)
    enc.fit_transform(X)
    # List
    X = ['alice', 'bob']
    enc = MinHashEncoder(n_components=2)
    enc.fit_transform(X)
# The one-hot encoder is actually not well suited to the 'Employee
# Position Title' column, as this column contains 400 different entries:
import numpy as np
np.unique(y)
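
# To make this concrete, here is a minimal, self-contained sketch (toy
# data, not the real dataset) of how one-hot encoding blows up the
# feature count with the number of distinct entries:
from sklearn.preprocessing import OneHotEncoder

toy = [[f'job title {i}'] for i in range(400)]  # 400 distinct categories
print(OneHotEncoder(sparse=False).fit_transform(toy).shape)  # (400, 400)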

# %%
# We will now experiment with encoders specially made for handling
# dirty columns
from dirty_cat import SimilarityEncoder, TargetEncoder, MinHashEncoder,\
    GapEncoder

encoders = {
    'one-hot': one_hot,  # defined earlier in the original (truncated) example
    'similarity': SimilarityEncoder(similarity='ngram'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'minhash': MinHashEncoder(n_components=100),
    'gap': GapEncoder(n_components=100),
}

# %%
# We now loop over the different encoding methods,
# instantiate a new |Pipeline| each time, fit it
# and store the returned cross-validation score:

from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score

all_scores = dict()

for name, method in encoders.items():
    encoder = make_column_transformer(
        (one_hot, ['gender', 'department_name', 'assignment_category']),
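        # Hedged completion: the original snippet is truncated at this
        # point. The dirty-column name below is an assumption based on the
        # surrounding comments, not taken verbatim from the original example.
        (method, ['employee_position_title']),
        remainder='drop')

    # RidgeCV is an assumed estimator; X and y are assumed to have been
    # defined earlier in the original (truncated) example.
    from sklearn.linear_model import RidgeCV
    from sklearn.pipeline import make_pipeline

    pipeline = make_pipeline(encoder, RidgeCV())
    all_scores[name] = cross_val_score(pipeline, X, y)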
# The encoders for both clean and dirty data are first imported:

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from dirty_cat import SimilarityEncoder, TargetEncoder, MinHashEncoder

encoders_dict = {
    'one-hot':
    OneHotEncoder(handle_unknown='ignore', sparse=False),
    'similarity':
    SimilarityEncoder(similarity='ngram'),
    'target':
    TargetEncoder(handle_unknown='ignore'),
    'minhash':
    MinHashEncoder(n_components=10,
                   ngram_range=(2, 4),
                   hashing='fast',
                   minmax_hash=False),
    'numerical':
    FunctionTransformer(None)
}

# We then create a function that takes one key of our ``encoders_dict``
# and returns a pipeline object with the associated encoder,
# as well as a scaler and a RidgeCV regressor:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


def make_pipeline(encoding_method):
    # static transformers from the other columns
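    # Hedged sketch of the truncated body: the ColumnTransformer layout and
    # the 'dirty_column' / 'numeric_column' names below are assumptions
    # based on the comment above, not taken from the original example.
    from sklearn.linear_model import RidgeCV
    from sklearn.preprocessing import StandardScaler

    transformer = ColumnTransformer(
        [('encoding', encoders_dict[encoding_method], ['dirty_column']),
         ('numeric', encoders_dict['numerical'], ['numeric_column'])],
        remainder='drop')
    return Pipeline([
        ('transform', transformer),
        ('scale', StandardScaler(with_mean=False)),
        ('regressor', RidgeCV()),
    ])
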
# The parametrize decorators below are reconstructed from the branches
# inside the test body (they appear to have been lost in extraction);
# 'aaa' is just an arbitrary invalid handle_missing value.
@pytest.mark.parametrize("input_type", ['list', 'numpy', 'pandas'])
@pytest.mark.parametrize("missing", ['aaa', 'error', ''])
@pytest.mark.parametrize("hashing", ['fast', 'murmur'])
def test_missing_values(input_type, missing, hashing):
    X = [
        'Red', np.nan, 'green', 'blue', 'green', 'green', 'blue',
        float('nan')
    ]
    n = 3
    z = np.zeros(n)

    if input_type == 'numpy':
        X = np.array(X, dtype=object)
    elif input_type == 'pandas':
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X)

    encoder = MinHashEncoder(n_components=n,
                             hashing=hashing,
                             minmax_hash=False,
                             handle_missing=missing)
    if missing == 'error':
        encoder.fit(X)
        if input_type in ['numpy', 'pandas']:
            with pytest.raises(ValueError, match=r"missing"
                               " values in input"):
                encoder.transform(X)
    elif missing == '':
        encoder.fit(X)
        y = encoder.transform(X)
        if input_type == 'list':
            assert np.allclose(y[1], y[-1])
        else:
            assert np.array_equal(y[1], z)
            assert np.array_equal(y[-1], z)
    else:
        with pytest.raises(ValueError,
                           match=r"handle_missing"
                           " should be either 'error' or ''"):
            encoder.fit_transform(X)
def test_MinHashEncoder(n_sample=70):
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_sample]

    for minmax_hash in [True, False]:
        for hashing in ['fast', 'murmur']:

            if minmax_hash and hashing == 'murmur':
                continue  # minmax hashing is not implemented for murmur

            # Test output shape
            encoder = MinHashEncoder(n_components=50, hashing=hashing,
                                     minmax_hash=minmax_hash)
            encoder.fit(X)
            y = encoder.transform(X)
            assert y.shape == (n_sample, 50), str(y.shape)
            assert len(set(y[0])) == 50

            # Test that the same seed returns the same output
            encoder = MinHashEncoder(n_components=50, hashing=hashing,
                                     minmax_hash=minmax_hash)
            encoder.fit(X)
            y2 = encoder.transform(X)
            np.testing.assert_array_equal(y, y2)

            # Test the min property: a string's encoding is element-wise
            # smaller than (or equal to) that of any of its substrings,
            # since its n-gram set is a superset
            if not minmax_hash:
                X_substring = [x[:x.find(' ')] for x in X]
                encoder = MinHashEncoder(n_components=50, hashing=hashing)
                encoder.fit(X_substring)
                y_substring = encoder.transform(X_substring)
                np.testing.assert_array_less(y - y_substring, 0.0001)
# (The head of this snippet is truncated; the assignment below is
# reconstructed from the use of ``target_column`` on the next line.)
target_column = [
    'Age',
    'Household Income',
    'Education']
y = df[target_column].values.ravel()

##############################################################################
# A pipeline for data fitting and prediction
# -------------------------------------------
# We first import the right encoders to transform our clean/dirty data:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from dirty_cat import SimilarityEncoder, MinHashEncoder

encoder_dict = {
    'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False),
    'similarity': SimilarityEncoder(similarity='ngram'),
    'minhash': MinHashEncoder(),
    'num': FunctionTransformer(None)
}
##############################################################################
# All the clean columns are encoded once and for all, but since we
# benchmark different categorical encodings for the dirty variable,
# we create a function that takes an encoding as an input and returns a
# scikit-learn pipeline for our problem.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


def make_pipeline(encoding_method):
    # static transformers from the other columns
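    # Hedged completion: the original snippet is truncated at this point.
    # The column names 'dirty_column' and 'numeric_column' are placeholders,
    # not taken from the original example.
    transformer = ColumnTransformer(
        [('encoding', encoder_dict[encoding_method], ['dirty_column']),
         ('scaling', StandardScaler(), ['numeric_column'])],
        remainder='drop')
    return Pipeline([
        ('transform', transformer),
        ('classifier', RandomForestClassifier()),
    ])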