# These tests assume the module-level setup of the original test file;
# `dense1` and `dense2` are dense NumPy arrays defined at module scope.
import numpy as np
import scipy.sparse
from tpot.builtins import OneHotEncoder
from tpot.builtins.one_hot_encoder import _transform_selected


def test_transform_selected_2():
    """Assert that _transform_selected returns the original X when selected is a list of all-False values."""
    ohe = OneHotEncoder(categorical_features=[False, False, False])
    X = _transform_selected(dense1,
                            ohe._fit_transform,
                            ohe.categorical_features,
                            copy=True)
    assert np.allclose(X, dense1)

def test_transform():
    """Test OneHotEncoder with both dense and sparse matrices."""
    input = np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose()
    ohe = OneHotEncoder()
    ohe.fit(input)
    test_data = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()
    output = ohe.transform(test_data).todense()
    assert np.sum(output) == 5

    input = np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose()
    ips = scipy.sparse.csr_matrix(input)
    ohe = OneHotEncoder()
    ohe.fit(ips)
    test_data = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()
    tds = scipy.sparse.csr_matrix(test_data)
    output = ohe.transform(tds).todense()
    assert np.sum(output) == 3

def test_transform_selected():
    """Assert that _transform_selected returns the original X when selected is an empty list."""
    ohe = OneHotEncoder(categorical_features=[])
    X = _transform_selected(dense1,
                            ohe._fit_transform,
                            ohe.categorical_features,
                            copy=True)
    assert np.allclose(X, dense1)
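The snippets above, and test_refit_on_new_data further down, reference fixtures dense1 and dense2 that this page does not show. A minimal sketch of what such fixtures could look like (the array values are assumptions, not the originals):

# Hypothetical stand-ins for the module-level fixtures used by these tests.
dense1 = np.array([[0, 1, 0],
                   [1, 2, 0]])
dense2 = np.array([[0, 1, 3],
                   [1, 2, 0],
                   [2, 0, 1]])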
Example n. 4
def fit_then_transform_dense(expected, input,
                             categorical_features='all',
                             minimum_fraction=None):
    ohe = OneHotEncoder(categorical_features=categorical_features,
                        sparse=False, minimum_fraction=minimum_fraction)
    transformation = ohe.fit_transform(input.copy())
    assert_array_almost_equal(expected, transformation)

    ohe2 = OneHotEncoder(categorical_features=categorical_features,
                         sparse=False, minimum_fraction=minimum_fraction)
    ohe2.fit(input.copy())
    transformation = ohe2.transform(input.copy())
    assert_array_almost_equal(expected, transformation)
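This helper asserts that fit_transform and a separate fit followed by transform produce the same dense encoding. That property can be exercised in isolation with a small sketch (the input values here are illustrative):

import numpy as np
from numpy.testing import assert_array_almost_equal
from tpot.builtins import OneHotEncoder

X = np.array([[0, 1], [1, 0], [0, 1]])

ohe_a = OneHotEncoder(sparse=False)
one_shot = ohe_a.fit_transform(X.copy())

ohe_b = OneHotEncoder(sparse=False)
ohe_b.fit(X.copy())
two_step = ohe_b.transform(X.copy())

# Both code paths should yield identical encodings.
assert_array_almost_equal(one_shot, two_step)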
Example n. 5
def test_k_fold_cv():
    """Test OneHotEncoder with categorical_features='auto'."""
    boston = load_boston()
    clf = make_pipeline(
        OneHotEncoder(categorical_features='auto',
                      sparse=False,
                      minimum_fraction=0.05), LinearRegression())

    cross_val_score(clf,
                    boston.data,
                    boston.target,
                    cv=KFold(n_splits=10, shuffle=True))
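Note that load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this test only runs on older versions. On current versions the same smoke test can be written against another regression dataset, for example:

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from tpot.builtins import OneHotEncoder

def test_k_fold_cv_california():
    """Same pipeline as above, on a dataset still shipped with scikit-learn."""
    housing = fetch_california_housing()
    clf = make_pipeline(
        OneHotEncoder(categorical_features='auto',
                      sparse=False,
                      minimum_fraction=0.05),
        LinearRegression())
    cross_val_score(clf,
                    housing.data,
                    housing.target,
                    cv=KFold(n_splits=10, shuffle=True))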
Example n. 6
def fit_then_transform(expected, input, categorical_features='all',
                       minimum_fraction=None):
    # Test fit_transform
    ohe = OneHotEncoder(categorical_features=categorical_features,
                        minimum_fraction=minimum_fraction)
    transformation = ohe.fit_transform(input.copy())
    assert_array_almost_equal(expected.astype(float),
                              transformation.todense())

    # Test fit, and afterwards transform
    ohe2 = OneHotEncoder(categorical_features=categorical_features,
                         minimum_fraction=minimum_fraction)
    ohe2.fit(input.copy())
    transformation = ohe2.transform(input.copy())
    assert_array_almost_equal(expected, transformation.todense())
Example n. 10
import numpy as np
import pandas as pd
from copy import copy
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, Normalizer
from tpot.builtins import OneHotEncoder, StackingEstimator
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: 0.84550605863897
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            OneHotEncoder(minimum_fraction=0.25, sparse=False, threshold=10),
            RFE(estimator=ExtraTreesClassifier(criterion="gini", max_features=0.5, n_estimators=100), step=0.2),
            MinMaxScaler()
        ),
        FunctionTransformer(copy)
    ),
    Normalizer(norm="max"),
    XGBClassifier(learning_rate=0.01, max_depth=6, min_child_weight=7, n_estimators=600, nthread=1, subsample=0.9500000000000001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
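The FunctionTransformer(copy) branch inside make_union is TPOT's idiom for an identity passthrough: the union concatenates the transformed branch with an untouched copy of the input, so downstream steps see both. A minimal sketch of the effect:

import numpy as np
from copy import copy
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

X = np.array([[1.0, 10.0],
              [2.0, 20.0]])
union = make_union(MinMaxScaler(), FunctionTransformer(copy))
# Two scaled columns stacked next to two untouched columns.
print(union.fit_transform(X).shape)  # (2, 4)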
Example n. 11
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from tpot.builtins import OneHotEncoder, StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9699834298007317
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=True,
                                                       criterion="gini",
                                                       max_features=0.05,
                                                       min_samples_leaf=1,
                                                       min_samples_split=6,
                                                       n_estimators=100)),
    OneHotEncoder(minimum_fraction=0.25, sparse=False), MinMaxScaler(),
    LogisticRegression(C=25.0, dual=False, penalty="l1"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
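StackingEstimator, used above, wraps an estimator as a transformer: it fits the estimator and adds its predictions (and class probabilities, when the estimator exposes predict_proba) to the feature matrix as extra columns. A minimal sketch with illustrative data:

import numpy as np
from sklearn.linear_model import LogisticRegression
from tpot.builtins import StackingEstimator

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)

stack = StackingEstimator(estimator=LogisticRegression())
Xt = stack.fit_transform(X, y)
# Original 5 columns plus the predicted class and 2 class probabilities.
print(Xt.shape)  # expected (100, 8)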
Example n. 12
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8896296296296295
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(
            make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
            FunctionTransformer(copy))),
    OneHotEncoder(minimum_fraction=0.25, sparse=False),
    LinearSVC(C=20.0, dual=True, loss="hinge", penalty="l2", tol=0.0001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example n. 13
def test_refit_on_new_data():
    """Test that OneHotEncoder can refit on two data sets."""
    ohe = OneHotEncoder()
    ohe.fit(dense1)
    ohe.fit(dense2)
Example n. 14
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.3656802383316783
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(SelectFwe(score_func=f_classif, alpha=0.008),
                      OneHotEncoder(minimum_fraction=0.1),
                      SelectPercentile(score_func=f_classif, percentile=13)),
        FunctionTransformer(copy)),
    LinearSVC(C=0.1, dual=False, loss="squared_hinge", penalty="l2", tol=0.01))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example n. 15
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.builtins import OneHotEncoder

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9866666666666667
exported_pipeline = make_pipeline(
    OneHotEncoder(minimum_fraction=0.2, sparse=False), Normalizer(norm="max"),
    LogisticRegression(C=25.0, dual=True, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example n. 16
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, PolynomialFeatures
from tpot.builtins import OneHotEncoder

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was: 1.0
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    Nystroem(gamma=0.05, kernel="poly", n_components=7),
    OneHotEncoder(minimum_fraction=0.05, sparse=False), GaussianNB())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
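Imputer was deprecated in scikit-learn 0.20 and removed in 0.22, so the import above only works on older versions. On current versions the median-imputation step maps onto SimpleImputer:

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)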
Example n. 17
import numpy as np
import pandas as pd
from copy import copy
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8390633822699041
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            SelectPercentile(score_func=f_classif, percentile=90),
            StackingEstimator(
                estimator=LogisticRegression(C=0.01, dual=True, penalty="l2")),
            SelectPercentile(score_func=f_classif, percentile=76)),
        FunctionTransformer(copy)),
    StackingEstimator(estimator=LinearSVC(
        C=25.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1)),
    SelectPercentile(score_func=f_classif, percentile=70),
    OneHotEncoder(minimum_fraction=0.1, sparse=False), StandardScaler(),
    LinearSVC(C=0.001, dual=True, loss="hinge", penalty="l2", tol=0.01))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)