Esempio n. 1
0
def test_mut_operator_stats_update():
    """Check that _random_mutation_operator keeps an individual's statistics up to date.

    After every mutation the offspring must keep the parent's crossover
    count, carry the parent's mutation count plus one, and list the parent
    as its only predecessor.
    """
    tpot_obj = TPOTClassifier()
    parent_string = (
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    parent = creator.Individual.from_string(parent_string, tpot_obj._pset)

    initialize_stats_dict(parent)

    # Start from arbitrary counts so the checks are not tied to zero.
    parent.statistics["crossover_count"] = random.randint(0, 10)
    parent.statistics["mutation_count"] = random.randint(0, 10)

    # Register the parent as an already-evaluated pipeline.
    tpot_obj.evaluated_individuals_[str(parent)] = tpot_obj._combine_individual_stats(2, 0.99, parent.statistics)

    for _ in range(10):
        mutant, = tpot_obj._random_mutation_operator(parent)
        before = parent.statistics
        after = mutant.statistics

        assert after['crossover_count'] == before['crossover_count']
        assert after['mutation_count'] == before['mutation_count'] + 1
        assert after['predecessor'] == (str(parent),)

        # The mutant becomes the parent of the next round.
        parent = mutant
Esempio n. 2
0
def test_fit2():
    """Fit TPOT with the 'TPOT light' configuration and check that an optimized pipeline results."""
    settings = dict(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
        config_dict='TPOT light',
    )
    tpot_obj = TPOTClassifier(**settings)
    tpot_obj.fit(training_features, training_classes)

    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None
Esempio n. 3
0
def test_random_ind_2():
    """Check that a random individual generated with seed 45 exports to the known script."""
    tpot_obj = TPOTClassifier(random_state=45)
    tpot_obj._pbar = tqdm(total=1, disable=True)
    individual = tpot_obj._toolbox.individual()

    # Reference export for the individual generated under random_state=45.
    expected_code = """import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.built_in_operators import ZeroCount

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \\
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    ZeroCount(),
    LogisticRegression(C=0.0001, dual=False, penalty="l2")
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
"""
    actual_code = export_pipeline(individual, tpot_obj.operators, tpot_obj._pset)
    assert expected_code == actual_code
Esempio n. 4
0
def test_export_random_ind():
    """Check that a random individual generated with seed 39 exports to the known script."""
    tpot_obj = TPOTClassifier(random_state=39)
    tpot_obj._pbar = tqdm(total=1, disable=True)
    individual = tpot_obj._toolbox.individual()

    # Reference export for the individual generated under random_state=39.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=42)

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=65),
    DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    actual_code = export_pipeline(individual, tpot_obj.operators, tpot_obj._pset)
    assert expected_code == actual_code
Esempio n. 5
0
def test_pipeline_score_save():
    """Check that export_pipeline embeds the supplied pipeline score in the exported script."""
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)

    individual = creator.Individual.from_string(
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)',
        tpot_obj._pset
    )

    # The score passed below must show up in the "Average CV score" comment.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.929813743
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    actual_code = export_pipeline(
        individual,
        tpot_obj.operators,
        tpot_obj._pset,
        pipeline_score=0.929813743
    )
    assert_equal(expected_code, actual_code)
Esempio n. 6
0
def test_fit():
    """Fit TPOT on the training data and check that an optimized pipeline results."""
    settings = dict(random_state=42, population_size=1, generations=1, verbosity=0)
    tpot_obj = TPOTClassifier(**settings)
    tpot_obj.fit(training_features, training_classes)

    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._gp_generation == 0
    assert tpot_obj._start_datetime is not None
Esempio n. 7
0
def test_gen():
    """Check the structure of a pipeline produced by TPOT's _gen_grow_safe."""
    tpot_obj = TPOTClassifier()

    generated = tpot_obj._gen_grow_safe(tpot_obj._pset, 1, 3)

    # More than one node, and the first node must return an Output_DF.
    assert len(generated) > 1
    first_node = generated[0]
    assert first_node.ret == Output_DF
Esempio n. 8
0
def test_invaild_dataset_warning():
    """Check that fit() raises a ValueError when the class labels are reshaped to one row."""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0)
    # Common mistake: labels reshaped to (1, n) instead of being left as-is.
    row_shaped_classes = training_classes.reshape((1, len(training_classes)))
    try:
        tpot_obj.fit(training_features, row_shaped_classes)
    except ValueError:
        return  # expected outcome
    assert False  # fit() accepted the malformed labels
Esempio n. 9
0
def test_export():
    """Check that export() raises a ValueError when no optimized pipeline exists."""
    tpot_obj = TPOTClassifier()

    try:
        tpot_obj.export("test_export.py")
    except ValueError:
        return  # expected outcome
    assert False  # Should be unreachable
Esempio n. 10
0
def test_predict():
    """Check that predict() raises a ValueError when no optimized pipeline exists."""
    tpot_obj = TPOTClassifier()

    try:
        tpot_obj.predict(testing_features)
    except ValueError:
        return  # expected outcome
    assert False  # Should be unreachable
Esempio n. 11
0
def test_score():
    """Check that score() raises a RuntimeError when no optimized pipeline exists."""
    tpot_obj = TPOTClassifier()

    try:
        tpot_obj.score(testing_features, testing_classes)
    except RuntimeError:
        return  # expected outcome
    assert False  # Should be unreachable
Esempio n. 12
0
def test_predict_2():
    """Check that predict() returns one prediction per testing row, shape (num_testing_rows,)."""
    tpot_obj = TPOTClassifier()

    # Install a hand-written pipeline instead of running a full fit().
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        'DecisionTreeClassifier(input_matrix)',
        tpot_obj._pset
    )
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    predictions = tpot_obj.predict(testing_features)
    expected_shape = (testing_features.shape[0],)

    assert predictions.shape == expected_shape
Esempio n. 13
0
def test_dict_initialization():
    """Check that initialize_stats_dict gives a fresh individual the expected starting statistics."""
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()

    test_ind = tpot_obj._toolbox.individual()
    initialize_stats_dict(test_ind)

    # Expected initial value for every statistic, checked in this order.
    expected_stats = (
        ('generation', 0),
        ('crossover_count', 0),
        ('mutation_count', 0),
        ('predecessor', ('ROOT',)),
    )
    for stat_name, stat_value in expected_stats:
        assert test_ind.statistics[stat_name] == stat_value
Esempio n. 14
0
def test_imputer_in_export():
    """Check that the exported script contains an imputation step when fit() saw missing values."""
    tpot_obj = TPOTClassifier(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
        config_dict='TPOT light'
    )
    # Inject a single NaN into a copy of the training features.
    features_with_nan = np.copy(training_features)
    features_with_nan[0][0] = float('nan')

    tpot_obj.fit(features_with_nan, training_target)

    # use fixed pipeline since the random.seed() performs differently in python 2.* and 3.*
    fixed_pipeline = creator.Individual.from_string(
        'KNeighborsClassifier('
        'input_matrix, '
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')',
        tpot_obj._pset
    )
    tpot_obj._optimized_pipeline = fixed_pipeline

    export_code = export_pipeline(
        tpot_obj._optimized_pipeline,
        tpot_obj.operators,
        tpot_obj._pset,
        tpot_obj._imputed
    )

    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Imputer

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=None)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""

    assert_equal(export_code, expected_code)
Esempio n. 15
0
def test_predict_2():
    """Check that predict() returns one prediction per testing row, shape (num_testing_rows,)."""
    tpot_obj = TPOTClassifier()

    # Install a hand-written decision tree pipeline instead of running fit().
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5)',
        tpot_obj._pset
    )
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    predictions = tpot_obj.predict(testing_features)
    expected_shape = (testing_features.shape[0],)

    assert predictions.shape == expected_shape
Esempio n. 16
0
def test_mate_operator_stats_update():
    """Check that _mate_operator keeps offspring statistics up to date.

    The first offspring must carry the summed crossover counts of both
    parents plus one, the summed mutation counts, and both parents as
    predecessors.
    """
    tpot_obj = TPOTClassifier()
    first_string = (
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    second_string = (
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=2, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    ind1 = creator.Individual.from_string(first_string, tpot_obj._pset)
    ind2 = creator.Individual.from_string(second_string, tpot_obj._pset)

    initialize_stats_dict(ind1)
    initialize_stats_dict(ind2)

    # Give both parents arbitrary starting counts.
    for parent in (ind1, ind2):
        parent.statistics["crossover_count"] = random.randint(0, 10)
        parent.statistics["mutation_count"] = random.randint(0, 10)

    # Register both parents as already-evaluated pipelines.
    for parent in (ind1, ind2):
        tpot_obj.evaluated_individuals_[str(parent)] = tpot_obj._combine_individual_stats(2, 0.99, parent.statistics)

    # Ten rounds of mating
    for _ in range(10):
        child, _unused = tpot_obj._mate_operator(ind1, ind2)
        stats1, stats2 = ind1.statistics, ind2.statistics

        assert child.statistics['crossover_count'] == stats1['crossover_count'] + stats2['crossover_count'] + 1
        assert child.statistics['mutation_count'] == stats1['mutation_count'] + stats2['mutation_count']
        assert child.statistics['predecessor'] == (str(ind1), str(ind2))

        # The offspring replaces one of the two predecessors, chosen at random.
        # Cloning is not a concern here.
        if random.random() < 0.5:
            ind1 = child
        else:
            ind2 = child
Esempio n. 17
0
def test_gp_new_generation():
    """Check that calling a function wrapped by _gp_new_generation increments _gp_generation."""
    tpot_obj = TPOTClassifier()
    tpot_obj._pbar = tqdm(total=1, disable=True)

    generation_before = tpot_obj._gp_generation
    assert generation_before == 0

    # _gp_new_generation is a decorator; rather than running a full fit(),
    # wrap a no-op stand-in and call that instead.
    @_gp_new_generation
    def dummy_function(self, foo):
        """Do nothing; only the decorator's effect is under test."""

    dummy_function(tpot_obj, None)

    generation_after = tpot_obj._gp_generation
    assert generation_after == 1
Esempio n. 18
0
def test_get_params():
    """Assert that get_params returns the exact dictionary of parameters used by TPOT.

    The expected dictionary is built from the defaults declared on
    TPOTBase.__init__, overridden by the keyword arguments passed to the
    constructor.
    """

    kwargs = {
        'population_size': 500,
        'generations': 1000,
        'verbosity': 1
    }

    tpot_obj = TPOTClassifier(**kwargs)

    # inspect.getargspec was removed in Python 3.11. Prefer getfullargspec
    # when the interpreter provides it and fall back to getargspec otherwise;
    # both results expose the .args and .defaults used below.
    get_spec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec

    # Get default parameters of TPOT and merge with our specified parameters
    initializer = get_spec(TPOTBase.__init__)
    # args[1:] skips `self`; this pairing relies on every remaining argument
    # having a default value.
    default_kwargs = dict(zip(initializer.args[1:], initializer.defaults))
    default_kwargs.update(kwargs)

    assert tpot_obj.get_params() == default_kwargs
Esempio n. 19
0
def test_export():
    """Assert that export() raises without an optimized pipeline and writes a file once one is set.

    First checks that export() raises a RuntimeError while no optimized
    pipeline exists, then installs a fixed pipeline and checks that export()
    creates the requested file.
    """
    export_path = "test_export.py"
    tpot_obj = TPOTClassifier()
    assert_raises(RuntimeError, tpot_obj.export, export_path)
    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5), ZeroCount(input_matrix))'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )

    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._optimized_pipeline = pipeline
    try:
        tpot_obj.export(export_path)
        assert path.isfile(export_path)
    finally:
        # Clean up the exported file even when the export or the assertion
        # fails, so a failing run does not leave a stray file behind.
        if path.isfile(export_path):
            remove(export_path)
Esempio n. 20
0
def test_get_params():
    """Assert that get_params returns the exact dictionary of parameters used by TPOT.

    The expected dictionary is built from the defaults declared on
    TPOTBase.__init__, overridden by the keyword arguments passed to the
    constructor. The 'config_dict' entry is compared against
    classifier_config_dict_light rather than the 'TPOT light' string that
    was passed in.
    """

    kwargs = {
        'population_size': 500,
        'generations': 1000,
        'config_dict': 'TPOT light',
        'offspring_size': 2000,
        'verbosity': 1
    }

    tpot_obj = TPOTClassifier(**kwargs)

    # inspect.getargspec was removed in Python 3.11. Prefer getfullargspec
    # when the interpreter provides it and fall back to getargspec otherwise;
    # both results expose the .args and .defaults used below.
    get_spec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec

    # Get default parameters of TPOT and merge with our specified parameters
    initializer = get_spec(TPOTBase.__init__)
    # args[1:] skips `self`; this pairing relies on every remaining argument
    # having a default value.
    default_kwargs = dict(zip(initializer.args[1:], initializer.defaults))
    default_kwargs.update(kwargs)
    # update to dictionary instead of input string
    default_kwargs.update({'config_dict': classifier_config_dict_light})
    # Check config_dict on its own first so a mismatch there is easy to spot.
    assert tpot_obj.get_params()['config_dict'] == default_kwargs['config_dict']
    assert tpot_obj.get_params() == default_kwargs
Esempio n. 21
0
def test_score_2():
    """Check that score() reproduces a known score for a fixed KNeighborsClassifier pipeline."""
    tpot_obj = TPOTClassifier()
    known_score = 0.977777777778  # Assumes use of the TPOT balanced_accuracy function

    # Install the fixed pipeline whose score is known.
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        'KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)',
        tpot_obj._pset
    )
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    # Ask TPOT for the score
    score = tpot_obj.score(testing_features, testing_classes)

    # Relative-tolerance comparison (rel_tol=1e-09, abs_tol=0.0), see
    # http://stackoverflow.com/questions/5595425/
    difference = abs(known_score - score)
    tolerance = max(1e-09 * max(abs(known_score), abs(score)), 0.0)

    assert difference <= tolerance
Esempio n. 22
0
def test_score_2():
    """Check that score() reproduces a known score for a fixed RandomForestClassifier pipeline."""

    # http://stackoverflow.com/questions/5595425/
    def _nearly_equal(left, right, rel_tol=1e-09, abs_tol=0.0):
        """Return True when left and right agree within the given tolerances."""
        allowed = max(rel_tol * max(abs(left), abs(right)), abs_tol)
        return abs(left - right) <= allowed

    tpot_obj = TPOTClassifier()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    known_score = 0.986318199045  # Assumes use of the TPOT balanced_accuracy function

    # Install the fixed pipeline whose score is known.
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        'RandomForestClassifier(input_matrix)',
        tpot_obj._pset
    )
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    # Ask TPOT for the score
    score = tpot_obj.score(testing_features, testing_classes)

    assert _nearly_equal(known_score, score)
Esempio n. 23
0
def test_export_random_ind():
    """Check that a 'TPOT light' individual generated with seed 39 exports to the known script."""
    tpot_obj = TPOTClassifier(random_state=39, config_dict="TPOT light")
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    individual = tpot_obj._toolbox.individual()

    # Reference export; the seed is forwarded so it appears in train_test_split.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=39)

exported_pipeline = BernoulliNB(alpha=1.0, fit_prior=False)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    actual_code = export_pipeline(
        individual,
        tpot_obj.operators,
        tpot_obj._pset,
        random_state=tpot_obj.random_state
    )
    assert expected_code == actual_code
Esempio n. 24
0
def test_predict_proba2():
    """Assert that every entry returned by predict_proba is accepted by float_range.

    A fixed decision tree pipeline is installed and fitted, then each element
    of the predict_proba result is passed through float_range. Any rejection
    fails the test with the position and value of the offending entry.
    """

    tpot_obj = TPOTClassifier()
    pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
    ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
    'DecisionTreeClassifier__min_samples_split=5)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict_proba(testing_features)

    rows = result.shape[0]
    columns = result.shape[1]

    for i in range(rows):
        for j in range(columns):
            value = result[i][j]
            try:
                float_range(value)
            except Exception as err:
                # Raise explicitly instead of `assert False`: the message
                # names the bad entry, and the check survives `python -O`.
                raise AssertionError(
                    'predict_proba returned an invalid probability at [%d][%d]: %r (%s)'
                    % (i, j, value, err)
                )
Esempio n. 25
0
def test_set_param_recursive_3():
    """Check that set_param_recursive sets random_state to 42 on a nested estimator in a complex pipeline."""
    pipeline_string = (
        'DecisionTreeClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8, DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),input_matrix) '
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8, '
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()

    deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
    set_param_recursive(sklearn_pipeline.steps, 'random_state', 42)

    # StackingEstimator under the transformer_list of FeatureUnion
    nested_step = sklearn_pipeline.steps[0][1].transformer_list[0][1]
    assert nested_step.estimator.random_state == 42

    # Second step of the outer pipeline
    assert sklearn_pipeline.steps[1][1].random_state == 42
Esempio n. 26
0
def test_generate_import_code():
    """Check that generate_import_code() emits the expected import lines for a given pipeline."""
    tpot_obj = TPOTClassifier()
    individual = creator.Individual.from_string(
        'GaussianNB(RobustScaler(input_matrix))',
        tpot_obj._pset
    )

    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
"""
    actual_code = generate_import_code(individual, tpot_obj.operators)
    assert expected_code == actual_code
def Classifier(x, y):
    """Fit a TPOTClassifier on (x, y), export the pipeline and return its test score.

    The best pipeline is written to 'tpot_pipeline.py'.

    NOTE(review): x_test and y_test are not parameters of this function;
    they are looked up from an enclosing/module scope that is not visible
    here. Confirm they are defined before this is called.
    """
    search = TPOTClassifier(
        verbosity=2,
        max_time_mins=10,
        population_size=50,
    )
    search.fit(x, y)
    search.export('tpot_pipeline.py')
    # The predictions are computed but not used further.
    search.predict(x_test)
    return search.score(x_test, y_test)
Esempio n. 28
0
def test_predict_proba2():
    """Assert that every entry returned by predict_proba is accepted by float_range.

    A fixed decision tree pipeline is installed and fitted, then each element
    of the predict_proba result is passed through float_range. Any rejection
    fails the test with the position and value of the offending entry.
    """

    tpot_obj = TPOTClassifier()
    pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
    ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
    'DecisionTreeClassifier__min_samples_split=5)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict_proba(testing_features)

    rows = result.shape[0]
    columns = result.shape[1]

    for i in range(rows):
        for j in range(columns):
            value = result[i][j]
            try:
                float_range(value)
            except Exception as err:
                # Raise explicitly instead of `assert False`: the message
                # names the bad entry, and the check survives `python -O`.
                raise AssertionError(
                    'predict_proba returned an invalid probability at [%d][%d]: %r (%s)'
                    % (i, j, value, err)
                )
Esempio n. 29
0
File: tests.py Progetto: val922/tpot
def test_score_2():
    """Check that score() reproduces a known score for a fixed KNeighborsClassifier pipeline."""

    # http://stackoverflow.com/questions/5595425/
    def _nearly_equal(left, right, rel_tol=1e-09, abs_tol=0.0):
        """Return True when left and right agree within the given tolerances."""
        allowed = max(rel_tol * max(abs(left), abs(right)), abs_tol)
        return abs(left - right) <= allowed

    tpot_obj = TPOTClassifier()
    known_score = 0.977777777778  # Assumes use of the TPOT balanced_accuracy function

    # Install the fixed pipeline whose score is known.
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        'KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)',
        tpot_obj._pset
    )
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    # Ask TPOT for the score
    score = tpot_obj.score(testing_features, testing_classes)

    assert _nearly_equal(known_score, score)
Esempio n. 30
0
def test_export_random_ind():
    """Check that a 'TPOT light' individual generated with seed 39 exports to the known script."""
    tpot_obj = TPOTClassifier(random_state=39, config_dict="TPOT light")
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    individual = tpot_obj._toolbox.individual()

    # Reference export; the seed is forwarded so it appears in train_test_split.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=39)

exported_pipeline = BernoulliNB(alpha=1.0, fit_prior=False)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    actual_code = export_pipeline(
        individual,
        tpot_obj.operators,
        tpot_obj._pset,
        random_state=tpot_obj.random_state
    )
    assert expected_code == actual_code
Esempio n. 31
0
class Tpot_example(Strategy):
    """Example strategy that trades on the output of a TPOT classifier.

    The classifier is fitted once in init() on the training set and queried
    on every bar in onBar().
    """
    name = 'tpotExample'
    # We define our own parameters
    # Machine learning classifier. This is a class-level attribute, so the
    # same TPOTClassifier object is shared by every Tpot_example instance.
    tpotClassifier = TPOTClassifier(generations=5,
                                    population_size=20,
                                    cv=5,
                                    random_state=42,
                                    verbosity=2)
    # Prediction level above which we buy and below which we sell.
    treshold = 0.5
    # Net position: incremented on a buy, decremented on a sell.
    currentPosition = 0

    ### **********************
    ### Override methods
    def init(self, dictionaryParameters, trainSet=None):
        """Fit the classifier on trainSet and print its score on that same set.

        Raises:
            Exception: if no training set is supplied.
        """
        if trainSet is None:
            raise Exception('Tpot algo requires a train set!')
        xTrain, yTrain = self.prepareDatasetForMachineLarning(trainSet)
        self.tpotClassifier.fit(xTrain, yTrain)
        trainingScore = self.tpotClassifier.score(xTrain, yTrain)
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print('Finished tpotTraining with score %f' % trainingScore)

    def onBar(self, bar):
        """Predict on the incoming bar, update the position, then defer to Strategy.onBar."""
        x, y = self.prepareDatasetForMachineLarning(bar)
        output = self.tpotClassifier.predict(x)

        # We buy only to exit a sell position (position < 0) or when flat (position == 0)
        if output > self.treshold and self.currentPosition <= 0:
            print('Buy')
            self.currentPosition += 1

        # We sell only to exit a buy position (position > 0) or when flat (position == 0)
        elif output < self.treshold and self.currentPosition >= 0:
            print('Sell')
            self.currentPosition -= 1
        else:
            # NOTE(review): the message says "output" but the value printed is
            # the current position — confirm which one is intended.
            print('Nothing: current output %f' % self.currentPosition)
        return Strategy.onBar(self, bar)

    ### **********************

    ### custom methods
    def prepareDatasetForMachineLarning(self, dataframeToPrepare):
        """Split a frame into model inputs and target.

        Returns:
            A pair (features, target): the 'close', 'open' and 'volume'
            columns, and the result of diff() on the 'close' column.
        """
        # input set (named `features` to avoid shadowing the builtin `input`)
        features = dataframeToPrepare[['close', 'open', 'volume']]

        # target set
        target = dataframeToPrepare['close'].diff()

        return features, target
Esempio n. 32
0
def create_pipeline_selector(dataset:Dataset):
    """
    Build the PipelineSelector that runs the pipeline combinations in order
    to find the best Pipeline for the provided configuration.

    The selector wraps a TPOTClassifier configured from the `config` module
    with the 'TPOT sparse' configuration dictionary.

    :param dataset: The Dataset object to be used by the PipelineSelector
    :return PipelineSelector: The Pipeline selector
    """
    tpot_settings = {
        'generations': config.TPOT_NUMBER_OF_GENERATIONS,
        'population_size': config.TPOP_POPULATION_SIZE,
        'n_jobs': config.TPOP_NUMBER_OF_JOBS,
        'verbosity': config.TPOP_LOG_VERBOSITY,
        'config_dict': 'TPOT sparse',
    }
    metaclassifier = TPOTClassifier(**tpot_settings)
    return PipelineSelector(dataset=dataset, metaclassifier=metaclassifier)
Esempio n. 33
0
def test_generate_pipeline_code_2():
    """Check that generate_pipeline_code() renders a pipeline containing two CombineDFs correctly."""
    tpot_obj = TPOTClassifier()

    # Build the nested pipeline description from the inside out.
    gradient_boosting = ['GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5]
    min_max_branch = ['MinMaxScaler', 'input_matrix']
    zero_count_branch = ['ZeroCount', ['MaxAbsScaler', 'input_matrix']]
    inner_union = ['CombineDFs', min_max_branch, zero_count_branch]
    outer_union = ['CombineDFs', gradient_boosting, inner_union]
    pipeline = ['KNeighborsClassifier', outer_union, 18, 'uniform', 2]

    expected_code = """make_pipeline(
    make_union(
        StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)),
        make_union(
            MinMaxScaler(),
            make_pipeline(
                MaxAbsScaler(),
                ZeroCount()
            )
        )
    ),
    KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2)
)"""

    actual_code = generate_pipeline_code(pipeline, tpot_obj.operators)
    assert expected_code == actual_code
Esempio n. 34
0
def tune(X_train, X_test, y_train, y_test):
    """Fit a TPOTClassifier, report timing and test score, and export the best pipeline.

    The fit runs for 10 generations; the elapsed time and the score on
    (X_test, y_test) are printed, and the best pipeline is written to
    'tpot_pipeline.py'.
    """
    # Time the construction and fitting of the classifier
    started = time.time()
    search = TPOTClassifier(generations=10, verbosity=2)
    search.fit(X_train, y_train)
    elapsed = time.time() - started

    # Report
    print('TPOT classifier finished in %s seconds' % elapsed)
    test_score = search.score(X_test, y_test)
    print('Best pipeline test accuracy: %.3f' % test_score)

    # Persist the best pipeline as a Python script
    search.export('tpot_pipeline.py')
Esempio n. 35
0
def do_tpot(generations=5, population_size=10, X='', y=''):
    """Run a TPOT search on an 80/20 split of (X, y) and return the fitted classifier.

    The score on the held-out 20% is printed and the best pipeline is
    exported to 'tpot_pipeline.py'.
    """
    split = train_test_split(X, y, train_size=0.80, test_size=0.20)
    X_train, X_test, y_train, y_test = split

    search = TPOTClassifier(
        generations=generations,
        population_size=population_size,
        verbosity=2,
        cv=3,
    )
    search.fit(X_train, y_train)

    held_out_score = search.score(X_test, y_test)
    print(held_out_score)

    search.export('tpot_pipeline.py')
    return search
Esempio n. 36
0
def tpot(X_train, y_train, X_test=None, y_test=None,
         export_file='../results/models/tpot/exported_pipeline.py', n_jobs=1):
    """Run a TPOT classifier search and export the best pipeline.

    The bookkeeping columns ``'node'`` and ``'target'`` are removed from the
    feature frames when present, so they are never used as predictors.

    Args:
        X_train: Training features; must support ``drop(columns=..., errors=...)``
            (e.g. a pandas DataFrame).
        y_train: Training labels.
        X_test: Optional test features, same kind of object as ``X_train``.
        y_test: Optional test labels.
        export_file: Path the best pipeline is exported to.
        n_jobs: Number of parallel jobs handed to ``TPOTClassifier``.

    Returns:
        None. The test score is printed when both ``X_test`` and ``y_test``
        are supplied.
    """
    # The original test `'node' and 'target' in columns` only checked for
    # 'target'. Drop each column independently and tolerate missing ones.
    id_columns = ['node', 'target']
    X_train = X_train.drop(columns=id_columns, errors='ignore')
    if X_test is not None:
        X_test = X_test.drop(columns=id_columns, errors='ignore')

    # Honour the caller's n_jobs instead of a hard-coded worker count.
    classifier = TPOTClassifier(generations=5, population_size=40, cv=3,
                                verbosity=2, scoring='f1', n_jobs=n_jobs)

    classifier.fit(X_train, y_train)
    classifier.export(export_file)

    # Scoring needs a complete test split; skip it when none was given.
    if X_test is not None and y_test is not None:
        print(classifier.score(X_test, y_test))
Esempio n. 37
0
def check_export(op):
    """Check that exporting ``op`` yields a ``Name(...)`` call string."""
    tpot_obj = TPOTClassifier(random_state=42)

    # Fixed seeds keep the sampled argument values reproducible.
    prng = np.random.RandomState(42)
    np.random.seed(42)

    # Pick one terminal per parameter type, skipping the first entry.
    terminals = tpot_obj._pset.terminals
    args = [
        prng.choice(terminals[type_]).value
        for type_ in op.parameter_types()[0][1:]
    ]

    export_string = op.export(*args)
    expected_prefix = op.__name__ + "("

    assert (export_string.startswith(expected_prefix)
            and export_string.endswith(")"))
Esempio n. 38
0
def main():
    """Train TPOT on prepared features plus t-SNE columns and save predictions.

    All paths and time limits come from environment variables
    (``PREPARED_TRAINING``, ``PREPARED_VALIDATING``, ``PREPARED_TESTING``,
    ``STORING``, ``TIME_LIMIT_ALL``, ``TIME_LIMIT_PART``, ``PREDICTING``).
    """
    df_train = pd.read_csv(os.getenv('PREPARED_TRAINING'))
    df_valid = pd.read_csv(os.getenv('PREPARED_VALIDATING'))
    df_test = pd.read_csv(os.getenv('PREPARED_TESTING'))

    # Every column but the last is a feature; the last one is the target.
    columns = df_train.columns
    feature_cols = list(columns[:-1])
    target_col = columns[-1]

    X_train, y_train = df_train[feature_cols].values, df_train[target_col].values
    X_valid, y_valid = df_valid[feature_cols].values, df_valid[target_col].values
    X_test = df_test[feature_cols].values

    # Load the t-SNE arrays stored per split in a single .npz archive.
    prefix = os.getenv('STORING')
    tsne_data = np.load(os.path.join(prefix, 'tsne_2d_5p.npz'))
    tsne = {split: tsne_data[split] for split in ('train', 'valid', 'test')}

    # Append the t-SNE columns to the raw features of each split.
    X_train_concat = np.concatenate([X_train, tsne['train']], axis=1)
    X_valid_concat = np.concatenate([X_valid, tsne['valid']], axis=1)
    X_test_concat = np.concatenate([X_test, tsne['test']], axis=1)

    total_minutes = int(os.getenv('TIME_LIMIT_ALL', '1440'))
    per_pipeline_minutes = int(os.getenv('TIME_LIMIT_PART', '5'))
    tpot = TPOTClassifier(
        max_time_mins=total_minutes,
        max_eval_time_mins=per_pipeline_minutes,
        population_size=100,
        scoring='log_loss',
        cv=3,
        verbosity=2,
        random_state=67,
    )
    tpot.fit(X_train_concat, y_train)

    loss = tpot.score(X_valid_concat, y_valid)
    print(loss)
    tpot.export(os.path.join(prefix, 'tpot_pipeline.py'))

    # Column 1 of predict_proba is written out as the 'probability' field.
    p_test = tpot.predict_proba(X_test_concat)
    df_pred = pd.DataFrame({'id': df_test['id'], 'probability': p_test[:, 1]})

    csv_path = os.getenv('PREDICTING')
    df_pred.to_csv(csv_path, columns=('id', 'probability'), index=None)
    print('Saved: {}'.format(csv_path))
Esempio n. 39
0
    def fit(self, dataset, train_data):
        """Fit a feature-transformation + TPOT pipeline on ``train_data``.

        Categorical columns are one-hot encoded, numerical columns are
        standard-scaled and the (at most one) textual column is hashed into
        1- to 3-gram features. The transformed features feed a
        ``TPOTClassifier`` using the ``'TPOT sparse'`` configuration.

        Args:
            dataset: Object providing ``labels_from``, ``textual_columns``,
                ``categorical_columns`` and ``numerical_columns``.
            train_data: Training data handed to ``dataset.labels_from`` and to
                the pipeline's ``fit``.

        Returns:
            Whatever ``Pipeline.fit`` returns (the fitted pipeline).

        Raises:
            Exception: If the dataset declares more than one textual column.
        """
        y_train = dataset.labels_from(train_data)

        textual_columns = dataset.textual_columns
        if len(textual_columns) > 1:
            raise Exception(
                'Can only handle one textual column at the moment.')

        if len(textual_columns) > 0:
            # With hashed text features present, raise the density threshold
            # so the stacked output is kept sparse.
            sparse_threshold = 1.0
            textual_column = textual_columns[0]
        else:
            sparse_threshold = 0.3
            textual_column = []

        feature_transformation = ColumnTransformer(
            transformers=[
                ('categorical_features',
                 OneHotEncoder(handle_unknown='ignore'),
                 dataset.categorical_columns),
                ('scaled_numeric', StandardScaler(),
                 dataset.numerical_columns),
                ('textual_features',
                 HashingVectorizer(ngram_range=(1, 3),
                                   n_features=100000), textual_column),
            ],
            sparse_threshold=sparse_threshold)

        optimizer = TPOTClassifier(generations=5,
                                   population_size=20,
                                   cv=5,
                                   random_state=42,
                                   verbosity=2,
                                   config_dict='TPOT sparse',
                                   max_time_mins=2)

        pipeline = Pipeline([('features', feature_transformation),
                             ('learner', optimizer)])

        return pipeline.fit(train_data, y_train)
Esempio n. 40
0
    def __init__(self, dataset_path, json_path, n_jobs=1, config_dict=None, task="Classification"):
        """Store the paths and create a minimal warm-start TPOT estimator.

        Args:
            dataset_path: Stored as ``self.datasets_path``.
            json_path: Stored as ``self.JSON``.
            n_jobs: Forwarded to the TPOT estimator.
            config_dict: Forwarded to the TPOT estimator.
            task: ``"Classification"`` or ``"Regression"``; selects the TPOT
                estimator class.

        Raises:
            ValueError: If ``task`` is neither of the two supported values.
        """
        self.scores = []
        self.datasets_path = dataset_path

        self.JSON = json_path

        if task == "Classification":
            tpot_class = TPOTClassifier
        elif task == "Regression":
            tpot_class = TPOTRegressor
        else:
            raise ValueError(
                "task must be 'Classification' or 'Regression', got {!r}".format(task))

        # Both task types share the same estimator settings.
        self.tpot = tpot_class(population_size=1, generations=0, verbosity=0,
                               n_jobs=n_jobs, config_dict=config_dict, warm_start=True)

        self.tpot._fit_init()  # Create _pset(PrimitiveSet)
Esempio n. 41
0
def find_best_model(X_train, X_test, y_train, y_test):
    """Search for a pipeline with TPOT's sparse configuration and export it.

    Prints the score on the test split and writes the best pipeline to
    ``outputs/tpot_exported_pipeline.py``.
    """
    search_settings = dict(
        generations=100,
        population_size=50,
        cv=5,
        random_state=42,
        verbosity=2,
        config_dict='TPOT sparse',
    )
    pipeline_optimizer = TPOTClassifier(**search_settings)
    pipeline_optimizer.fit(X_train, y_train)

    test_score = pipeline_optimizer.score(X_test, y_test)
    print(test_score)

    pipeline_optimizer.export('outputs/tpot_exported_pipeline.py')
Esempio n. 42
0
def tpot_train(cat, X, y):
    """Fit TPOT on an 80/20 split of ``X``/``y`` and export ``<cat>-pipeline.py``.

    The score obtained on the held-out 20% is printed.
    """
    split = train_test_split(X, y, train_size=0.8, test_size=0.2)
    X_train, X_test, y_train, y_test = split

    search_settings = {
        'generations': 15,
        'population_size': 20,
        'verbosity': 5,
        'n_jobs': -1,
        'scoring': 'roc_auc',
    }
    tpot = TPOTClassifier(**search_settings)
    tpot.fit(X_train, y_train)

    held_out_score = tpot.score(X_test, y_test)
    print(held_out_score)

    export_path = cat + '-pipeline.py'
    tpot.export(export_path)
Esempio n. 43
0
def train_model(pipeline_string, classi):
    """Compile a TPOT pipeline string into a scikit-learn pipeline object.

    Args:
        pipeline_string: Pipeline description understood by
            ``creator.Individual.from_string``.
        classi: Name of the final classifier. When it is ``"LinearSVC"`` the
            last pipeline step is replaced by a linear-kernel ``SVC`` with
            ``probability=True`` and the same ``C`` value.

    Returns:
        The compiled (unfitted) pipeline.
    """
    tpot = TPOTClassifier()
    # Convert the pipeline string into a scikit-learn pipeline object.
    deap_pipeline = creator.Individual.from_string(pipeline_string, tpot._pset)
    clf = tpot._toolbox.compile(expr=deap_pipeline)
    if classi == "LinearSVC":
        from sklearn.svm import SVC

        # Read C from the estimator itself rather than parsing its repr: the
        # repr may omit default parameters and may print C as an integer or in
        # exponent notation, none of which the old regex handled.
        _, linear_svc = clf.steps.pop()
        C_val = float(linear_svc.C)
        clf.steps.append(
            ('svc', SVC(kernel='linear', probability=True, C=C_val,
                        tol=1e-05)))
    return clf
Esempio n. 44
0
def test_mate_operator_stats_update():
    """Check the statistics that self._mate_operator records on its offspring."""
    tpot_obj = TPOTClassifier()
    pset = tpot_obj._pset

    # Two parents that differ only in fit_prior and p.
    ind1 = creator.Individual.from_string(
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')', pset)
    ind2 = creator.Individual.from_string(
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=2, '
        'KNeighborsClassifier__weights=uniform'
        ')', pset)

    for parent in (ind1, ind2):
        initialize_stats_dict(parent)

    # Give both parents arbitrary starting counters.
    for parent in (ind1, ind2):
        for counter in ("crossover_count", "mutation_count"):
            parent.statistics[counter] = random.randint(0, 10)

    # Register both parents as already-evaluated pipelines.
    for parent in (ind1, ind2):
        tpot_obj.evaluated_individuals_[str(parent)] = \
            tpot_obj._combine_individual_stats(2, 0.99, parent.statistics)

    # Repeat the check over ten successive matings.
    for _ in range(10):
        offspring1, _second = tpot_obj._mate_operator(ind1, ind2)
        child_stats = offspring1.statistics
        stats1, stats2 = ind1.statistics, ind2.statistics

        expected_crossovers = (
            stats1['crossover_count'] + stats2['crossover_count'] + 1)
        expected_mutations = stats1['mutation_count'] + stats2['mutation_count']

        assert child_stats['crossover_count'] == expected_crossovers
        assert child_stats['mutation_count'] == expected_mutations
        assert child_stats['predecessor'] == (str(ind1), str(ind2))

        # The offspring replaces one of the two parents for the next round;
        # no cloning is needed here.
        if random.random() < 0.5:
            ind1 = offspring1
        else:
            ind2 = offspring1
Esempio n. 45
0
 def __init__(self, openML_id, scoring_function, memory_path = None, max_time=None):
     """Load the dataset, configure a TPOT classifier and fit it.

     Args:
         openML_id: Identifier passed to ``self.get_dataset``.
         scoring_function: Value for TPOT's ``scoring`` argument.
         memory_path: Optional value for TPOT's ``memory`` argument. When it
             names an existing file, TPOT is created with ``warm_start=True``.
         max_time: Value for TPOT's ``max_time_mins`` argument.
     """
     self.y_class_dict = None
     self.X_train, self.X_test, self.y_train, self.y_test = self.get_dataset(openML_id)
     # Identity comparison is the correct way to test for None.
     if memory_path is not None:
         if Path(memory_path).is_file():
             # NOTE(review): max_time is not forwarded in this branch; confirm
             # that an unbounded warm-start run is intended.
             self.tpot = TPOTClassifier(memory=memory_path,warm_start=True,scoring=scoring_function,verbosity=3)
         else:
             self.tpot = TPOTClassifier(memory=memory_path,max_time_mins=max_time, scoring=scoring_function,verbosity=3)
     else:
         self.tpot = TPOTClassifier(max_time_mins=max_time, scoring=scoring_function,verbosity=3)
     self.tpot.fit(self.X_train,self.y_train)
Esempio n. 46
0
def model_chooser():
    """Return an unfitted estimator selected by the global ``model_choice``.

    ``'1'`` logistic regression, ``'2'`` MLP, ``'3'`` random forest,
    ``'4'`` auto-sklearn, ``'5'`` TPOT; any other value initialises H2O and
    returns an H2O deep-learning estimator.
    """
    if model_choice == '1':
        return LogisticRegression(random_state=0,
                                  solver='lbfgs',
                                  multi_class='multinomial')
    elif model_choice == '2':
        return MLPClassifier(hidden_layer_sizes=(10, 7, 5, 2))
    elif model_choice == '3':
        return RandomForestClassifier(n_estimators=100)
    elif model_choice == '4':
        # Imported lazily so auto-sklearn is only required when selected.
        import autosklearn.classification
        return autosklearn.classification.AutoSklearnClassifier()
    elif model_choice == '5':
        # The keyword is `generations`; the misspelled `generation` is not a
        # TPOTClassifier parameter and made this branch fail.
        return TPOTClassifier(generations=5, population_size=20, verbosity=2)
    else:
        h2o.init()
        return H2ODeepLearningEstimator()
    def tpot_select_model(x_train, y_train, x_test, y_test):
        """Run a TPOT search, print the test score and export the best pipeline."""
        from tpot import TPOTClassifier

        # Configure the search.
        search_settings = {
            'generations': 10,
            'population_size': 50,
            'verbosity': 2,
            'n_jobs': -1,
        }
        tpot = TPOTClassifier(**search_settings)

        # Optimise on the training data.
        tpot.fit(x_train, y_train)

        # Report performance on the test data.
        test_score = tpot.score(x_test, y_test)
        print(test_score)

        # Write out the script that rebuilds the best model.
        tpot.export('tpot_exported_pipeline.py')
Esempio n. 48
0
    def run(self, train_ratio=1.0):
        """Build the configured model, cross-test it and save the result.

        Steps: parse the spectra, instantiate the model selected by
        ``self.method`` (``'tpot'``, ``'tree'`` or ``'neat'``), optionally pool
        the spectra, build the dataset, refit the model once per training
        sample with that sample held out, collect the held-out predictions in
        ``self.predictions``, call ``self.explain_model()`` and pickle the
        model to ``final_model.pkl`` in ``self.output_dir``.

        Args:
            train_ratio: Passed through to the dataset-creation helper.
        """
        import pickle  # local import: only needed to persist the model

        self.parse_spectra()

        if self.method == 'tpot':
            self.model = TPOTClassifier(generations=self.generations,
                                        population_size=self.pop_size,
                                        mutation_rate=0.9,
                                        crossover_rate=0.1,
                                        scoring='accuracy',
                                        cv=LeaveOneOut(),
                                        subsample=1.0,
                                        n_jobs=self.processes,
                                        max_eval_time_mins=5,
                                        random_state=None,
                                        verbosity=self.verbose,
                                        disable_update_check=True)
        elif self.method == 'tree':
            self.model = ExtraTreesClassifier()
        elif self.method == 'neat':
            self.model = NEATClassifier(generations=self.generations,
                                        population_size=self.pop_size,
                                        scoring='accuracy',
                                        n_jobs=self.processes,
                                        max_time_msec=45,
                                        verbosity=2)
        if self.pool:
            # Keep an untouched copy before pooling modifies the spectra.
            self.original_spectra = copy.deepcopy(self.spectra)
            self.__pool(self.pool)
        self.X, self.X_test, self.y, self.y_test = self.__create_dataset(
            train_ratio)
        if self.verbose > 0:
            print('Starting model cross-testing...')
        self.predictions = []
        # Leave-one-out loop: refit without sample i, then predict sample i.
        for i in range(len(self.X)):
            held_out = self.X[i, :]
            X_train = np.delete(self.X, i, axis=0)
            y_train = np.delete(self.y, i, axis=0)
            self.model.fit(X_train, y_train)
            self.predictions.append(self.model.predict([held_out]))
        self.explain_model()
        with open(join(self.output_dir, 'final_model.pkl'), 'wb') as f:
            # A binary file needs serialised bytes, not the model object.
            # TPOT estimators expose the fitted scikit-learn pipeline as
            # `fitted_pipeline_`; prefer it when present, since the TPOT
            # wrapper itself is not meant to be pickled.
            pickle.dump(getattr(self.model, 'fitted_pipeline_', self.model), f)
        print('Program finished.')
Esempio n. 49
0
def test_warm_start():
    """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run"""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0, warm_start=True)
    tpot_obj.fit(training_features, training_classes)

    # Use identity checks for None so no custom __ne__ can interfere.
    assert tpot_obj._pop is not None
    assert tpot_obj._pareto_front is not None

    first_pop = tpot_obj._pop

    # A second fit with a different seed must reuse the stored population.
    tpot_obj.random_state = 21
    tpot_obj.fit(training_features, training_classes)

    assert tpot_obj._pop == first_pop
Esempio n. 50
0
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit TPOT, export the pipeline, and write timing and submission files.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict on.
        id_test: Identifiers written to the ``id`` column of the submission.
        name_dataset: Prefix for the output files ``<name>_tpot`` (timing)
            and ``<name>_tpot_submission.csv`` (predictions).
    """
    tp = TPOTClassifier(verbosity=3)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_dont_overfit.py')
    # Named `elapsed` so the local does not shadow the `time` module name.
    elapsed = timer(start_time)
    preds = tp.predict(X_test)

    # The context manager closes the file even if the write fails.
    with open(name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds})

    submission.to_csv(name_dataset + '_' + 'tpot' + '_submission.csv',
                      index=False)
Esempio n. 51
0
    def fit(self, df, target, **fit_kwargs):
        """
        Fit a TPOT backend (classifier or regressor) on a dataframe.

        The kind of learning problem is determined from the target column.
        Defaults for the pipeline template, the operator configuration and the
        scoring metric are filled in for whichever of them are not already
        present in ``self.tpot_kwargs``.

        Args:
            df (pandas.DataFrame): Training data, including the target column.
            target (str): Name of the column in df that holds the target.
            **fit_kwargs: Extra keyword arguments forwarded to the backend's
                fit call.

        Returns:
            TPOTAdaptor (self)

        """
        # Hand TPOT plain arrays rather than pandas objects.
        y = df[target].values
        feature_df = df.drop(columns=target)
        X = feature_df.values

        # Decide between classification and regression from the target.
        self.mode = regression_or_classification(df[target])
        is_classification = self.mode == AMM_CLF_NAME

        mltype_str = "Classifier" if is_classification else "Regressor"
        self.tpot_kwargs.setdefault(
            "template", "Selector-Transformer-{}".format(mltype_str))

        if is_classification:
            self.tpot_kwargs.setdefault("config_dict", TPOT_CLASSIFIER_CONFIG)
            self.tpot_kwargs.setdefault("scoring", "balanced_accuracy")
            self._backend = TPOTClassifier(**self.tpot_kwargs)
        elif self.mode == AMM_REG_NAME:
            self.tpot_kwargs.setdefault("config_dict", TPOT_REGRESSOR_CONFIG)
            self.tpot_kwargs.setdefault("scoring", "neg_mean_absolute_error")
            self._backend = TPOTRegressor(**self.tpot_kwargs)
        else:
            raise ValueError("Learning type {} not recognized as a valid mode "
                             "for {}".format(self.mode,
                                             self.__class__.__name__))

        # Remember what the model was trained on.
        self._features = feature_df.columns.tolist()
        self._fitted_target = target
        self._backend = self._backend.fit(X, y, **fit_kwargs)
        return self
Esempio n. 52
0
def classification():
    """Run TPOT on the scikit-learn digits data and export the best pipeline.

    Uses a seeded 75/25 train/test split, prints the test score and writes
    ``tpot_digits_pipeline.py``.
    """
    digits = load_digits()
    split = train_test_split(
        digits.data,
        digits.target,
        train_size=0.75,
        test_size=0.25,
        random_state=42,
    )
    X_train, X_test, y_train, y_test = split

    search_settings = {
        'generations': 5,
        'population_size': 50,
        'verbosity': 2,
        'random_state': 42,
    }
    tpot = TPOTClassifier(**search_settings)
    tpot.fit(X_train, y_train)

    test_score = tpot.score(X_test, y_test)
    print(test_score)
    tpot.export('tpot_digits_pipeline.py')
Esempio n. 53
0
def tpot_optimization_clf(count, train_path, test_path, verbose=False):
    """
    Search XGBoost and random-forest classifier settings with TPOT.

    :param count: int, number of samples to generate from each dataset.
    :param train_path: string, path to the dataset used for training.
    :param test_path: string, path to the dataset used for testing.
    :param verbose: bool, whether progress messages are printed.
    """

    # Draw the training and test samples.
    if verbose:
        print("Get train samples. ")
    X_train, Y_train = Sampler.generate_samples(dataset=train_path,
                                                count=count)
    if verbose:
        print("Get test samples. ")
    X_test, Y_test = Sampler.generate_samples(dataset=test_path, count=count)

    # Search space offered to TPOT, one entry per estimator.
    xgboost_space = {
        'max_depth': [2, 3, 4, 5],
        "learning_rate": [0.02, 0.05, 0.1, 0.15, 0.2],
        'n_estimators': [10, 20, 30, 40, 50, 100, 500],
        'objective': ["reg:linear", "multi:softmax", "multi:softprob"],
        'booster': ["gbtree", "gblinear", "dart"],
        'n_jobs': [-1],
    }
    random_forest_space = {
        'n_estimators': [10, 20, 30, 40, 50, 100, 500],
        'criterion': ["gini", "entropy"],
        'max_features': ["auto", "sqrt", "log2"],
        'max_depth': [2, 3, 4, 5],
        'n_jobs': [-1],
    }
    tpot_config = {
        'xgboost.XGBClassifier': xgboost_space,
        'sklearn.ensemble.RandomForestClassifier': random_forest_space,
    }

    if verbose:
        print("Start TPOT optimization. ")

    tpot = TPOTClassifier(generations=10,
                          population_size=30,
                          verbosity=2,
                          config_dict=tpot_config)

    tpot.fit(np.array(X_train), np.array(Y_train))

    # The test data is converted to float64 arrays before scoring.
    test_features = np.array(X_test, dtype=np.float64)
    test_labels = np.array(Y_test, dtype=np.float64)
    print(tpot.score(test_features, test_labels))

    tpot.export('tpot_pipeline_clf.py')
Esempio n. 54
0
File: tests.py Progetto: val922/tpot
def test_export_pipeline():
    """Assert that export_pipeline() generates the expected source file for a fixed pipeline."""
    tpot_obj = TPOTClassifier()
    # NOTE(review): the string below has unbalanced parentheses and a missing
    # separator after the SelectKBest(...) term. Presumably
    # Individual.from_string tolerates this; confirm before "fixing" it, since
    # the expected output below depends on how this string is parsed.
    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform')
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

    # The exported source is compared verbatim, so whitespace and import order
    # in this literal matter.
    expected_code = """import numpy as np

from copy import copy
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \\
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        make_union(VotingClassifier([('branch',
            DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
        )]), FunctionTransformer(copy)),
        SelectKBest(score_func=f_classif, k=20)
    ),
    KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators,
                                            tpot_obj._pset)
Esempio n. 55
0
def test_warm_start():
    """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run"""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0, warm_start=True)
    tpot_obj.fit(training_features, training_classes)

    # Use identity checks for None so no custom __ne__ can interfere.
    assert tpot_obj._pop is not None
    assert tpot_obj._pareto_front is not None

    first_pop = tpot_obj._pop

    # A second fit with a different seed must reuse the stored population.
    tpot_obj.random_state = 21
    tpot_obj.fit(training_features, training_classes)

    assert tpot_obj._pop == first_pop
Esempio n. 56
0
# Module-level fixtures shared by the tests below.

# Operator classes (and their argument lists) built from entries of the
# classifier config dictionary.
# NOTE(review): test_operator_key_1 is not defined in this excerpt; it is
# presumably assigned just above - confirm.
test_operator_key_2 = 'sklearn.feature_selection.SelectFromModel'
TPOTSelectPercentile, TPOTSelectPercentile_args = TPOTOperatorClassFactory(
    test_operator_key_1,
    classifier_config_dict[test_operator_key_1]
)

TPOTSelectFromModel, TPOTSelectFromModel_args = TPOTOperatorClassFactory(
    test_operator_key_2,
    classifier_config_dict[test_operator_key_2]
)

# Digits dataset, cast to float64 and split with a fixed seed.
mnist_data = load_digits()
training_features, testing_features, training_target, testing_target = \
    train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42)

# Default classifier and regressor instances, initialised via _fit_init()
# without running a full fit.
tpot_obj = TPOTClassifier()
tpot_obj._fit_init()

tpot_obj_reg = TPOTRegressor()
tpot_obj_reg._fit_init()

def test_export_random_ind():
    """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 39."""
    tpot_obj = TPOTClassifier(random_state=39, config_dict="TPOT light")
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline = tpot_obj._toolbox.individual()
    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
Esempio n. 57
0
def test_set_params():
    """Check that set_params hands back the very TPOT instance it was called on."""
    tpot_obj = TPOTClassifier()

    returned = tpot_obj.set_params()

    assert returned is tpot_obj
Esempio n. 58
0
def test_set_params_2():
    """Check that set_params overwrites the matching instance attribute."""
    tpot_obj = TPOTClassifier(generations=2)

    # Change the value supplied at construction time.
    tpot_obj.set_params(generations=3)

    current_generations = tpot_obj.generations
    assert current_generations == 3
Esempio n. 59
0
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load scikit-learn's digits dataset.
digits = load_digits()

# Keep 75% of the samples for training and 25% for testing.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    train_size=0.75,
    test_size=0.25,
)

# Configure the pipeline search.
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)

# Search, report the test score, and export the best pipeline as a script.
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')
Esempio n. 60
0
def generate_model(generations, train_X, train_y):
	"""Fit TPOT for ``generations`` generations and export the best pipeline.

	The pipeline is written to ``tpot_model<generations>.py``.
	"""
	tpot_generator = TPOTClassifier(generations=generations, verbosity=2)
	tpot_generator.fit(train_X, train_y)
	# `generations` is a number, so format it into the file name instead of
	# concatenating it with `+`, which raised TypeError for int values.
	tpot_generator.export('tpot_model{}.py'.format(generations))