def test_mut_operator_stats_update():
    """Check the statistics bookkeeping done by _random_mutation_operator.

    Each mutation must leave the crossover count untouched, bump the mutation
    count by one and record the parent's string form as the sole predecessor.
    """
    tpot_obj = TPOTClassifier()
    pipeline_string = (
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    parent = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    initialize_stats_dict(parent)

    # Start from arbitrary counters so the check is not tied to zero.
    for counter in ("crossover_count", "mutation_count"):
        parent.statistics[counter] = random.randint(0, 10)

    # Register the parent as an already evaluated pipeline.
    parent_record = tpot_obj._combine_individual_stats(2, 0.99, parent.statistics)
    tpot_obj.evaluated_individuals_[str(parent)] = parent_record

    for _ in range(10):
        (child,) = tpot_obj._random_mutation_operator(parent)

        child_stats = child.statistics
        assert child_stats['crossover_count'] == parent.statistics['crossover_count']
        assert child_stats['mutation_count'] == parent.statistics['mutation_count'] + 1
        assert child_stats['predecessor'] == (str(parent),)

        # The offspring becomes the parent of the next round.
        parent = child
def test_fit2():
    """Fit with the 'TPOT light' configuration and check an optimized pipeline is produced."""
    settings = dict(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
        config_dict='TPOT light',
    )
    tpot_obj = TPOTClassifier(**settings)
    tpot_obj.fit(training_features, training_classes)

    best = tpot_obj._optimized_pipeline
    assert isinstance(best, creator.Individual)
    # fit() is expected to have recorded when it started.
    assert tpot_obj._start_datetime is not None
def test_random_ind_2():
    """Check that random seed 45 yields a reproducible exported pipeline."""
    tpot_obj = TPOTClassifier(random_state=45)
    tpot_obj._pbar = tqdm(total=1, disable=True)
    individual = tpot_obj._toolbox.individual()

    # NOTE(review): literal kept exactly as it appears in this view; confirm its
    # line breaks against the exporter's real output.
    expected_export = """import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from tpot.built_in_operators import ZeroCount # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_classes, testing_classes = \\ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = make_pipeline( ZeroCount(), LogisticRegression(C=0.0001, dual=False, penalty="l2") ) exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) """

    actual_export = export_pipeline(individual, tpot_obj.operators, tpot_obj._pset)
    assert expected_export == actual_export
def test_export_random_ind():
    """Check that random seed 39 yields a reproducible exported pipeline."""
    tpot_obj = TPOTClassifier(random_state=39)
    tpot_obj._pbar = tqdm(total=1, disable=True)
    individual = tpot_obj._toolbox.individual()

    # NOTE(review): literal kept exactly as it appears in this view; confirm its
    # line breaks against the exporter's real output.
    expected_export = """import numpy as np import pandas as pd from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=42) exported_pipeline = make_pipeline( SelectPercentile(score_func=f_classif, percentile=65), DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18) ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    actual_export = export_pipeline(individual, tpot_obj.operators, tpot_obj._pset)
    assert expected_export == actual_export
def test_pipeline_score_save():
    """Check that an export carrying a pipeline score embeds that score in the code."""
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)

    individual = creator.Individual.from_string(
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)',
        tpot_obj._pset,
    )

    # NOTE(review): literal kept exactly as it appears in this view; confirm its
    # line breaks against the exporter's real output.
    expected_export = """import numpy as np import pandas as pd from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=None) # Average CV score on the training set was:0.929813743 exported_pipeline = make_pipeline( SelectPercentile(score_func=f_classif, percentile=20), DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    actual_export = export_pipeline(
        individual,
        tpot_obj.operators,
        tpot_obj._pset,
        pipeline_score=0.929813743,
    )
    assert_equal(expected_export, actual_export)
def test_fit():
    """Fit with a one-individual, one-generation setup and inspect the resulting state."""
    settings = dict(random_state=42, population_size=1, generations=1, verbosity=0)
    tpot_obj = TPOTClassifier(**settings)
    tpot_obj.fit(training_features, training_classes)

    best = tpot_obj._optimized_pipeline
    assert isinstance(best, creator.Individual)
    assert tpot_obj._gp_generation == 0
    assert tpot_obj._start_datetime is not None
def test_gen():
    """Check the shape of a pipeline produced by _gen_grow_safe.

    The generated expression must hold more than one node and its root must
    return an Output_DF.
    """
    tpot_obj = TPOTClassifier()
    generated = tpot_obj._gen_grow_safe(tpot_obj._pset, 1, 3)

    assert len(generated) > 1
    root = generated[0]
    assert root.ret == Output_DF
def test_invaild_dataset_warning():
    """Check that fit() raises ValueError when the classes array has the wrong shape."""
    tpot_obj = TPOTClassifier(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
    )
    # A common mistake: classes passed as a single row instead of a flat vector.
    row_shaped_classes = training_classes.reshape((1, len(training_classes)))

    try:
        tpot_obj.fit(training_features, row_shaped_classes)
    except ValueError:
        pass
    else:
        # fit() accepted the malformed classes array.
        assert False
def test_export():
    """Check that export() raises ValueError when no optimized pipeline exists."""
    tpot_obj = TPOTClassifier()

    try:
        tpot_obj.export("test_export.py")
    except ValueError:
        pass
    else:
        # export() must not succeed before any pipeline has been optimized.
        assert False
def test_predict():
    """Check that predict() raises ValueError when no optimized pipeline exists."""
    tpot_obj = TPOTClassifier()

    try:
        tpot_obj.predict(testing_features)
    except ValueError:
        pass
    else:
        # predict() must not succeed before any pipeline has been optimized.
        assert False
def test_score():
    """Check that score() raises RuntimeError when no optimized pipeline exists."""
    tpot_obj = TPOTClassifier()

    try:
        tpot_obj.score(testing_features, testing_classes)
    except RuntimeError:
        pass
    else:
        # score() must not succeed before any pipeline has been optimized.
        assert False
def test_predict_2():
    """Check that predict() returns an array shaped (num_testing_rows,)."""
    tpot_obj = TPOTClassifier()

    # Install a hand-written pipeline instead of running an optimization.
    individual = creator.Individual.from_string(
        'DecisionTreeClassifier(input_matrix)', tpot_obj._pset)
    tpot_obj._optimized_pipeline = individual
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    predictions = tpot_obj.predict(testing_features)
    expected_shape = (testing_features.shape[0],)
    assert predictions.shape == expected_shape
def test_dict_initialization():
    """Check the default values written by initialize_stats_dict."""
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    individual = tpot_obj._toolbox.individual()

    initialize_stats_dict(individual)

    stats = individual.statistics
    assert stats['generation'] == 0
    assert stats['crossover_count'] == 0
    assert stats['mutation_count'] == 0
    assert stats['predecessor'] == ('ROOT',)
def test_imputer_in_export():
    """Check that the exported code contains an imputation step when fit() saw NaNs."""
    tpot_obj = TPOTClassifier(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
        config_dict='TPOT light',
    )

    # Inject one missing value into a copy of the training features.
    nan_features = np.copy(training_features)
    nan_features[0][0] = float('nan')
    tpot_obj.fit(nan_features, training_target)

    # A fixed pipeline is used because random.seed() behaves differently
    # between Python 2.* and 3.*.
    fixed_pipeline = (
        'KNeighborsClassifier('
        'input_matrix, '
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    tpot_obj._optimized_pipeline = creator.Individual.from_string(fixed_pipeline, tpot_obj._pset)

    export_code = export_pipeline(
        tpot_obj._optimized_pipeline,
        tpot_obj.operators,
        tpot_obj._pset,
        tpot_obj._imputed,
    )

    # NOTE(review): literal kept exactly as it appears in this view; confirm its
    # line breaks against the exporter's real output.
    expected_code = """import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import Imputer # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=None) imputer = Imputer(strategy="median") imputer.fit(training_features) training_features = imputer.transform(training_features) testing_features = imputer.transform(testing_features) exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    assert_equal(export_code, expected_code)
def test_predict_2():
    """Check that predict() returns an array shaped (num_testing_rows,)."""
    tpot_obj = TPOTClassifier()

    # Install a fully parameterised decision tree instead of optimizing one.
    individual = creator.Individual.from_string(
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5)',
        tpot_obj._pset,
    )
    tpot_obj._optimized_pipeline = individual
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    predictions = tpot_obj.predict(testing_features)
    expected_shape = (testing_features.shape[0],)
    assert predictions.shape == expected_shape
def test_mate_operator_stats_update():
    """Check the statistics bookkeeping done by _mate_operator.

    The first offspring must carry the summed crossover counts plus one, the
    summed mutation counts, and both parents' string forms as predecessors.
    """
    tpot_obj = TPOTClassifier()
    first = creator.Individual.from_string(
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')',
        tpot_obj._pset
    )
    second = creator.Individual.from_string(
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=2, '
        'KNeighborsClassifier__weights=uniform'
        ')',
        tpot_obj._pset
    )

    for parent in (first, second):
        initialize_stats_dict(parent)

    # Give both parents arbitrary starting counters.
    for parent in (first, second):
        for counter in ("crossover_count", "mutation_count"):
            parent.statistics[counter] = random.randint(0, 10)

    # Register both parents as already evaluated pipelines.
    for parent in (first, second):
        tpot_obj.evaluated_individuals_[str(parent)] = tpot_obj._combine_individual_stats(
            2, 0.99, parent.statistics)

    # Ten rounds of mating.
    for _ in range(10):
        child, _unused_sibling = tpot_obj._mate_operator(first, second)

        expected_crossovers = (
            first.statistics['crossover_count'] + second.statistics['crossover_count'] + 1)
        expected_mutations = (
            first.statistics['mutation_count'] + second.statistics['mutation_count'])
        assert child.statistics['crossover_count'] == expected_crossovers
        assert child.statistics['mutation_count'] == expected_mutations
        assert child.statistics['predecessor'] == (str(first), str(second))

        # The offspring takes the place of one parent, picked at random;
        # no cloning is needed here.
        replace_first = random.random() < 0.5
        if replace_first:
            first = child
        else:
            second = child
def test_gp_new_generation():
    """Check that a function wrapped by _gp_new_generation bumps _gp_generation."""
    tpot_obj = TPOTClassifier()
    tpot_obj._pbar = tqdm(total=1, disable=True)

    assert tpot_obj._gp_generation == 0

    # _gp_new_generation is a decorator; wrapping a no-op lets the counter be
    # exercised without running a full fit().
    def dummy_function(self, foo):
        pass

    dummy_function = _gp_new_generation(dummy_function)
    dummy_function(tpot_obj, None)

    assert tpot_obj._gp_generation == 1
def test_get_params():
    """Check that get_params() returns exactly the parameters TPOT was built with.

    The expected dictionary is the set of defaults declared on
    ``TPOTBase.__init__`` overlaid with the keyword arguments passed here.
    """
    kwargs = {
        'population_size': 500,
        'generations': 1000,
        'verbosity': 1
    }

    tpot_obj = TPOTClassifier(**kwargs)

    # inspect.getargspec was removed in Python 3.11; use getfullargspec when it
    # exists and fall back to getargspec only on interpreters that lack it.
    get_spec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    initializer = get_spec(TPOTBase.__init__)

    # Skip the leading ``self`` when pairing argument names with defaults.
    default_kwargs = dict(zip(initializer.args[1:], initializer.defaults))
    default_kwargs.update(kwargs)

    assert tpot_obj.get_params() == default_kwargs
def test_export():
    """Check export() with and without an optimized pipeline.

    Without a pipeline export() must raise RuntimeError; once a pipeline is
    installed it must write the requested file. The file is removed afterwards
    even when the export or the assertion fails.
    """
    export_file = "test_export.py"
    tpot_obj = TPOTClassifier()
    assert_raises(RuntimeError, tpot_obj.export, export_file)

    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5), ZeroCount(input_matrix))'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )

    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._optimized_pipeline = pipeline

    try:
        tpot_obj.export(export_file)
        assert path.isfile(export_file)
    finally:
        # Clean up the exported file; guard so a failed export is not masked
        # by a missing-file error from remove().
        if path.isfile(export_file):
            remove(export_file)
def test_get_params():
    """Check that get_params() returns exactly the parameters TPOT was built with.

    The expected dictionary is the set of defaults declared on
    ``TPOTBase.__init__`` overlaid with the keyword arguments passed here, with
    the ``config_dict`` name replaced by the light classifier dictionary.
    """
    kwargs = {
        'population_size': 500,
        'generations': 1000,
        'config_dict': 'TPOT light',
        'offspring_size': 2000,
        'verbosity': 1
    }

    tpot_obj = TPOTClassifier(**kwargs)

    # inspect.getargspec was removed in Python 3.11; use getfullargspec when it
    # exists and fall back to getargspec only on interpreters that lack it.
    get_spec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    initializer = get_spec(TPOTBase.__init__)

    # Skip the leading ``self`` when pairing argument names with defaults.
    default_kwargs = dict(zip(initializer.args[1:], initializer.defaults))
    default_kwargs.update(kwargs)
    # The comparison is made against the dictionary, not the input string.
    default_kwargs.update({'config_dict': classifier_config_dict_light})

    assert tpot_obj.get_params()['config_dict'] == default_kwargs['config_dict']
    assert tpot_obj.get_params() == default_kwargs
def test_score_2():
    """Check that score() reproduces a known value for a fixed KNeighbors pipeline."""
    tpot_obj = TPOTClassifier()
    known_score = 0.977777777778  # Assumes use of the TPOT balanced_accuracy function

    # Build and fit the pipeline whose score is known.
    individual = creator.Individual.from_string(
        'KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)',
        tpot_obj._pset,
    )
    tpot_obj._optimized_pipeline = individual
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    score = tpot_obj.score(testing_features, testing_classes)

    # Relative-tolerance comparison (rel_tol=1e-09, abs_tol=0.0), following
    # http://stackoverflow.com/questions/5595425/
    allowed_gap = max(1e-09 * max(abs(known_score), abs(score)), 0.0)
    assert abs(known_score - score) <= allowed_gap
def test_score_2():
    """Check that score() reproduces a known value for a fixed RandomForest pipeline."""
    tpot_obj = TPOTClassifier()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    known_score = 0.986318199045  # Assumes use of the TPOT balanced_accuracy function

    # Build and fit the pipeline whose score is known.
    individual = creator.Individual.from_string(
        'RandomForestClassifier(input_matrix)', tpot_obj._pset)
    tpot_obj._optimized_pipeline = individual
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    score = tpot_obj.score(testing_features, testing_classes)

    # Relative-tolerance comparison (rel_tol=1e-09, abs_tol=0.0), following
    # http://stackoverflow.com/questions/5595425/
    allowed_gap = max(1e-09 * max(abs(known_score), abs(score)), 0.0)
    assert abs(known_score - score) <= allowed_gap
def test_export_random_ind():
    """Check that seed 39 with 'TPOT light' yields a reproducible exported pipeline."""
    tpot_obj = TPOTClassifier(random_state=39, config_dict="TPOT light")
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    individual = tpot_obj._toolbox.individual()

    # NOTE(review): literal kept exactly as it appears in this view; confirm its
    # line breaks against the exporter's real output.
    expected_export = """import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.naive_bayes import BernoulliNB # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=39) exported_pipeline = BernoulliNB(alpha=1.0, fit_prior=False) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    actual_export = export_pipeline(
        individual,
        tpot_obj.operators,
        tpot_obj._pset,
        random_state=tpot_obj.random_state,
    )
    assert expected_export == actual_export
def test_predict_proba2():
    """Check that every entry returned by predict_proba() passes float_range."""
    tpot_obj = TPOTClassifier()
    individual = creator.Individual.from_string(
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5)',
        tpot_obj._pset,
    )
    tpot_obj._optimized_pipeline = individual
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict_proba(testing_features)
    n_rows, n_cols = result.shape[0], result.shape[1]

    # float_range is expected to raise on a rejected value; any exception
    # while scanning the matrix fails the test.
    try:
        for r in range(n_rows):
            for c in range(n_cols):
                float_range(result[r][c])
    except Exception:
        all_entries_accepted = False
    else:
        all_entries_accepted = True

    assert all_entries_accepted
def test_set_param_recursive_3():
    """Check set_param_recursive on a pipeline containing a stacked estimator.

    "random_state" must be set to 42 both on the estimator nested under the
    first transformer of the union step and on the final step of the pipeline.
    """
    pipeline_string = (
        'DecisionTreeClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8, DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),input_matrix) '
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8, '
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    individual = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    compiled = tpot_obj._toolbox.compile(expr=individual)

    set_param_recursive(compiled.steps, 'random_state', 42)

    # Estimator wrapped inside the first entry of the union's transformer_list.
    union_step = compiled.steps[0][1]
    stacked = union_step.transformer_list[0][1]
    assert stacked.estimator.random_state == 42

    # Final step of the pipeline.
    final_step = compiled.steps[1][1]
    assert final_step.random_state == 42
def test_generate_import_code():
    """Check the import section generated for a RobustScaler + GaussianNB pipeline."""
    tpot_obj = TPOTClassifier()
    individual = creator.Individual.from_string(
        'GaussianNB(RobustScaler(input_matrix))', tpot_obj._pset)

    # NOTE(review): literal kept exactly as it appears in this view; confirm its
    # line breaks against the generator's real output.
    expected_imports = """import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import make_pipeline from sklearn.preprocessing import RobustScaler """

    actual_imports = generate_import_code(individual, tpot_obj.operators)
    assert expected_imports == actual_imports
def Classifier(x, y):
    """Fit a TPOT classifier on (x, y), export it and return its test score.

    The best pipeline is written to 'tpot_pipeline.py'. ``x_test`` and
    ``y_test`` are not parameters; they are looked up in the enclosing scope.
    """
    tpot = TPOTClassifier(verbosity=2, max_time_mins=10, population_size=50)
    tpot.fit(x, y)
    tpot.export('tpot_pipeline.py')

    # Predictions are computed but not used for the returned value.
    tpot.predict(x_test)

    return tpot.score(x_test, y_test)
def test_score_2():
    """Check that score() reproduces a known value for a fixed KNeighbors pipeline."""
    tpot_obj = TPOTClassifier()
    known_score = 0.977777777778  # Assumes use of the TPOT balanced_accuracy function

    # Build and fit the pipeline whose score is known.
    fixed_pipeline = (
        'KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(fixed_pipeline, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    score = tpot_obj.score(testing_features, testing_classes)

    # Relative-tolerance comparison, following
    # http://stackoverflow.com/questions/5595425/
    rel_tol, abs_tol = 1e-09, 0.0
    largest_magnitude = max(abs(known_score), abs(score))
    assert abs(known_score - score) <= max(rel_tol * largest_magnitude, abs_tol)
class Tpot_example(Strategy):
    """Strategy that fits a TPOT classifier on a train set and prints Buy/Sell per bar."""

    name = 'tpotExample'

    # Strategy-specific parameters.
    # The classifier is created once, at class definition time.
    tpotClassifier = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
    treshold = 0.5
    currentPosition = 0

    # ----- Overridden methods -----

    def init(self, dictionaryParameters, trainSet=None):
        """Fit the classifier on ``trainSet`` and print its score on that same set.

        Raises:
            Exception: if no train set is supplied.
        """
        if trainSet is None:
            raise Exception('Tpot algo requires a train set!')

        features, target = self.prepareDatasetForMachineLarning(trainSet)
        self.tpotClassifier.fit(features, target)
        score = self.tpotClassifier.score(features, target)
        print('Finished tpotTraining with score %f' % score)

    def onBar(self, bar):
        """Predict on ``bar``, print the action taken and delegate to Strategy.onBar."""
        features, _target = self.prepareDatasetForMachineLarning(bar)
        prediction = self.tpotClassifier.predict(features)

        if prediction > self.treshold and self.currentPosition <= 0:
            # Buy only to leave a short position (< 0) or when flat (== 0).
            print('Buy')
            self.currentPosition += 1
        elif prediction < self.treshold and self.currentPosition >= 0:
            # Sell only to leave a long position (> 0) or when flat (== 0).
            print('Sell')
            self.currentPosition -= 1
        else:
            print('Nothing: current output %f' % self.currentPosition)

        return Strategy.onBar(self, bar)

    # ----- Custom methods -----

    def prepareDatasetForMachineLarning(self, dataframeToPrepare):
        """Return (close/open/volume columns, difference of consecutive 'close' values)."""
        feature_frame = dataframeToPrepare[['close', 'open', 'volume']]
        close_diff = dataframeToPrepare['close'].diff()
        return feature_frame, close_diff
def create_pipeline_selector(dataset: Dataset):
    """Build a PipelineSelector backed by a TPOT classifier.

    The classifier is configured from the ``config`` module and uses the
    'TPOT sparse' configuration dictionary.

    :param dataset: The Dataset object handed to the PipelineSelector
    :return PipelineSelector: The configured pipeline selector
    """
    tpot_settings = {
        'generations': config.TPOT_NUMBER_OF_GENERATIONS,
        'population_size': config.TPOP_POPULATION_SIZE,
        'n_jobs': config.TPOP_NUMBER_OF_JOBS,
        'verbosity': config.TPOP_LOG_VERBOSITY,
        'config_dict': 'TPOT sparse',
    }
    metaclassifier = TPOTClassifier(**tpot_settings)
    return PipelineSelector(dataset=dataset, metaclassifier=metaclassifier)
def test_generate_pipeline_code_2():
    """Check generate_pipeline_code() on a pipeline that nests two CombineDFs."""
    tpot_obj = TPOTClassifier()

    # The nested-list pipeline is assembled from the innermost pieces outwards.
    gradient_boosting = [
        'GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5]
    min_max = ['MinMaxScaler', 'input_matrix']
    zero_count = ['ZeroCount', ['MaxAbsScaler', 'input_matrix']]
    inner_union = ['CombineDFs', min_max, zero_count]
    outer_union = ['CombineDFs', gradient_boosting, inner_union]
    pipeline = ['KNeighborsClassifier', outer_union, 18, 'uniform', 2]

    # NOTE(review): literal kept exactly as it appears in this view; confirm its
    # line breaks against the generator's real output.
    expected_code = """make_pipeline( make_union( StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)), make_union( MinMaxScaler(), make_pipeline( MaxAbsScaler(), ZeroCount() ) ) ), KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2) )"""

    actual_code = generate_pipeline_code(pipeline, tpot_obj.operators)
    assert expected_code == actual_code
def tune(X_train, X_test, y_train, y_test):
    """Fit a TPOT classifier, report timing and test accuracy, and export the pipeline.

    The best pipeline is written to 'tpot_pipeline.py'.
    """
    # Time construction and fitting together.
    started = time.time()
    tpot = TPOTClassifier(generations=10, verbosity=2)
    tpot.fit(X_train, y_train)
    finished = time.time()

    elapsed = finished - started
    print('TPOT classifier finished in %s seconds' % elapsed)
    print('Best pipeline test accuracy: %.3f' % tpot.score(X_test, y_test))

    # Persist the best pipeline as a Python script.
    tpot.export('tpot_pipeline.py')
def do_tpot(generations=5, population_size=10, X='', y=''):
    """Split (X, y) 80/20, fit a TPOT classifier and return it.

    The hold-out score is printed and the best pipeline is exported to
    'tpot_pipeline.py'.
    """
    split = train_test_split(X, y, train_size=0.80, test_size=0.20)
    X_train, X_test, y_train, y_test = split

    tpot = TPOTClassifier(
        generations=generations,
        population_size=population_size,
        verbosity=2,
        cv=3,
    )
    tpot.fit(X_train, y_train)

    holdout_score = tpot.score(X_test, y_test)
    print(holdout_score)

    tpot.export('tpot_pipeline.py')
    return tpot
def tpot(X_train, y_train, X_test=None, y_test=None,
         export_file='../results/models/tpot/exported_pipeline.py', n_jobs=1):
    """Fit a TPOT classifier, export the best pipeline and print the test score.

    Args:
        X_train: Training features; must expose ``columns`` and ``drop``.
        y_train: Training labels.
        X_test: Optional test features with the same interface as ``X_train``.
        y_test: Optional test labels.
        export_file: Path the best pipeline is exported to.
        n_jobs: Number of parallel jobs passed to TPOT.

    The 'node' and 'target' columns are dropped from a frame only when both
    are present. The score is printed only when test data is supplied.
    """
    id_columns = ['node', 'target']

    # The previous check, `'node' and 'target' in cols`, only tested 'target'.
    if all(column in X_train.columns for column in id_columns):
        X_train = X_train.drop(columns=id_columns)
    # X_test defaults to None, so it must be guarded before reading .columns.
    if X_test is not None and all(column in X_test.columns for column in id_columns):
        X_test = X_test.drop(columns=id_columns)

    # Honour the caller's n_jobs instead of a hard-coded worker count.
    optimizer = TPOTClassifier(generations=5, population_size=40, cv=3,
                               verbosity=2, scoring='f1', n_jobs=n_jobs)
    optimizer.fit(X_train, y_train)
    optimizer.export(export_file)

    if X_test is not None and y_test is not None:
        print(optimizer.score(X_test, y_test))
def check_export(op):
    """Check that exporting ``op`` yields a call string of the form 'Name(...)'.

    One terminal value per parameter type is drawn with a RandomState seeded
    with 42 and passed to ``op.export``.
    """
    tpot_obj = TPOTClassifier(random_state=42)
    rng = np.random.RandomState(42)
    np.random.seed(42)

    # The first entry of the parameter types is skipped.
    parameter_types = op.parameter_types()[0][1:]
    arguments = [
        rng.choice(tpot_obj._pset.terminals[param_type]).value
        for param_type in parameter_types
    ]

    exported = op.export(*arguments)
    opening = op.__name__ + "("
    assert exported.startswith(opening) and exported.endswith(")")
def main():
    """Train TPOT on prepared features plus t-SNE columns and write test predictions.

    Input and output locations and the time limits are read from environment
    variables. The validation score is printed, the best pipeline is exported
    next to the t-SNE file, and the class-1 probabilities for the test set are
    saved as CSV.
    """
    train_frame = pd.read_csv(os.getenv('PREPARED_TRAINING'))
    valid_frame = pd.read_csv(os.getenv('PREPARED_VALIDATING'))
    test_frame = pd.read_csv(os.getenv('PREPARED_TESTING'))

    # Every column but the last is a feature; the last one is the target.
    feature_names = list(train_frame.columns[:-1])
    target_name = train_frame.columns[-1]

    X_train = train_frame[feature_names].values
    y_train = train_frame[target_name].values
    X_valid = valid_frame[feature_names].values
    y_valid = valid_frame[target_name].values
    X_test = test_frame[feature_names].values

    storage_dir = os.getenv('STORING')
    tsne = np.load(os.path.join(storage_dir, 'tsne_2d_5p.npz'))
    tsne_train = tsne['train']
    tsne_valid = tsne['valid']
    tsne_test = tsne['test']

    # Append the t-SNE columns to the prepared features.
    train_matrix = np.concatenate([X_train, tsne_train], axis=1)
    valid_matrix = np.concatenate([X_valid, tsne_valid], axis=1)
    test_matrix = np.concatenate([X_test, tsne_test], axis=1)

    total_minutes = int(os.getenv('TIME_LIMIT_ALL', '1440'))
    per_eval_minutes = int(os.getenv('TIME_LIMIT_PART', '5'))
    tpot = TPOTClassifier(
        max_time_mins=total_minutes,
        max_eval_time_mins=per_eval_minutes,
        population_size=100,
        scoring='log_loss',
        cv=3,
        verbosity=2,
        random_state=67,
    )
    tpot.fit(train_matrix, y_train)

    validation_loss = tpot.score(valid_matrix, y_valid)
    print(validation_loss)
    tpot.export(os.path.join(storage_dir, 'tpot_pipeline.py'))

    # Keep the probability of the second class for each test row.
    test_probabilities = tpot.predict_proba(test_matrix)
    predictions = pd.DataFrame({
        'id': test_frame['id'],
        'probability': test_probabilities[:, 1],
    })

    output_path = os.getenv('PREDICTING')
    predictions.to_csv(output_path, columns=('id', 'probability'), index=None)
    print('Saved: {}'.format(output_path))
def fit(self, dataset, train_data):
    """Fit a feature-transformation + TPOT pipeline on ``train_data``.

    Categorical columns are one-hot encoded, numerical columns are scaled and
    at most one textual column is hashed into n-gram features.

    Raises:
        Exception: if the dataset declares more than one textual column.

    Returns:
        The fitted pipeline.
    """
    y_train = dataset.labels_from(train_data)

    if len(dataset.textual_columns) > 1:
        raise Exception(
            'Can only handle one textual column at the moment.')

    # With a textual column present the sparse threshold is raised to 1.0.
    if len(dataset.textual_columns) > 0:
        sparse_threshold = 1.0
        textual_column = dataset.textual_columns[0]
    else:
        sparse_threshold = 0.3
        textual_column = []

    transformers = [
        ('categorical_features', OneHotEncoder(handle_unknown='ignore'),
         dataset.categorical_columns),
        ('scaled_numeric', StandardScaler(), dataset.numerical_columns),
        ('textual_features',
         HashingVectorizer(ngram_range=(1, 3), n_features=100000),
         textual_column),
    ]
    feature_transformation = ColumnTransformer(
        transformers=transformers, sparse_threshold=sparse_threshold)

    # Not consumed by the TPOT path below; retained from a grid-search variant.
    param_grid = {
        'learner__loss': ['log'],
        'learner__penalty': ['l2', 'l1', 'elasticnet'],
        'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
    }

    optimizer = TPOTClassifier(
        generations=5,
        population_size=20,
        cv=5,
        random_state=42,
        verbosity=2,
        config_dict='TPOT sparse',
        max_time_mins=2,
    )

    steps = [('features', feature_transformation), ('learner', optimizer)]
    return Pipeline(steps).fit(train_data, y_train)
def __init__(self, dataset_path, json_path, n_jobs=1, config_dict=None, task="Classification"):
    """Store the paths and build a minimal TPOT estimator for ``task``.

    Args:
        dataset_path: Stored as ``self.datasets_path``.
        json_path: Stored as ``self.JSON``.
        n_jobs: Number of parallel jobs passed to TPOT.
        config_dict: TPOT configuration dictionary (or name) passed through.
        task: Either "Classification" or "Regression".

    Raises:
        ValueError: if ``task`` is neither "Classification" nor "Regression".
    """
    self.scores = []
    self.datasets_path = dataset_path
    self.JSON = json_path

    if task == "Classification":
        self.tpot = TPOTClassifier(population_size=1, generations=0, verbosity=0,
                                   n_jobs=n_jobs, config_dict=config_dict,
                                   warm_start=True)
    elif task == "Regression":
        self.tpot = TPOTRegressor(population_size=1, generations=0, verbosity=0,
                                  n_jobs=n_jobs, config_dict=config_dict,
                                  warm_start=True)
    else:
        # Same exception type as before, now with the offending value.
        raise ValueError(
            "task must be 'Classification' or 'Regression', got %r" % (task,))

    self.tpot._fit_init()  # Create _pset (PrimitiveSet)
def find_best_model(X_train, X_test, y_train, y_test):
    """Run a TPOT search with the sparse configuration, print the test score and export.

    The best pipeline is written to 'outputs/tpot_exported_pipeline.py'.
    """
    search_settings = dict(
        generations=100,
        population_size=50,
        cv=5,
        random_state=42,
        verbosity=2,
        config_dict='TPOT sparse',
    )
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train, y_train)

    test_score = optimizer.score(X_test, y_test)
    print(test_score)

    optimizer.export('outputs/tpot_exported_pipeline.py')
def tpot_train(cat, X, y):
    """Split (X, y) 80/20, fit a ROC-AUC-scored TPOT classifier and export it.

    The hold-out score is printed and the pipeline is exported to
    ``cat + '-pipeline.py'``.
    """
    split = train_test_split(X, y, train_size=0.8, test_size=0.2)
    X_train, X_test, y_train, y_test = split

    tpot = TPOTClassifier(
        generations=15,
        population_size=20,
        verbosity=5,
        n_jobs=-1,
        scoring='roc_auc',
    )
    tpot.fit(X_train, y_train)

    holdout_score = tpot.score(X_test, y_test)
    print(holdout_score)

    tpot.export(cat + '-pipeline.py')
def train_model(pipeline_string, classi):
    """Compile a TPOT pipeline string into a scikit-learn pipeline.

    Args:
        pipeline_string: DEAP string form of the pipeline.
        classi: Name of the final classifier. When it is "LinearSVC" the last
            step is replaced by a linear-kernel ``SVC`` with probability
            estimates enabled and the same ``C`` value.

    Returns:
        The compiled (unfitted) pipeline object.
    """
    tpot = TPOTClassifier()

    # Convert the pipeline string into a scikit-learn pipeline object.
    deap_pipeline = creator.Individual.from_string(pipeline_string, tpot._pset)
    clf = tpot._toolbox.compile(expr=deap_pipeline)

    if classi == "LinearSVC":
        from sklearn.svm import SVC

        # Read C from the estimator itself. Parsing it out of the repr fails
        # for scientific notation and when the repr omits the parameter.
        _step_name, linear_svc = clf.steps.pop()
        C_val = float(linear_svc.C)
        clf.steps.append(
            ('svc', SVC(kernel='linear', probability=True, C=C_val, tol=1e-05)))

    return clf
def test_mate_operator_stats_update():
    """Check the statistics bookkeeping done by _mate_operator.

    The first offspring must carry the summed crossover counts plus one, the
    summed mutation counts, and both parents' string forms as predecessors.
    """
    tpot_obj = TPOTClassifier()

    parent_a = creator.Individual.from_string(
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')', tpot_obj._pset)
    parent_b = creator.Individual.from_string(
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=2, '
        'KNeighborsClassifier__weights=uniform'
        ')', tpot_obj._pset)

    initialize_stats_dict(parent_a)
    initialize_stats_dict(parent_b)

    # Give both parents arbitrary starting counters.
    parent_a.statistics["crossover_count"] = random.randint(0, 10)
    parent_a.statistics["mutation_count"] = random.randint(0, 10)
    parent_b.statistics["crossover_count"] = random.randint(0, 10)
    parent_b.statistics["mutation_count"] = random.randint(0, 10)

    # Register both parents as already evaluated pipelines.
    evaluated = tpot_obj.evaluated_individuals_
    evaluated[str(parent_a)] = tpot_obj._combine_individual_stats(2, 0.99, parent_a.statistics)
    evaluated[str(parent_b)] = tpot_obj._combine_individual_stats(2, 0.99, parent_b.statistics)

    # Ten rounds of mating.
    for _ in range(10):
        child, _unused_sibling = tpot_obj._mate_operator(parent_a, parent_b)
        stats_a, stats_b = parent_a.statistics, parent_b.statistics

        assert child.statistics['crossover_count'] == (
            stats_a['crossover_count'] + stats_b['crossover_count'] + 1)
        assert child.statistics['mutation_count'] == (
            stats_a['mutation_count'] + stats_b['mutation_count'])
        assert child.statistics['predecessor'] == (str(parent_a), str(parent_b))

        # The offspring takes the place of one parent, picked at random;
        # no cloning is needed here.
        if random.random() < 0.5:
            parent_a = child
        else:
            parent_b = child
def __init__(self, openML_id, scoring_function, memory_path = None, max_time=None):
    """Load a dataset, configure a ``TPOTClassifier`` and fit it.

    Args:
        openML_id: Identifier handed to ``self.get_dataset`` to obtain
            the train/test split.
        scoring_function: Passed to TPOT as ``scoring``.
        memory_path: Optional value passed to TPOT as ``memory``. When it
            points at an existing file, TPOT is created with
            ``warm_start=True`` instead of a time limit.
        max_time: Passed to TPOT as ``max_time_mins`` (not used in the
            warm-start case).
    """
    self.y_class_dict = None
    self.X_train, self.X_test, self.y_train, self.y_test = self.get_dataset(openML_id)

    tpot_kwargs = {"scoring": scoring_function, "verbosity": 3}
    if memory_path is not None:
        tpot_kwargs["memory"] = memory_path

    if memory_path is not None and Path(memory_path).is_file():
        # NOTE(review): this branch never forwarded max_time; kept as-is,
        # confirm whether the warm-start run should also be time-limited.
        tpot_kwargs["warm_start"] = True
    else:
        tpot_kwargs["max_time_mins"] = max_time

    self.tpot = TPOTClassifier(**tpot_kwargs)
    self.tpot.fit(self.X_train, self.y_train)
def model_chooser():
    """Build the estimator selected by the module-level ``model_choice``.

    ``model_choice`` is compared against the strings ``'1'`` to ``'5'``;
    any other value initialises H2O and returns an
    ``H2ODeepLearningEstimator``.

    Returns:
        A freshly constructed, unfitted estimator.
    """
    if model_choice == '1':
        return LogisticRegression(random_state=0, solver='lbfgs',
                                  multi_class='multinomial')
    if model_choice == '2':
        return MLPClassifier(hidden_layer_sizes=(10, 7, 5, 2))
    if model_choice == '3':
        return RandomForestClassifier(n_estimators=100)
    if model_choice == '4':
        # Imported lazily so auto-sklearn is only required when selected.
        import autosklearn.classification
        return autosklearn.classification.AutoSklearnClassifier()
    if model_choice == '5':
        # The TPOT keyword is ``generations``; ``generation`` is rejected
        # by the constructor.
        return TPOTClassifier(generations=5, population_size=20, verbosity=2)

    h2o.init()
    return H2ODeepLearningEstimator()
def tpot_select_model(x_train, y_train, x_test, y_test):
    """Run a TPOT search, print its test score and export the best pipeline.

    The search is fitted on ``x_train``/``y_train`` and scored on
    ``x_test``/``y_test``. The exported script is written to
    ``tpot_exported_pipeline.py``.
    """
    from tpot import TPOTClassifier

    search_settings = dict(generations=10, population_size=50,
                           verbosity=2, n_jobs=-1)
    optimizer = TPOTClassifier(**search_settings)

    optimizer.fit(x_train, y_train)

    # Report performance on the held-out data.
    test_score = optimizer.score(x_test, y_test)
    print(test_score)

    # Write out the script that rebuilds the best model found.
    optimizer.export('tpot_exported_pipeline.py')
def run(self, train_ratio=1.0):
    """Build the configured model, run leave-one-out predictions, save it.

    Steps: parse the spectra, instantiate the model selected by
    ``self.method`` (``'tpot'``, ``'tree'`` or ``'neat'``), optionally
    pool the spectra, create the dataset, then for every training sample
    refit on all other samples and predict the one left out. Predictions
    are collected in ``self.predictions``. Finally the model is explained
    and pickled to ``final_model.pkl`` inside ``self.output_dir``.

    Args:
        train_ratio: Forwarded to the dataset-creation helper.
    """
    import pickle

    self.parse_spectra()

    if self.method == 'tpot':
        self.model = TPOTClassifier(generations=self.generations,
                                    population_size=self.pop_size,
                                    mutation_rate=0.9,
                                    crossover_rate=0.1,
                                    scoring='accuracy',
                                    cv=LeaveOneOut(),
                                    subsample=1.0,
                                    n_jobs=self.processes,
                                    max_eval_time_mins=5,
                                    random_state=None,
                                    verbosity=self.verbose,
                                    disable_update_check=True)
    elif self.method == 'tree':
        self.model = ExtraTreesClassifier()
    elif self.method == 'neat':
        self.model = NEATClassifier(generations=self.generations,
                                    population_size=self.pop_size,
                                    scoring='accuracy',
                                    n_jobs=self.processes,
                                    max_time_msec=45,
                                    verbosity=2)

    if self.pool:
        # Keep an untouched copy before pooling modifies the spectra.
        self.original_spectra = copy.deepcopy(self.spectra)
        self.__pool(self.pool)

    self.X, self.X_test, self.y, self.y_test = self.__create_dataset(
        train_ratio)

    if self.verbose > 0:
        print('Starting model cross-testing...')

    self.predictions = []
    for i, held_out_sample in enumerate(self.X):
        # Train on everything except sample i, then predict sample i.
        X_train = np.delete(self.X, i, axis=0)
        y_train = np.delete(self.y, i, axis=0)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict([held_out_sample])
        self.predictions.append(y_pred)

    self.explain_model()

    # A file opened in 'wb' mode only accepts bytes, so the estimator has
    # to be serialised rather than written directly.
    # NOTE(review): TPOT documents ``fitted_pipeline_`` as the picklable
    # trained pipeline; it is preferred when present -- confirm this is
    # the desired artefact for every supported method.
    model_to_save = getattr(self.model, 'fitted_pipeline_', self.model)
    with open(join(self.output_dir, 'final_model.pkl'), 'wb') as f:
        pickle.dump(model_to_save, f)

    print('Program finished.')
def test_warm_start():
    """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run."""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1,
                              offspring_size=2, generations=1,
                              verbosity=0, warm_start=True)
    tpot_obj.fit(training_features, training_classes)

    # Identity checks: ``!= None`` would go through the objects' own
    # comparison methods instead of testing for None.
    assert tpot_obj._pop is not None
    assert tpot_obj._pareto_front is not None

    first_pop = tpot_obj._pop

    # Refit with a different seed; with warm_start the population from
    # the first run is expected to still compare equal afterwards.
    tpot_obj.random_state = 21
    tpot_obj.fit(training_features, training_classes)

    assert tpot_obj._pop == first_pop
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit TPOT, export its pipeline, and write timing and predictions.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features to predict on.
        id_test: Values written to the ``id`` column of the submission.
        name_dataset: Prefix for the two output files,
            ``<name_dataset>_tpot`` (timing) and
            ``<name_dataset>_tpot_submission.csv`` (predictions).
    """
    tp = TPOTClassifier(verbosity=3)

    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_dont_overfit.py')
    # Named ``elapsed`` so the stdlib ``time`` module name is not shadowed.
    elapsed = timer(start_time)

    preds = tp.predict(X_test)

    # The context manager closes the file even if the write fails.
    with open(name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds})
    submission.to_csv(name_dataset + '_' + 'tpot' + '_submission.csv',
                      index=False)
def fit(self, df, target, **fit_kwargs):
    """
    Fit a TPOT backend (classifier or regressor) on a dataframe.

    The learning mode is decided from the target column. Defaults for
    ``template``, ``config_dict`` and ``scoring`` are filled into
    ``self.tpot_kwargs`` only where the caller has not set them.

    Args:
        df (pandas.DataFrame): Training data, including the target column.
        target (str): Name of the target column in ``df``.
        **fit_kwargs: Extra keyword arguments forwarded to the backend's
            ``fit`` call.

    Returns:
        TPOTAdaptor (self)

    Raises:
        ValueError: If the detected mode is neither the classification
            nor the regression mode.
    """
    # Work with plain arrays rather than pandas objects.
    feature_frame = df.drop(columns=target)
    X = feature_frame.values
    y = df[target].values

    # Classification or regression, judged from the target column.
    self.mode = regression_or_classification(df[target])
    is_classification = self.mode == AMM_CLF_NAME

    kwargs = self.tpot_kwargs
    estimator_kind = "Classifier" if is_classification else "Regressor"
    kwargs.setdefault("template",
                      "Selector-Transformer-{}".format(estimator_kind))

    if is_classification:
        backend_cls = TPOTClassifier
        default_config = TPOT_CLASSIFIER_CONFIG
        default_scoring = "balanced_accuracy"
    elif self.mode == AMM_REG_NAME:
        backend_cls = TPOTRegressor
        default_config = TPOT_REGRESSOR_CONFIG
        default_scoring = "neg_mean_absolute_error"
    else:
        raise ValueError("Learning type {} not recognized as a valid mode "
                         "for {}".format(self.mode, self.__class__.__name__))

    kwargs.setdefault("config_dict", default_config)
    kwargs.setdefault("scoring", default_scoring)
    self._backend = backend_cls(**kwargs)

    self._features = feature_frame.columns.tolist()
    self._fitted_target = target
    self._backend = self._backend.fit(X, y, **fit_kwargs)
    return self
def classification():
    """Run a seeded TPOT search on the digits data, print the score, export.

    The best pipeline is written to ``tpot_digits_pipeline.py``.
    """
    dataset = load_digits()
    split = train_test_split(dataset.data, dataset.target,
                             train_size=0.75, test_size=0.25,
                             random_state=42)
    X_train, X_test, y_train, y_test = split

    optimizer = TPOTClassifier(generations=5, population_size=50,
                               verbosity=2, random_state=42)
    optimizer.fit(X_train, y_train)

    holdout_score = optimizer.score(X_test, y_test)
    print(holdout_score)

    optimizer.export('tpot_digits_pipeline.py')
def tpot_optimization_clf(count, train_path, test_path, verbose=False):
    """
    Search algorithms and hyper-parameters for classification with TPOT.

    The search space is restricted to ``XGBClassifier`` and
    ``RandomForestClassifier``. The test score is printed and the best
    pipeline is exported to ``tpot_pipeline_clf.py``.

    :param count: int, number of samples to be generated.
    :param train_path: string, path to the dataset used for training.
    :param test_path: string, path to the dataset used for testing.
    :param verbose: bool, whether progress messages are printed.
    """
    def _report(message):
        # Progress output is shown only in verbose mode.
        if verbose:
            print(message)

    _report("Get train samples. ")
    X_train, Y_train = Sampler.generate_samples(dataset=train_path, count=count)

    _report("Get test samples. ")
    X_test, Y_test = Sampler.generate_samples(dataset=test_path, count=count)

    xgb_space = {
        'max_depth': [2, 3, 4, 5],
        "learning_rate": [0.02, 0.05, 0.1, 0.15, 0.2],
        'n_estimators': [10, 20, 30, 40, 50, 100, 500],
        'objective': ["reg:linear", "multi:softmax", "multi:softprob"],
        'booster': ["gbtree", "gblinear", "dart"],
        'n_jobs': [-1],
    }
    forest_space = {
        'n_estimators': [10, 20, 30, 40, 50, 100, 500],
        'criterion': ["gini", "entropy"],
        'max_features': ["auto", "sqrt", "log2"],
        'max_depth': [2, 3, 4, 5],
        'n_jobs': [-1],
    }
    tpot_config = {
        'xgboost.XGBClassifier': xgb_space,
        'sklearn.ensemble.RandomForestClassifier': forest_space,
    }

    _report("Start TPOT optimization. ")
    optimizer = TPOTClassifier(generations=10, population_size=30,
                               verbosity=2, config_dict=tpot_config)
    optimizer.fit(np.array(X_train), np.array(Y_train))

    test_features = np.array(X_test, dtype=np.float64)
    test_labels = np.array(Y_test, dtype=np.float64)
    print(optimizer.score(test_features, test_labels))

    optimizer.export('tpot_pipeline_clf.py')
def test_export_pipeline():
    """Assert that export_pipeline() generates the expected source code for a fixed pipeline."""
    tpot_obj = TPOTClassifier()
    # Fixed pipeline expression: a KNeighborsClassifier fed by CombineDFs
    # of a DecisionTreeClassifier branch and a SelectKBest branch.
    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform')
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    # Source text the exporter is expected to produce for the pipeline above.
    # NOTE(review): the literal is kept exactly as it appears in this file,
    # on a single line; confirm that its whitespace matches what
    # export_pipeline() actually emits.
    expected_code = """import numpy as np from copy import copy from sklearn.ensemble import VotingClassifier from sklearn.feature_selection import SelectKBest, f_classif from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import FunctionTransformer from sklearn.tree import DecisionTreeClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_classes, testing_classes = \\ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = make_pipeline( make_union( make_union(VotingClassifier([('branch', DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) )]), FunctionTransformer(copy)), SelectKBest(score_func=f_classif, k=20) ), KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") ) exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) """
    # The exported code must match the expected text exactly.
    assert expected_code == export_pipeline(pipeline,
                                            tpot_obj.operators, tpot_obj._pset)
test_operator_key_2 = 'sklearn.feature_selection.SelectFromModel' TPOTSelectPercentile, TPOTSelectPercentile_args = TPOTOperatorClassFactory( test_operator_key_1, classifier_config_dict[test_operator_key_1] ) TPOTSelectFromModel, TPOTSelectFromModel_args = TPOTOperatorClassFactory( test_operator_key_2, classifier_config_dict[test_operator_key_2] ) mnist_data = load_digits() training_features, testing_features, training_target, testing_target = \ train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42) tpot_obj = TPOTClassifier() tpot_obj._fit_init() tpot_obj_reg = TPOTRegressor() tpot_obj_reg._fit_init() def test_export_random_ind(): """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 39.""" tpot_obj = TPOTClassifier(random_state=39, config_dict="TPOT light") tpot_obj._fit_init() tpot_obj._pbar = tqdm(total=1, disable=True) pipeline = tpot_obj._toolbox.individual() expected_code = """import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.naive_bayes import BernoulliNB
def test_set_params():
    """Check that calling set_params() hands back the very same TPOT object."""
    tpot_obj = TPOTClassifier()
    returned = tpot_obj.set_params()
    assert returned is tpot_obj
def test_set_params_2():
    """Check that set_params() overwrites an attribute given at construction."""
    initial_generations, updated_generations = 2, 3

    tpot_obj = TPOTClassifier(generations=initial_generations)
    tpot_obj.set_params(generations=updated_generations)

    assert tpot_obj.generations == updated_generations
# Example script: run a short TPOT search on the scikit-learn digits data,
# print the score on the held-out split, and export the best pipeline.
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()

# 75% of the samples are used for the search, 25% are held out for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    train_size=0.75,
    test_size=0.25,
)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)

holdout_score = tpot.score(X_test, y_test)
print(holdout_score)

tpot.export('tpot_mnist_pipeline.py')
def generate_model(generations, train_X, train_y):
    """Run a TPOT search and export the best pipeline to a script.

    Args:
        generations: Number of generations, passed to ``TPOTClassifier``
            and embedded in the output file name.
        train_X: Training features.
        train_y: Training targets.

    The pipeline is exported to ``tpot_model<generations>.py``.
    """
    tpot_generator = TPOTClassifier(generations=generations, verbosity=2)
    tpot_generator.fit(train_X, train_y)
    # ``generations`` is a number, so it is formatted into the file name;
    # concatenating it to a str with ``+`` raises TypeError.
    tpot_generator.export('tpot_model{}.py'.format(generations))