def testTPOTOperatorClassFactory():
    """Assert that the TPOT operators class factory"""
    test_config_dict = {
        'sklearn.svm.LinearSVC': {
            'penalty': ["l1", "l2"],
            'loss': ["hinge", "squared_hinge"],
            'dual': [True, False],
            'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
            'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]
        },
        'sklearn.linear_model.LogisticRegression': {
            'penalty': ["l1", "l2"],
            'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
            'dual': [True, False]
        },
        'sklearn.preprocessing.Binarizer': {
            'threshold': np.arange(0.0, 1.01, 0.05)
        }
    }

    # Build one operator class (and its argument types) per config entry,
    # iterating keys in sorted order so results are deterministic.
    built_operators = []
    built_arguments = []
    for config_key in sorted(test_config_dict):
        operator_class, argument_types = TPOTOperatorClassFactory(
            config_key, test_config_dict[config_key]
        )
        built_operators.append(operator_class)
        built_arguments.extend(argument_types)

    assert len(built_operators) == 3
    assert len(built_arguments) == 9
    assert built_operators[0].root == True
    assert built_operators[1].root == False
    assert built_operators[2].type() == "Classifier or Regressor"
    assert built_arguments[1].values == [True, False]
# Fixed: numpy is used below (np.float64) but was never imported in this
# module's import block, which would raise NameError at import time.
import numpy as np

from tpot import TPOTClassifier, TPOTRegressor
from tpot.export_utils import export_pipeline, generate_import_code, _indent, \
    generate_pipeline_code, get_by_name, set_param_recursive
from tpot.operator_utils import TPOTOperatorClassFactory
from tpot.config.classifier import classifier_config_dict

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

from deap import creator
from nose.tools import assert_raises, assert_equal, nottest

# Wrap sklearn's train_test_split with nottest so nose does not collect it
# as a test (its name starts with "test").
train_test_split = nottest(train_test_split)

test_operator_key_1 = 'sklearn.feature_selection.SelectPercentile'
test_operator_key_2 = 'sklearn.feature_selection.SelectFromModel'

# Module-level operator-class fixtures shared by the tests below.
TPOTSelectPercentile, TPOTSelectPercentile_args = TPOTOperatorClassFactory(
    test_operator_key_1, classifier_config_dict[test_operator_key_1]
)
TPOTSelectFromModel, TPOTSelectFromModel_args = TPOTOperatorClassFactory(
    test_operator_key_2, classifier_config_dict[test_operator_key_2]
)

# Shared digits dataset split, seeded for reproducibility.
digits_data = load_digits()
training_features, testing_features, training_target, testing_target = \
    train_test_split(digits_data.data.astype(np.float64),
                     digits_data.target.astype(np.float64),
                     random_state=42)

# Pre-initialized TPOT objects used as fixtures.
tpot_obj = TPOTClassifier()
tpot_obj._fit_init()
tpot_obj_reg = TPOTRegressor()
def _fit_init(self):
    """Initialize all per-fit state on the TPOT object.

    Resets evolutionary bookkeeping (unless warm-starting), builds the
    operator/argument classes from the configuration dictionary, validates
    user parameters (mutation+crossover rates, subsample ratio), and sets
    up the scoring function, primitive set, and DEAP toolbox.

    Raises
    ------
    ValueError
        If mutation_rate + crossover_rate > 1, or if subsample is outside
        the range (0.0, 1.0].
    """
    # Reset population/Pareto-front state unless the user asked for a
    # warm start AND a previous fit already populated the Pareto front.
    if not self.warm_start or not hasattr(self, '_pareto_front'):
        self._pop = []
        self._pareto_front = None
        self._last_optimized_pareto_front = None
        self._last_optimized_pareto_front_n_gens = 0

    self._optimized_pipeline = None
    self._optimized_pipeline_score = None
    self._exported_pipeline_text = ""
    self.fitted_pipeline_ = None
    self._fitted_imputer = None
    self._imputed = False
    self._memory = None  # initial Memory setting for sklearn pipeline

    # dont save periodic pipelines more often than this
    self._output_best_pipeline_period_seconds = 30

    # Try crossover and mutation at most this many times for
    # any one given individual (or pair of individuals)
    self._max_mut_loops = 50

    self._setup_config(self.config_dict)

    # Build operator classes and their argument types from the config.
    self.operators = []
    self.arguments = []
    for key in sorted(self._config_dict.keys()):
        op_class, arg_types = TPOTOperatorClassFactory(
            key,
            self._config_dict[key],
            BaseClass=Operator,
            ArgBaseClass=ARGType
        )
        if op_class:
            self.operators.append(op_class)
            self.arguments += arg_types

    # Schedule TPOT to run for many generations if the user specifies a
    # run-time limit TPOT will automatically interrupt itself when the timer
    # runs out
    if self.max_time_mins is not None:
        self.generations = 1000000

    # Prompt the user if their version is out of date
    if not self.disable_update_check:
        update_check('tpot', __version__)

    if self.mutation_rate + self.crossover_rate > 1:
        raise ValueError(
            'The sum of the crossover and mutation probabilities must be <= 1.0.'
        )

    # Names made available when exec'ing/eval'ing exported pipeline code.
    self.operators_context = {
        'make_pipeline': make_pipeline,
        'make_union': make_union,
        'StackingEstimator': StackingEstimator,
        'FunctionTransformer': FunctionTransformer,
        'copy': copy
    }

    self._pbar = None
    # Specifies where to output the progress messages (default: sys.stdout).
    # Maybe open this API in future version of TPOT.(io.TextIOWrapper or io.StringIO)
    self._file = sys.stdout

    # Dictionary of individuals that have already been evaluated in previous
    # generations
    self.evaluated_individuals_ = {}

    self._setup_scoring_function(self.scoring)

    if self.subsample <= 0.0 or self.subsample > 1.0:
        raise ValueError(
            'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
        )

    # -1 means "use all CPUs"; any other value is taken as-is.
    if self.n_jobs == -1:
        self._n_jobs = cpu_count()
    else:
        self._n_jobs = self.n_jobs

    self._setup_pset()
    self._setup_toolbox()

    ## Additions to _fit_init
    # Initialise list to save the predictions and pipelines analysed by TPOT
    self.predictions = []
    self.pipelines = []
    # NOTE(review): this overrides the "" assigned above, changing the
    # attribute's type from str to list — confirm downstream code expects a list.
    self._exported_pipeline_text = []
    # Save training sample on the TPOT Object
    self.features = None
    self.target = None
    self.evaluated_individuals = {}
    self.curr_generations = 0
    self.log = {}

    # Add the Gaussian kernels so that they can be used by TPOT.
    # Fixed: reference the kernel classes directly instead of eval()-ing
    # their names — eval('RBF') resolves the same module-level name, so
    # behavior is identical, without the needless eval.
    self.operators_context['RBF'] = RBF
    self.operators_context['Matern'] = Matern
    self.operators_context['RationalQuadratic'] = RationalQuadratic
    self.operators_context['ExpSineSquared'] = ExpSineSquared
    self.operators_context['DotProduct'] = DotProduct
    self.operators_context['ConstantKernel'] = ConstantKernel
# Set up the MNIST data set for testing mnist_data = load_digits() training_features, testing_features, training_classes, testing_classes = \ train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42) # Set up the Boston data set for testing boston_data = load_boston() training_features_r, testing_features_r, training_classes_r, testing_classes_r = \ train_test_split(boston_data.data, boston_data.target, random_state=42) np.random.seed(42) random.seed(42) test_operator_key = 'sklearn.feature_selection.SelectKBest' TPOTSelectKBest, TPOTSelectKBest_args = TPOTOperatorClassFactory( test_operator_key, classifier_config_dict[test_operator_key]) def test_driver(): """Assert that the TPOT driver output normal result""" batcmd = "python -m tpot.driver tests.csv -is , -target class -g 2 -p 2 -os 4 -cv 5 -s 45 -v 1" ret_stdout = subprocess.check_output(batcmd, shell=True) try: ret_val = float( ret_stdout.decode('UTF-8').split('\n')[-2].split(': ')[-1]) except: ret_val = -float('inf') assert ret_val > 0.0 def test_init_custom_parameters():
from os import remove, path

from tpot import TPOTClassifier, TPOTRegressor
from tpot.export_utils import export_pipeline, generate_import_code, _indent, generate_pipeline_code, get_by_name
from tpot.operator_utils import TPOTOperatorClassFactory
from tpot.config.classifier import classifier_config_dict

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

from deap import creator
from nose.tools import assert_raises, assert_equal

# Module-level fixture: the SelectPercentile operator class built from the
# default classifier configuration.
test_operator_key = 'sklearn.feature_selection.SelectPercentile'
TPOTSelectPercentile, TPOTSelectPercentile_args = TPOTOperatorClassFactory(
    test_operator_key, classifier_config_dict[test_operator_key])

# Shared digits dataset split, seeded for reproducibility.
mnist_data = load_digits()
training_features, testing_features, training_target, testing_target = \
    train_test_split(mnist_data.data.astype(np.float64),
                     mnist_data.target.astype(np.float64),
                     random_state=42)


def test_export_random_ind():
    """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 39."""
    tpot_obj = TPOTClassifier(random_state=39)
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline = tpot_obj._toolbox.individual()
    # NOTE(review): the expected_code literal continues beyond this chunk of
    # the file (the closing triple quote is not visible here).
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
def rebuild_me(self):
    """Reconstruct the non-serializable DEAP state of this TPOT object.

    Rebuilds operator classes, the operators context, the primitive set,
    the toolbox, the population, the Pareto front and the optimized
    pipeline from the serializable attributes (``evaluated_individuals_``,
    ``pareto_front_fitted_pipelines_``, ``_optimized_pipeline`` as a
    string). Returns ``self`` on success, or ``None`` when there is no
    evaluated-individual data to rebuild from (warm_start is then disabled).
    """
    # Ensure every expected non-serializable attribute at least exists.
    for a in NON_SERIALIZABLE:
        if not hasattr(self, a):
            setattr(self, a, [])
    self._setup_config(self.config_dict)
    self._setup_template(self.template)
    # Silence output while rebuilding; restored to 3 before returning.
    self.verbosity = 0
    self.warm_start = True
    # Rebuild operator classes and argument types from the config dict.
    for key in sorted(self._config_dict.keys()):
        op_class, arg_types = TPOTOperatorClassFactory(
            key,
            self._config_dict[key],
            BaseClass=Operator,
            ArgBaseClass=ARGType,
            verbose=self.verbosity,
        )
        if op_class:
            self.operators.append(op_class)
            self.arguments += arg_types
    # Names made available when materializing exported pipeline code.
    self.operators_context = {
        "make_pipeline": make_pipeline,
        "make_union": make_union,
        "StackingEstimator": StackingEstimator,
        "FunctionTransformer": FunctionTransformer,
        "copy": copy,
    }
    # Fresh Pareto front; individuals are "similar" if fitness values match.
    setattr(
        self, '_pareto_front',
        tools.ParetoFront(similar=lambda ind1, ind2: np.allclose(
            ind1.fitness.values, ind2.fitness.values)))
    self._pbar = None
    self._setup_pset()
    self._setup_toolbox()
    self._pop = self._toolbox.population(n=self.population_size)
    # Rebuild Pareto-front items/keys from whichever pipeline record exists:
    # prefer the fitted Pareto-front pipelines, fall back to all evaluated
    # individuals; bail out if neither is available.
    if hasattr(self, 'pareto_front_fitted_pipelines_') and isinstance(
            self.pareto_front_fitted_pipelines_, dict):
        items = [
            creator.Individual(PrimitiveTree([]).from_string(i, self._pset))
            for i in self.pareto_front_fitted_pipelines_.keys()
        ]
        keys = [(lambda d: creator.FitnessMulti(
            (d.get('operator_count', 0), d.get('internal_cv_score', 0))))(
                self.evaluated_individuals_.get(k, {}))
                for k in self.pareto_front_fitted_pipelines_.keys()]
    elif hasattr(self, 'evaluated_individuals_') and isinstance(
            self.evaluated_individuals_, dict):
        items = [
            creator.Individual(PrimitiveTree([]).from_string(i, self._pset))
            for i in self.evaluated_individuals_.keys()
        ]
        keys = [
            creator.FitnessMulti((d.get('operator_count', 0),
                                  d.get('internal_cv_score', 0)))
            for d in self.evaluated_individuals_.values()
        ]
    else:
        # Nothing to rebuild from: disable warm start and return None.
        self.warm_start = False
        self.verbosity = 3
        return
    # NOTE(review): add_attr is defined elsewhere — presumably attaches
    # fitness keys to the rebuilt individuals; confirm at its definition.
    items = add_attr(keys, items)
    setattr(self._pareto_front, 'items', items)
    setattr(self._pareto_front, 'keys', sorted(keys))
    # Re-materialize the optimized pipeline if it was serialized as a string.
    if hasattr(self, '_optimized_pipeline') and isinstance(
            self._optimized_pipeline, str):
        optimized_pipeline = creator.Individual(
            PrimitiveTree([]).from_string(self._optimized_pipeline,
                                          self._pset))
        optimized_pipeline.__str__ = partial(PrimitiveTree.__str__,
                                             optimized_pipeline)
        # Look up the recorded fitness for this exact pipeline string.
        keys = [
            creator.FitnessMulti((d.get('operator_count', 0),
                                  d.get('internal_cv_score', 0)))
            for k, d in self.evaluated_individuals_.items()
            if k == optimized_pipeline.__str__()
        ]
        if len(keys) > 0:
            optimized_pipeline.fitness = keys[0]
        else:
            # No recorded fitness: mark as worst-possible so it never wins.
            optimized_pipeline.fitness.values = (5000.0, -float("inf"))
        self._optimized_pipeline = optimized_pipeline
    # Last element of each fitness tuple (the internal CV score).
    setattr(self, '_last_optimized_pareto_front',
            [v for i in self._pareto_front.keys for v in i.values[-1:]])
    if not hasattr(self, '_last_optimized_pareto_front_n_gens'):
        if hasattr(self, 'evaluated_individuals_'):
            last_gen = max([
                d.get('generation')
                for d in list(self.evaluated_individuals_.values())
            ])
        else:
            last_gen = 0
        setattr(self, '_last_optimized_pareto_front_n_gens', last_gen)
    else:
        last_gen = self._last_optimized_pareto_front_n_gens
    # Synthesize an evaluated_individuals_ record from the Pareto front if
    # none was serialized at all.
    if not hasattr(self, 'evaluated_individuals_'):
        setattr(
            self, 'evaluated_individuals_', {
                p.__str__(): (lambda v: {
                    'generation': last_gen,
                    'mutation_count': 0,
                    'crossover_count': 0,
                    'predecessor': ('ROOT', ),
                    'operator_count': v[0],
                    'internal_cv_score': v[-1]
                })(self._pareto_front.keys[i].values)
                for i, p in enumerate(self._pareto_front.items)
            })
    self.verbosity = 3
    return self