def test_generate_pipeline_code():
    """Check the code generate_pipeline_code() emits for a pipeline with one CombineDFs node."""
    # The pipeline tree is assembled from its two CombineDFs branches.
    boosting_branch = ['GradientBoostingClassifier', 'input_matrix', 38.0, 0.87, 0.5]
    naive_bayes_branch = ['GaussianNB', ['ZeroCount', 'input_matrix']]
    tree = [
        'KNeighborsClassifier',
        ['CombineDFs', boosting_branch, naive_bayes_branch],
        18,
        33,
    ]

    # NOTE(review): the expected text is kept on a single line exactly as it
    # appears in this view; confirm its whitespace against the generator's output.
    wanted = """make_pipeline( make_union( make_union(VotingClassifier(estimators=[('branch', GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, min_weight_fraction_leaf=0.5, n_estimators=500) )]), FunctionTransformer(lambda X: X)), make_union(VotingClassifier(estimators=[('branch', make_pipeline( ZeroCount(), GaussianNB() ) )]), FunctionTransformer(lambda X: X)) ), KNeighborsClassifier(n_neighbors=5, weights="distance") )"""

    generated = generate_pipeline_code(tree)
    assert wanted == generated
def test_generate_pipeline_code():
    """Check the code generate_pipeline_code() emits for a pipeline with one CombineDFs node."""
    # Left and right inputs of the CombineDFs node, named for readability.
    left_input = ['GradientBoostingClassifier', 'input_matrix', 38.0, 0.87, 0.5]
    right_input = ['GaussianNB', ['ZeroCount', 'input_matrix']]
    combined = ['CombineDFs', left_input, right_input]
    tree = ['KNeighborsClassifier', combined, 18, 33]

    # NOTE(review): the expected text is kept on a single line exactly as it
    # appears in this view; confirm its whitespace against the generator's output.
    wanted = """make_pipeline( make_union( make_union(VotingClassifier(estimators=[('branch', GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, min_weight_fraction_leaf=0.5, n_estimators=500) )]), FunctionTransformer(lambda X: X)), make_union(VotingClassifier(estimators=[('branch', make_pipeline( ZeroCount(), GaussianNB() ) )]), FunctionTransformer(lambda X: X)) ), KNeighborsClassifier(n_neighbors=5, weights="distance") )"""

    generated = generate_pipeline_code(tree)
    assert wanted == generated
def test_generate_pipeline_code_2():
    """Check the code generate_pipeline_code() emits for a pipeline with two nested CombineDFs nodes."""
    # Inner CombineDFs: a scaler next to a two-step ZeroCount chain.
    scaler_branch = ['MinMaxScaler', 'input_matrix']
    zero_count_branch = ['ZeroCount', ['MaxAbsScaler', 'input_matrix']]
    inner_union = ['CombineDFs', scaler_branch, zero_count_branch]

    # Outer CombineDFs: the boosted-tree branch next to the inner union.
    boosting_branch = [
        'GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5,
    ]
    outer_union = ['CombineDFs', boosting_branch, inner_union]
    tree = ['KNeighborsClassifier', outer_union, 18, 'uniform', 2]

    # NOTE(review): the expected text is kept on a single line exactly as it
    # appears in this view; confirm its whitespace against the generator's output.
    wanted = """make_pipeline( make_union( StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)), make_union( MinMaxScaler(), make_pipeline( MaxAbsScaler(), ZeroCount() ) ) ), KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2) )"""

    # `tpot_obj` is not created here; it has to come from the enclosing scope.
    generated = generate_pipeline_code(tree, tpot_obj.operators)
    assert wanted == generated
def test_generate_pipeline_code():
    """Check the code generate_pipeline_code() emits for a pipeline with one CombineDFs node."""
    # The operator list handed to the generator comes from a fresh classifier.
    classifier = TPOTClassifier()

    boosting_branch = [
        'GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5,
    ]
    naive_bayes_branch = ['GaussianNB', ['ZeroCount', 'input_matrix']]
    tree = [
        'KNeighborsClassifier',
        ['CombineDFs', boosting_branch, naive_bayes_branch],
        18,
        'uniform',
        2,
    ]

    # NOTE(review): the expected text is kept on a single line exactly as it
    # appears in this view; confirm its whitespace against the generator's output.
    wanted = """make_pipeline( make_union( make_union(VotingClassifier([('branch', GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5) )]), FunctionTransformer(copy)), make_union(VotingClassifier([('branch', make_pipeline( ZeroCount(), GaussianNB() ) )]), FunctionTransformer(copy)) ), KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2) )"""

    generated = generate_pipeline_code(tree, classifier.operators)
    assert wanted == generated
def test_generate_pipeline_code():
    """Check the code generate_pipeline_code() emits for a pipeline with one CombineDFs node."""
    # The operator list handed to the generator comes from a fresh classifier.
    classifier = TPOTClassifier()

    # Left and right inputs of the CombineDFs node, named for readability.
    left_input = [
        'GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5,
    ]
    right_input = ['GaussianNB', ['ZeroCount', 'input_matrix']]
    combined = ['CombineDFs', left_input, right_input]
    tree = ['KNeighborsClassifier', combined, 18, 'uniform', 2]

    # NOTE(review): the expected text is kept on a single line exactly as it
    # appears in this view; confirm its whitespace against the generator's output.
    wanted = """make_pipeline( make_union( make_union(VotingClassifier([('branch', GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5) )]), FunctionTransformer(copy)), make_union(VotingClassifier([('branch', make_pipeline( ZeroCount(), GaussianNB() ) )]), FunctionTransformer(copy)) ), KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2) )"""

    generated = generate_pipeline_code(tree, classifier.operators)
    assert wanted == generated
def test_generate_pipeline_code_2():
    """Check the code generate_pipeline_code() emits for a pipeline with two nested CombineDFs nodes."""
    # Build the tree bottom-up: inner union first, then the outer one.
    inner_union = [
        'CombineDFs',
        ['MinMaxScaler', 'input_matrix'],
        ['ZeroCount', ['MaxAbsScaler', 'input_matrix']],
    ]
    outer_union = [
        'CombineDFs',
        ['GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5],
        inner_union,
    ]
    tree = ['KNeighborsClassifier', outer_union, 18, 'uniform', 2]

    # NOTE(review): the expected text is kept on a single line exactly as it
    # appears in this view; confirm its whitespace against the generator's output.
    wanted = """make_pipeline( make_union( StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)), make_union( MinMaxScaler(), make_pipeline( MaxAbsScaler(), ZeroCount() ) ) ), KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2) )"""

    # `tpot_obj` is not created here; it has to come from the enclosing scope.
    generated = generate_pipeline_code(tree, tpot_obj.operators)
    assert wanted == generated
def _save_periodic_pipeline(self, gen):
    """Export the pipelines on the current Pareto front and log their test score.

    For every pipeline paired from ``self._pareto_front.items`` and the reversed
    ``self._pareto_front.keys``, the method generates its code string and export
    text, refits it on ``self.features`` / ``self.target`` and computes the
    negated mean absolute error on ``self.features_test`` / ``self.target_test``.
    A pipeline whose code string is not yet in ``self._exported_pipeline_text``
    is written to a timestamped ``.py`` file in ``self.periodic_checkpoint_folder``
    and recorded in ``self.log[gen]``.

    Parameters
    ----------
    gen
        Value used in the checkpoint file name and as the key into ``self.log``
        (presumably the generation index -- confirm against the caller).

    Notes
    -----
    Any exception is caught and only reported through ``self._update_pbar``
    (message truncated to 250 characters); nothing is re-raised.
    """
    try:
        # Folder creation is disabled here; the folder is expected to exist already.
        #self._create_periodic_checkpoint_folder()
        for pipeline, pipeline_scores in zip(self._pareto_front.items, reversed(self._pareto_front.keys)):
            idx = self._pareto_front.items.index(pipeline)
            # Second weighted fitness value of this Pareto-front entry.
            pareto_front_pipeline_score = pipeline_scores.wvalues[1]
            sklearn_pipeline_str = generate_pipeline_code(expr_to_tree(pipeline, self._pset), self.operators)
            to_write = export_pipeline(pipeline, self.operators, self._pset, self._imputed, pareto_front_pipeline_score, self.random_state)
            # Fit the pipeline again and get the test score.
            sklearn_pipeline = self._toolbox.compile(expr=pipeline)
            sklearn_pipeline.fit(self.features, self.target)
            ypredict = sklearn_pipeline.predict(self.features_test)
            # Negated so that a larger value means a smaller error.
            mae = - mean_absolute_error(self.target_test, ypredict)
            # Do not export a pipeline that was already exported.
            if self._exported_pipeline_text.count(sklearn_pipeline_str):
                self._update_pbar(pbar_num=0, pbar_msg='Periodic pipeline was not saved, probably saved before...')
            else:
                filename = os.path.join(self.periodic_checkpoint_folder, 'pipeline_gen_{}_idx_{}_{}.py'.format(gen, idx, datetime.now().strftime('%Y.%m.%d_%H-%M-%S')))
                self._update_pbar(pbar_num=0, pbar_msg='Saving periodic pipeline from pareto front to {}'.format(filename))
                with open(filename, 'w') as output_file:
                    output_file.write(to_write)
                self._exported_pipeline_text.append(sklearn_pipeline_str)
                # Keep the current Pareto value and the pipeline in memory; nothing
                # is pickled to disk at this point.
                # NOTE(review): the nesting of this bookkeeping (inside the `else`
                # branch) is reconstructed from flattened text -- confirm it
                # against the original layout.
                # The entry for `gen` is reset for each pipeline written here.
                self.log[gen] = {}
                self.log[gen]['pipeline_name'] = sklearn_pipeline_str
                self.log[gen]['pipeline_score'] = pipeline_scores.wvalues[1]
                self.log[gen]['pipeline_test_mae'] = mae
                self.log[gen]['pipeline_sklearn_obj'] = self._compile_to_sklearn(pipeline)
                # The tree can be used later to derive the pipeline complexity.
                self.log[gen]['pipeline_tree'] = expr_to_tree(pipeline, self._pset)
    except Exception as e:
        # Best-effort checkpointing: report the failure instead of raising.
        self._update_pbar(pbar_num=0, pbar_msg='Failed saving periodic pipeline, exception:\n{}'.format(str(e)[:250]))
# Module paths whose operators are treated as preprocessing steps.
_PREPROCESSING_MODULES = (
    'sklearn.preprocessing',
    'sklearn.decomposition',
    'tpot.builtins',
    'sklearn.cluster',
    'sklearn.feature_selection',
)

# Column order of the long-format hyperparameter table written to disk.
_HYPER_COLUMNS = [
    'PIPELINE', 'ALGO_NAME', 'STACK_FLG', 'PP_FLAG', 'SCORE',
    'HYPER_NAME', 'HYPER_TYPE', 'HYPER_VALUE',
]


def _preprocessing_prefixes(operators_context):
    """Return the lower-cased names of operators that live in a preprocessing module."""
    prefixes = []
    for name, operator in operators_context.items():
        lowered = name.lower()
        # The stacking estimator is reported through STACK_FLG, not PP_FLAG.
        # Filtering (instead of list.remove) avoids a ValueError when it is absent.
        if lowered == 'stackingestimator':
            continue
        described = str(operator)
        if any(module in described for module in _PREPROCESSING_MODULES):
            prefixes.append(lowered)
    return tuple(prefixes)


def _describe_step(step_name, estimator, pp_prefixes):
    """Return (algo_name, stack_flag, pp_flag, params) for one pipeline step."""
    params = estimator.get_params()
    if 'stackingestimator' in step_name:
        # Report the wrapped estimator and its hyperparameters, not the wrapper.
        stack_flag = 'Y'
        wrapped = params['estimator']
        algo_name = str(wrapped).split('(')[0].lower() + '_stack'
        params = wrapped.get_params()
    else:
        stack_flag = 'N'
        algo_name = step_name

    # str.startswith accepts a tuple; an empty tuple simply never matches.
    if step_name.startswith(pp_prefixes):
        pp_flag = 'Y'
        if step_name in ('selectfrommodel', 'rfe'):
            # Model-based selectors: record the inner estimator's hyperparameters.
            algo_name = step_name
            params = estimator.get_params()['estimator'].get_params()
        else:
            algo_name = str(estimator).split('(')[0].lower()
            params = estimator.get_params()
    else:
        pp_flag = 'N'
    return algo_name, stack_flag, pp_flag, params


def _step_frame(pipeline_number, algo_name, stack_flag, pp_flag, score, params):
    """Build the long-format table (one row per hyperparameter) for one step."""
    names = list(params)
    values = [params[name] for name in names]
    rows = len(names)
    return pd.DataFrame({
        'PIPELINE': [pipeline_number] * rows,
        'ALGO_NAME': [algo_name] * rows,
        'STACK_FLG': [stack_flag] * rows,
        'PP_FLAG': [pp_flag] * rows,
        'SCORE': [score] * rows,
        'HYPER_NAME': names,
        # 'C' marks boolean/string values, 'N' everything else.
        'HYPER_TYPE': ['C' if isinstance(v, (bool, str)) else 'N' for v in values],
        'HYPER_VALUE': values,
    })


def createsklearnPipeline(pipeline_optimizer, pipes):
    """Dump the hyperparameters of every evaluated pipeline to ``pipeline.csv``.

    Each entry of ``pipeline_optimizer.evaluated_individuals_`` (visited in
    sorted key order) is compiled to a scikit-learn pipeline. For every step
    except a step named ``'featureunion'``, one row per hyperparameter is
    produced with the columns PIPELINE, ALGO_NAME, STACK_FLG, PP_FLAG, SCORE,
    HYPER_NAME, HYPER_TYPE and HYPER_VALUE.

    Parameters
    ----------
    pipeline_optimizer
        Object providing ``operators_context``, ``evaluated_individuals_``,
        ``_pset`` and ``_toolbox`` (presumably a fitted TPOT estimator).
    pipes
        Numbering offset: the first pipeline gets the number ``1 + pipes`` and
        each subsequent pipeline the next integer.

    Returns
    -------
    None
        The table is appended to ``pipeline.csv`` in the current working
        directory when that file exists (no header), otherwise the file is
        created with a header row.

    Notes
    -----
    SCORE is the absolute value of ``internal_cv_score``. The per-step tables
    are combined with a single ``pd.concat`` because ``DataFrame.append`` is
    no longer available in pandas 2.x.
    """
    pp_prefixes = _preprocessing_prefixes(pipeline_optimizer.operators_context)

    frames = []
    pipeline_number = 1 + pipes
    for pipeline_string, attrib in sorted(
            pipeline_optimizer.evaluated_individuals_.items()):
        # Convert the pipeline string to a scikit-learn pipeline object.
        deap_pipeline = creator.Individual.from_string(
            pipeline_string, pipeline_optimizer._pset)
        sklearn_pipeline = pipeline_optimizer._toolbox.compile(
            expr=deap_pipeline)

        # Scores may be negative (e.g. negated errors); store the magnitude.
        cv_score = abs(attrib.get('internal_cv_score'))

        for step_name, estimator in sklearn_pipeline.steps:
            if step_name == 'featureunion':  # ignore feature union for now
                continue
            algo_name, stack_flag, pp_flag, params = _describe_step(
                step_name, estimator, pp_prefixes)
            frames.append(_step_frame(
                int(pipeline_number), algo_name, stack_flag, pp_flag,
                cv_score, params))
        pipeline_number += 1  # one number per evaluated pipeline

    if frames:
        master = pd.concat(frames)
    else:
        # pd.concat rejects an empty list; keep the schema so a header is written.
        master = pd.DataFrame(columns=_HYPER_COLUMNS)

    # Append without a header when the file exists, otherwise create it.
    file_exists = path.exists('pipeline.csv')
    master.to_csv(
        'pipeline.csv',
        mode='a' if file_exists else 'w',
        header=not file_exists,
        index=False,
    )