def test_export_random_ind():
    """Check the export of a randomly generated individual (random_state=39).

    A ``TPOTClassifier`` created with ``random_state=39`` builds one
    individual from its toolbox; the text returned by ``export_pipeline``
    for that individual must equal the reference text below.
    """
    tpot_obj = TPOTClassifier(random_state=39)
    # Attach a disabled progress bar before the toolbox is used.
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline = tpot_obj._toolbox.individual()
    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_code = """import numpy as np import pandas as pd from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=42) exported_pipeline = make_pipeline( SelectPercentile(score_func=f_classif, percentile=65), DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18) ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """
    # Export once so that the printed diagnostic and the asserted value are
    # guaranteed to be the same text (previously the exporter ran twice).
    exported_code = export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
    print(exported_code)
    assert expected_code == exported_code
def test_pipeline_score_save():
    """Check that a score passed as ``pipeline_score`` appears in the exported text."""
    classifier = TPOTClassifier()
    classifier._fit_init()
    classifier._pbar = tqdm(total=1, disable=True)

    individual_spec = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    individual = creator.Individual.from_string(individual_spec, classifier._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np import pandas as pd from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=None) # Average CV score on the training set was: 0.929813743 exported_pipeline = make_pipeline( SelectPercentile(score_func=f_classif, percentile=20), DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(
        individual,
        classifier.operators,
        classifier._pset,
        pipeline_score=0.929813743,
    )
    assert_equal(expected_source, exported_source)
def test_export_pipeline_5():
    """Check the exported text for a fixed regressor pipeline that uses SelectFromModel."""
    individual_spec = (
        'DecisionTreeRegressor(SelectFromModel(input_matrix, '
        'SelectFromModel__ExtraTreesRegressor__max_features=0.05, SelectFromModel__ExtraTreesRegressor__n_estimators=100, '
        'SelectFromModel__threshold=0.05), DecisionTreeRegressor__max_depth=8,'
        'DecisionTreeRegressor__min_samples_leaf=5, DecisionTreeRegressor__min_samples_split=5)'
    )
    # The individual is parsed against the module-level regressor's primitive set.
    individual = creator.Individual.from_string(individual_spec, tpot_obj_reg._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np import pandas as pd from sklearn.ensemble import ExtraTreesRegressor from sklearn.feature_selection import SelectFromModel from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeRegressor # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = make_pipeline( SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.05, n_estimators=100), threshold=0.05), DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, min_samples_split=5) ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(individual, tpot_obj_reg.operators, tpot_obj_reg._pset)
    assert expected_source == exported_source
def test_export_pipeline_3():
    """Check the exported text for a fixed pipeline made of a preprocessor and a classifier."""
    individual_spec = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    # The individual is parsed against the module-level estimator's primitive set.
    individual = creator.Individual.from_string(individual_spec, tpot_obj._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np import pandas as pd from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = make_pipeline( SelectPercentile(score_func=f_classif, percentile=20), DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(individual, tpot_obj.operators, tpot_obj._pset)
    assert expected_source == exported_source
def test_export_pipeline_2():
    """Check the exported text for a fixed pipeline consisting of a single classifier."""
    individual_spec = (
        'KNeighborsClassifier('
        'input_matrix, '
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    # The individual is parsed against the module-level estimator's primitive set.
    individual = creator.Individual.from_string(individual_spec, tpot_obj._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(individual, tpot_obj.operators, tpot_obj._pset)
    assert expected_source == exported_source
def test_export_pipeline_6():
    """Check that ``random_state`` and ``data_file_path`` are reflected in the exported text."""
    individual_spec = (
        'KNeighborsClassifier('
        'input_matrix, '
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    # The individual is parsed against the module-level estimator's primitive set.
    individual = creator.Individual.from_string(individual_spec, tpot_obj._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('test_path', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=42) exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(
        individual,
        tpot_obj.operators,
        tpot_obj._pset,
        random_state=42,
        data_file_path='test_path',
    )
    assert expected_source == exported_source
def test_export_pipeline_3():
    """Check the exported text for a fixed SelectKBest + DecisionTreeClassifier pipeline."""
    classifier = TPOTClassifier()
    individual_spec = (
        'DecisionTreeClassifier(SelectKBest(input_matrix, SelectKBest__k=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    individual = creator.Individual.from_string(individual_spec, classifier._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np from sklearn.feature_selection import SelectKBest, f_classif from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_classes, testing_classes = \\ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = make_pipeline( SelectKBest(score_func=f_classif, k=20), DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) ) exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(individual, classifier.operators, classifier._pset)
    assert expected_source == exported_source
def test_export_pipeline_2():
    """Check the exported text for a fixed pipeline holding only a KNeighborsClassifier."""
    classifier = TPOTClassifier()
    individual_spec = (
        'KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)'
    )
    individual = creator.Individual.from_string(individual_spec, classifier._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_classes, testing_classes = \\ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(individual, classifier.operators, classifier._pset)
    assert expected_source == exported_source
def test_random_ind_2():
    """Check the export of a randomly generated individual (random_state=45)."""
    classifier = TPOTClassifier(random_state=45)
    # Attach a disabled progress bar before the toolbox is used.
    classifier._pbar = tqdm(total=1, disable=True)
    individual = classifier._toolbox.individual()

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from tpot.built_in_operators import ZeroCount # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_classes, testing_classes = \\ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = make_pipeline( ZeroCount(), LogisticRegression(C=0.0001, dual=False, penalty="l2") ) exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(individual, classifier.operators, classifier._pset)
    assert expected_source == exported_source
def test_export_random_ind():
    """Check the export of a random individual built with random_state=39 and "TPOT light".

    The estimator's own ``random_state`` is forwarded to ``export_pipeline``.
    """
    classifier = TPOTClassifier(random_state=39, config_dict="TPOT light")
    classifier._fit_init()
    classifier._pbar = tqdm(total=1, disable=True)
    individual = classifier._toolbox.individual()

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.naive_bayes import BernoulliNB # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=39) exported_pipeline = BernoulliNB(alpha=1.0, fit_prior=False) # Fix random state in exported estimator if hasattr(exported_pipeline, 'random_state'): setattr(exported_pipeline, 'random_state', 39) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(
        individual,
        classifier.operators,
        classifier._pset,
        random_state=classifier.random_state,
    )
    assert expected_source == exported_source
def test_imputer_in_export():
    """Check that the exported text contains an imputation step after fitting on data with a NaN."""
    classifier = TPOTClassifier(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
        config_dict='TPOT light',
    )
    # Work on a copy so the shared training data is left untouched.
    features_with_nan = np.copy(training_features)
    features_with_nan[0][0] = float('nan')
    classifier.fit(features_with_nan, training_target)

    # A fixed pipeline replaces the fitted one because random.seed() performs
    # differently in python 2.* and 3.*
    individual_spec = (
        'KNeighborsClassifier('
        'input_matrix, '
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    classifier._optimized_pipeline = creator.Individual.from_string(individual_spec, classifier._pset)
    exported_source = export_pipeline(
        classifier._optimized_pipeline,
        classifier.operators,
        classifier._pset,
        classifier._imputed,
    )

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier try: from sklearn.impute import SimpleImputer as Imputer except ImportError: from sklearn.preprocessing import Imputer # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=None) imputer = Imputer(strategy="median") imputer.fit(training_features) training_features = imputer.transform(training_features) testing_features = imputer.transform(testing_features) exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    assert_equal(exported_source, expected_source)
def test_export_pipeline():
    """Check the exported text for a fixed pipeline that combines two branches.

    The individual feeds a ``CombineDFs`` of a ``DecisionTreeClassifier`` and
    a ``SelectPercentile`` into a ``KNeighborsClassifier``; the text returned
    by ``export_pipeline`` must equal the reference text below.
    """
    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectPercentile(input_matrix, SelectPercentile__percentile=20))'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )
    # The individual is parsed against the module-level estimator's primitive set.
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_code = """import numpy as np import pandas as pd from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline, make_union from sklearn.tree import DecisionTreeClassifier from tpot.builtins import StackingEstimator # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=None) exported_pipeline = make_pipeline( make_union( StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)), SelectPercentile(score_func=f_classif, percentile=20) ), KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """
    # Export once so that the printed diagnostic and the asserted value are
    # guaranteed to be the same text (previously the exporter ran twice).
    exported_code = export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
    print(exported_code)
    assert expected_code == exported_code
def test_export_pipeline():
    """Check the exported text for a fixed pipeline combining a classifier branch with SelectKBest."""
    classifier = TPOTClassifier()
    individual_spec = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )
    individual = creator.Individual.from_string(individual_spec, classifier._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np from copy import copy from sklearn.ensemble import VotingClassifier from sklearn.feature_selection import SelectKBest, f_classif from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import FunctionTransformer from sklearn.tree import DecisionTreeClassifier # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_classes, testing_classes = \\ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = make_pipeline( make_union( make_union(VotingClassifier([('branch', DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) )]), FunctionTransformer(copy)), SelectKBest(score_func=f_classif, k=20) ), KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") ) exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(individual, classifier.operators, classifier._pset)
    assert expected_source == exported_source
def _save_periodic_pipeline(self, gen):
    """Export the current Pareto-front pipelines to files and record them in ``self.log``.

    For every pipeline paired with a Pareto-front key, the pipeline is
    exported to source text, compiled, refit on ``self.features`` /
    ``self.target`` and scored on ``self.features_test`` /
    ``self.target_test``. A pipeline whose generated code string is not yet
    in ``self._exported_pipeline_text`` is written to a timestamped ``.py``
    file inside ``self.periodic_checkpoint_folder``.

    NOTE(review): the layout below was reconstructed from a
    whitespace-collapsed source. In particular, placing the ``self.log``
    updates inside the ``else`` branch is an assumption -- confirm against
    the original file.

    Parameters
    ----------
    gen:
        Value used in the output file name and as the key into ``self.log``.

    Returns
    -------
    None
        Any exception is caught and reported through ``self._update_pbar``
        (message truncated to 250 characters) instead of being raised.
    """
    try:
        # The checkpoint-folder creation call is currently disabled:
        #self._create_periodic_checkpoint_folder()
        # Items are paired with the keys taken in reverse order.
        for pipeline, pipeline_scores in zip(self._pareto_front.items, reversed(self._pareto_front.keys)):
            # Position of the first equal item in the front; used in the file name.
            idx = self._pareto_front.items.index(pipeline)
            pareto_front_pipeline_score = pipeline_scores.wvalues[1]
            # Generated code string; doubles as the "already exported" key below.
            sklearn_pipeline_str = generate_pipeline_code(expr_to_tree(pipeline, self._pset), self.operators)
            to_write = export_pipeline(pipeline, self.operators, self._pset, self._imputed, pareto_front_pipeline_score, self.random_state)
            # fit the pipeline again and get the test score
            sklearn_pipeline = self._toolbox.compile(expr=pipeline)
            sklearn_pipeline.fit(self.features, self.target)
            ypredict = sklearn_pipeline.predict(self.features_test)
            # Negated mean absolute error on the test split.
            mae = - mean_absolute_error(self.target_test, ypredict)
            # Do not export a pipeline that was exported before.
            if self._exported_pipeline_text.count(sklearn_pipeline_str):
                self._update_pbar(pbar_num=0, pbar_msg='Periodic pipeline was not saved, probably saved before...')
            else:
                filename = os.path.join(self.periodic_checkpoint_folder, 'pipeline_gen_{}_idx_{}_{}.py'.format(gen, idx, datetime.now().strftime('%Y.%m.%d_%H-%M-%S')))
                self._update_pbar(pbar_num=0, pbar_msg='Saving periodic pipeline from pareto front to {}'.format(filename))
                with open(filename, 'w') as output_file:
                    output_file.write(to_write)
                self._exported_pipeline_text.append(sklearn_pipeline_str)
                # Record the current Pareto value and the pipeline in memory; the
                # log itself is not yet written to disk here.
                # The entry for this generation is replaced, not extended.
                self.log[gen] = {}
                self.log[gen]['pipeline_name'] = sklearn_pipeline_str
                self.log[gen]['pipeline_score'] = pipeline_scores.wvalues[1]
                self.log[gen]['pipeline_test_mae'] = mae
                self.log[gen]['pipeline_sklearn_obj'] = self._compile_to_sklearn(pipeline)
                # The expression tree is stored as well; presumably it can be used
                # to derive the pipeline complexity.
                self.log[gen]['pipeline_tree'] = expr_to_tree(pipeline, self._pset)
    except Exception as e:
        # Best-effort: a failed checkpoint is reported, never propagated.
        self._update_pbar(pbar_num=0, pbar_msg='Failed saving periodic pipeline, exception:\n{}'.format(str(e)[:250]))
def test_export_pipeline_4():
    """Check the exported text for a fixed pipeline whose CombineDFs takes input_matrix directly."""
    classifier = TPOTClassifier()
    individual_spec = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),input_matrix)'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )
    individual = creator.Individual.from_string(individual_spec, classifier._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline, make_union from sklearn.tree import DecisionTreeClassifier from tpot.builtins import StackingEstimator from sklearn.preprocessing import FunctionTransformer from copy import copy # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=42) exported_pipeline = make_pipeline( make_union( StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)), FunctionTransformer(copy) ), KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform") ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(individual, classifier.operators, classifier._pset)
    assert expected_source == exported_source
def test_export_pipeline_6():
    """Check the exported text of a two-step pipeline when ``random_state`` and ``data_file_path`` are given."""
    individual_spec = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    # The individual is parsed against the module-level estimator's primitive set.
    individual = creator.Individual.from_string(individual_spec, tpot_obj._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np import pandas as pd from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.tree import DecisionTreeClassifier from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('test_path', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) exported_pipeline = make_pipeline( SelectPercentile(score_func=f_classif, percentile=20), DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5) ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """

    exported_source = export_pipeline(
        individual,
        tpot_obj.operators,
        tpot_obj._pset,
        random_state=42,
        data_file_path='test_path',
    )
    assert expected_source == exported_source
def test_export_pipeline():
    """Check the exported text for a fixed GradientBoostingClassifier/RFE pipeline built via ``TPOT()``."""
    tpot = TPOT()
    individual_spec = "KNeighborsClassifier(CombineDFs(GradientBoostingClassifier(input_matrix, 38.0, 0.87, 0.5), RFE(input_matrix, 0.17999999999999999)), 18, 33)"
    individual = creator.Individual.from_string(individual_spec, tpot._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np from sklearn.cross_validation import train_test_split from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier from sklearn.feature_selection import RFE from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import FunctionTransformer from sklearn.svm import SVC # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_classes, testing_classes = \\ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = make_pipeline( make_union( make_union(VotingClassifier(estimators=[('branch', GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, min_weight_fraction_leaf=0.5, n_estimators=500) )]), FunctionTransformer(lambda X: X)), RFE(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001, verbose=False), step=0.18) ), KNeighborsClassifier(n_neighbors=5, weights="distance") ) exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) """

    # This variant calls the exporter with the individual only.
    exported_source = export_pipeline(individual)
    assert expected_source == exported_source
def test_export_pipeline():
    """Check the exported text for a fixed GradientBoostingClassifier/SelectKBest pipeline."""
    classifier = TPOTClassifier()
    individual_spec = "KNeighborsClassifier(CombineDFs(GradientBoostingClassifier(input_matrix, 38.0, 0.87), SelectKBest(input_matrix, 5)), 18, 33)"
    individual = creator.Individual.from_string(individual_spec, classifier._pset)

    # NOTE(review): the reference text is kept exactly as found in this file (on a
    # single line); confirm its line breaks against real exporter output.
    expected_source = """import numpy as np from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier from sklearn.feature_selection import SelectKBest, f_classif from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import FunctionTransformer # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_classes, testing_classes = \\ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = make_pipeline( make_union( make_union(VotingClassifier([('branch', GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, n_estimators=500) )]), FunctionTransformer(lambda X: X)), SelectKBest(k=5, score_func=f_classif) ), KNeighborsClassifier(n_neighbors=5, weights="distance") ) exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) """

    # This variant calls the exporter with the individual only.
    exported_source = export_pipeline(individual)
    assert expected_source == exported_source