def tpot_fit(events, X_all, y_all, num_threads):
    inner_cv = PurgedKFold(
        n_splits=5,
        t1=events["t1"],
        pct_embargo=0,
        random_state=42,
    )
    clf = tpot.TPOTClassifier(
        generations=500,
        population_size=60,
        offspring_size=None,
        # mutation_rate=0.9,
        # crossover_rate=0.1,
        scoring='neg_log_loss',
        cv=inner_cv,
        # subsample=1.0,
        n_jobs=num_threads,
        max_time_mins=500,
        max_eval_time_mins=5,
        random_state=42,
        periodic_checkpoint_folder='checks',
        # use_dask=True,
        config_dict='TPOT light',
        verbosity=2,
    )
    clf.fit(X_all, y_all)
    clf.export('tpot_pipeline.py')
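# PurgedKFold is not defined in this snippet. It matches the purged k-fold
# cross-validator from de Prado's "Advances in Financial Machine Learning":
# training samples whose label lifetimes (events["t1"]) overlap the test
# window are purged, and an embargo fraction is skipped after the test fold.
# A minimal sketch along those lines; the random_state argument above
# suggests a custom variant, so this sketch simply accepts and ignores it.
import numpy as np
from sklearn.model_selection import KFold

class PurgedKFold(KFold):
    def __init__(self, n_splits=3, t1=None, pct_embargo=0.0, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=False, random_state=None)
        self.t1 = t1  # pd.Series: observation start time -> label end time
        self.pct_embargo = pct_embargo

    def split(self, X, y=None, groups=None):
        indices = np.arange(X.shape[0])
        embargo = int(X.shape[0] * self.pct_embargo)
        test_spans = [(i[0], i[-1] + 1)
                      for i in np.array_split(indices, self.n_splits)]
        for start, end in test_spans:
            test_indices = indices[start:end]
            t0 = self.t1.index[start]  # start of the test window
            max_t1_pos = self.t1.index.searchsorted(
                self.t1.iloc[test_indices].max())
            # Keep samples whose labels are resolved before the test window
            # starts, plus samples beginning after it ends (past the embargo).
            train_indices = self.t1.index.searchsorted(
                self.t1[self.t1 <= t0].index)
            if max_t1_pos < X.shape[0]:
                train_indices = np.concatenate(
                    (train_indices, indices[max_t1_pos + embargo:]))
            yield train_indices, test_indices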
def train(
    self,
    train_file: Union[str, Path],
    validation_file: Optional[Union[str, Path]] = None,
    workdir: Optional[Union[str, Path]] = None,
) -> Dict[str, float]:
    X_train, y_train = self.load_data(train_file)
    assert y_train is not None
    with tempfile.TemporaryDirectory() as tempdir:
        workdir = Path(workdir or tempdir)
        log_file_name = workdir / "tpot.log"
        pipeline_file_name = workdir / "fitted_pipeline.pkl"
        pipeline_code_file_name = workdir / "pipeline.py"
        with open(log_file_name, "w") as log_file:
            teeing_log_file = TeeingIO(log_file, sys.stdout)
            if self._task == "classification":
                model = tpot.TPOTClassifier(
                    log_file=teeing_log_file,
                    **self._kwargs,
                )
            else:
                model = tpot.TPOTRegressor(log_file=teeing_log_file,
                                           **self._kwargs)
            model.fit(X_train, y_train)
        with open(log_file_name) as log_file:
            tpot_log = log_file.read()
        model.export(str(pipeline_code_file_name))
        self._estimator = model.fitted_pipeline_
        with open(pipeline_file_name, "wb") as pipeline_file:
            pickle.dump(self._estimator, pipeline_file)
        metrics = self._get_metrics_from_log(tpot_log)
        if validation_file is not None:
            X_val, y_val = self.load_data(validation_file)
            assert y_val is not None
            metrics["validation_score"] = model.score(X_val, y_val)
        return metrics
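# TeeingIO is not defined in this snippet. TPOT's log_file parameter only
# needs a writable file-like object, so a minimal sketch of such a helper
# (the name comes from the call above; the implementation is an assumption)
# just fans writes out to several streams:
class TeeingIO:
    def __init__(self, *streams):
        self.streams = streams

    def write(self, data):
        for stream in self.streams:
            stream.write(data)

    def flush(self):
        for stream in self.streams:
            stream.flush()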
def demo():
    """Optimize a TPOT pipeline on the MAGIC Gamma Telescope data set."""
    telescope = pandas.read_csv('data/MAGIC Gamma Telescope Data.csv')
    telescope_shuffle = telescope.iloc[numpy.random.permutation(
        len(telescope))]
    tele = telescope_shuffle.reset_index(drop=True)
    tele['Class'] = tele['Class'].map({'g': 0, 'h': 1})
    tele_class = tele['Class'].values
    training_indices, testing_indices = sklearn.model_selection.train_test_split(
        tele.index, stratify=tele_class, train_size=0.75, test_size=0.25)
    validation_indices = testing_indices
    # A local variable named `tpot` would shadow the imported module and
    # raise UnboundLocalError on the next line, so name the estimator clf.
    clf = tpot.TPOTClassifier(generations=5, verbosity=2)
    clf.fit(
        tele.drop('Class', axis=1).loc[training_indices].values,
        tele.loc[training_indices, 'Class'].values)
    clf.score(
        tele.drop('Class', axis=1).loc[validation_indices].values,
        tele.loc[validation_indices, 'Class'].values)
    clf.export('pipeline.py')
def get_robust_tpot(
    max_time_mins=5,
    scoring="f1_macro",
    cv=5,
    random_state=42,
    n_jobs=1,
    verbose=False,
):
    clf = RobustSearch(
        search_model=tpot.TPOTClassifier(
            scoring=scoring,
            cv=cv,
            n_jobs=n_jobs,
            max_time_mins=max_time_mins,
            random_state=random_state,
            verbosity=2,
            disable_update_check=True,
        ),
        verbose=verbose,
    )
    return clf
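# RobustSearch is not defined in either snippet that uses it. One plausible
# reading, sketched here as an assumption rather than a known API: a thin
# wrapper that tolerates a search dying mid-run (TPOT raises if its time
# budget expires before any pipeline is evaluated) and otherwise delegates
# to the wrapped model.
class RobustSearch:
    def __init__(self, search_model, verbose=False):
        self.search_model = search_model
        self.verbose = verbose

    def fit(self, X, y):
        try:
            self.search_model.fit(X, y)
        except (RuntimeError, ValueError) as exc:
            # e.g. max_time_mins elapsed with no pipeline fully evaluated
            if self.verbose:
                print(f"search terminated early: {exc}")
        return self

    def __getattr__(self, name):
        # Delegate predict/score/export/... to the wrapped search model.
        return getattr(self.search_model, name)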
def __init__(self, nrows=100, ncols=10, seed=42, max_time_mins=1):
    self.pipelines = []
    print("Generating pipelines for tests")
    np.random.seed(seed)
    X, y = sklearn.datasets.make_classification(nrows, ncols)
    self.X = X
    self.y = y
    print("TPOT-generated pipelines")
    clf = tpot.TPOTClassifier(
        max_time_mins=max_time_mins,
        verbosity=3,
        random_state=seed,
    )
    clf.fit(self.X, self.y)
    # Note: stock TPOT's evaluated_individuals_ entries record scores and
    # lineage only; the "pipeline_obj" key implies a modified TPOT that
    # also stores the compiled sklearn object for each individual.
    ga_pipelines = [
        e["pipeline_obj"] for e in clf.evaluated_individuals_.values()
    ]
    assert len(ga_pipelines) > 0
    self.pipelines.extend(ga_pipelines)
    print("Random-search-generated pipelines")
    clf2 = RandomSearch(
        tpot.config.classifier_config_dict,
        4,
        max_time_mins=max_time_mins,
        max_time_mins_per_pipeline=1,
        max_retries=100,
        cv=3,
        scoring="f1_macro",
        random_state=seed,
    )
    clf2.fit(self.X, self.y)
    rand_pipelines = [
        e["pipeline_obj"] for e in clf2.evaluated_individuals_.values()
    ]
    assert len(rand_pipelines) > 0
    self.pipelines.extend(rand_pipelines)
    np.random.shuffle(self.pipelines)
def get_robust_tpot(
    config_dict=None,
    max_time_mins=5,
    scoring="f1_macro",
    cv=5,
    random_state=42,
    n_jobs=-1,
    check_point_folder=None,
    verbosity=2,
):
    clf = RobustSearch(search_model=tpot.TPOTClassifier(
        config_dict=config_dict,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
        max_time_mins=max_time_mins,
        # Cap the time spent on any single pipeline; otherwise one slow
        # pipeline can blow the overall budget and leave nothing fitted.
        max_eval_time_mins=MAX_TIME_MINS_PER_PIPELINE,
        random_state=random_state,
        verbosity=verbosity,
        disable_update_check=True,
        # Wire through the otherwise-unused check_point_folder argument.
        periodic_checkpoint_folder=check_point_folder,
    ))
    return clf
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100))
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
                                                   titanic_train1, y_train)
X_train = utils.select_features(rf_final_estimator, titanic_train1,
                                threshold='mean')
tpot_estimator = tpot.TPOTClassifier(
    generations=10,
    population_size=40,
    verbosity=2,
    early_stop=3,
    random_state=100,
    cv=5,
    scoring='accuracy',
    periodic_checkpoint_folder='E:/checkpoint')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator._optimized_pipeline)
print(tpot_estimator.evaluated_individuals_)
titanic_test1 = titanic[titanic_train.shape[0]:]
X_test = utils.select_features(rf_final_estimator, titanic_test1,
                               threshold='mean')
data_dir = 'C:/Users/akhram/Desktop/Algo/Problems/Classification/DontOverFitII'
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
train.info()
print(train.columns)
sns.countplot(x='target', data=train)

# Filter out the id and target columns, keeping only the features.
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    train1, y, test_size=0.1, random_state=1)
tpot_estimator = tpot.TPOTClassifier(generations=10,
                                     population_size=40,
                                     verbosity=2,
                                     early_stop=2,
                                     random_state=100,
                                     cv=5,
                                     scoring='roc_auc',
                                     config_dict=None,
                                     warm_start=True,
                                     periodic_checkpoint_folder='E:/checkpoint')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.evaluated_individuals_)
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator.score(X_eval, y_eval))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
test.info()
print(test.columns)
test1 = test.iloc[:, 1:]
config = eval(config_var_name)
if template_str:
    print(">> TEMPLATE STRING: ", template_str)
X_train, X_test, y_train, y_test = train_test_split(
    X, y.astype(np.float64), train_size=0.8, test_size=0.2)
if conf_type == 'template':
    # Use the template (no stacking).
    clf_t = tpot.TPOTClassifier(
        generations=100,
        population_size=100,
        verbosity=2,
        # We can be permissive when template_str is set; config_nn is
        # defined elsewhere.
        config_dict=config_nn,
        template=template_str)
else:
    # Use the config dict (stacking allowed).
    clf_t = tpot.TPOTClassifier(
        generations=100,
        population_size=100,
        verbosity=2,
        config_dict=config,
    )
start_t = time.time()
print(">> BEGIN TRAINING AT: {0:.2f}".format(start_t))
clf_t.fit(X_train, y_train)
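# For context on the branch above: TPOT's template parameter pins every
# candidate pipeline to a fixed linear sequence of step types joined by
# '-', which is why the template path precludes stacking. A minimal
# standalone example:
clf = tpot.TPOTClassifier(
    generations=5,
    population_size=20,
    # Each pipeline must be: feature selector -> transformer -> classifier.
    template='Selector-Transformer-Classifier',
    verbosity=2,
)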
def __init__(self, **kwargs):
    self.clf = tpot.TPOTClassifier(**kwargs)
X_sel_1, X_hof_1, y_sel_1, y_hof_1 = \
    sklearn.model_selection.train_test_split(X_1, y_1, test_size=0.2)
X_0 = X[ind_0]
y_0 = y[ind_0]
X_sel_0, X_hof_0, y_sel_0, y_hof_0 = \
    sklearn.model_selection.train_test_split(X_0, y_0,
                                             test_size=y_hof_1.shape[0])
X_sel_0_tr, X_sel_0_drop, y_sel_0_tr, y_sel_0_drop = \
    sklearn.model_selection.train_test_split(X_sel_0, y_sel_0,
                                             train_size=y_sel_1.shape[0])
# Assemble class-balanced selection and holdout sets.
X_sel = numpy.concatenate((X_sel_0_tr, X_sel_1), axis=0)
y_sel = numpy.concatenate((y_sel_0_tr, y_sel_1), axis=0)
X_hof = numpy.concatenate((X_hof_0, X_hof_1), axis=0)
y_hof = numpy.concatenate((y_hof_0, y_hof_1), axis=0)
print(f"{cities.iloc[i, 0]}")
print(f"{y_sel_1.shape[0]}/{y_sel.shape[0]}")
print(f"{y_hof_1.shape[0]}/{y_hof.shape[0]}")
log.write("{},{}/{},{}/{}\n".format(cities.iloc[i, 0],
                                    y_sel_1.shape[0], y_sel.shape[0],
                                    y_hof_1.shape[0], y_hof.shape[0]))
clf = tpot.TPOTClassifier(n_jobs=job)
clf.fit(X_sel, y_sel)
max_score = clf.score(X_sel, y_sel)
test_score = clf.score(X_hof, y_hof)
print(f"{cities.iloc[i, 0]} {max_score} {test_score}")
with open(f"tex/{cities.iloc[i, 0]}_tpot.tex", "w") as fout:
    fout.write(
        latex_template.format(cities.iloc[i, 0], max_score, test_score))
connection.close()
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import tpot

X, y = datasets.load_iris(return_X_y=True)
# model = tpot.TPOTClassifier(generations=50, population_size=50,
#                             verbosity=3, config_dict='TPOT light')
model = tpot.TPOTClassifier(generations=50, population_size=50, verbosity=3)
model.fit(X, y)
model.export('iris-tpot-result.py')

# Compile the winning individual into a fresh (unfitted) sklearn pipeline
# via TPOT internals, then cross-validate it from scratch.
pipe = model._toolbox.compile(expr=model._optimized_pipeline)
cv_pred = cross_val_predict(pipe, X, y, cv=5)
print("Score: %.4f" % f1_score(y, cv_pred, average='micro'))
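# Note on the last three lines above: _toolbox and _optimized_pipeline are
# private TPOT internals. The public fitted_pipeline_ attribute exposes the
# same winning pipeline; cloning it yields an unfitted copy that
# cross_val_predict can refit from scratch.
from sklearn.base import clone

pipe = clone(model.fitted_pipeline_)
cv_pred = cross_val_predict(pipe, X, y, cv=5)
print("Score: %.4f" % f1_score(y, cv_pred, average='micro'))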
def build_dicts(tpot_dict, loss_dict, train, targets, reload=False):
    # Search space in the style of TPOT's MDR configuration (defined here
    # but not actually passed to TPOTClassifier below).
    tpot_mdr_classifier_config_dict = {
        # Classifiers
        'mdr.MDRClassifier': {
            'tie_break': [0, 1],
            'default_label': [0, 1]
        },
        # Feature Selectors
        'skrebate.ReliefF': {
            'n_features_to_select': range(1, 6),
            'n_neighbors': [2, 10, 50, 100, 250, 500]
        },
        'skrebate.SURF': {
            'n_features_to_select': range(1, 6)
        },
        'skrebate.SURFstar': {
            'n_features_to_select': range(1, 6)
        },
        'skrebate.MultiSURF': {
            'n_features_to_select': range(1, 6)
        }
    }
    train = train.copy()
    targets = targets.copy()
    for col in train.keys():
        if not is_numeric_dtype(train[col]):
            train[col] = LabelEncoder().fit_transform(train[col])
    for col in targets.keys():
        if col == 'sig_id':
            continue
        if col in tpot_dict:
            print(f'\tAlready fitted {col} with loss {loss_dict[col]}')
            continue
        print(f'Fitting {col}...')
        t = time()
        inp = train.copy().drop(columns=['sig_id'])
        lbls = targets[col].copy()
        if lbls.sum() > 1:
            inp, lbls = repeat_sample(inp, lbls, 2)
        X_train, X_test, y_train, y_test = train_test_split(inp, lbls,
                                                            stratify=lbls)
        clf = tpot.TPOTClassifier(generations=5, population_size=50,
                                  verbosity=3, warm_start=True)
        clf.fit(X_train, y_train)
        loss = clf.score(X_test, y_test)
        tpot_dict[col] = clf
        loss_dict[col] = loss
        clf.export(f'input/tpot/{col}')
        with open('input/tpot/loss_dict', 'wb+') as hand:
            pickle.dump(loss_dict, hand)
        print('{}\t\t{}\t\t{:.5f}\n'.format(
            str(datetime.timedelta(seconds=time() - t))[:7], col, loss))
    total_loss = 0
    for v in loss_dict.values():
        total_loss += v
    print(f'Average loss: {total_loss / (len(targets.keys()) - 1)}')
    return tpot_dict, loss_dict
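# repeat_sample is not defined in this snippet. Judging from the call site
# (invoked with a factor of 2 when a target column has positives), it
# presumably oversamples the positive class so the stratified split has
# enough positives per fold. A hypothetical sketch; both the semantics and
# the implementation are assumptions:
import pandas as pd

def repeat_sample(inp, lbls, factor):
    positives = lbls == 1
    inp_over = pd.concat([inp] + [inp[positives]] * (factor - 1),
                         ignore_index=True)
    lbls_over = pd.concat([lbls] + [lbls[positives]] * (factor - 1),
                          ignore_index=True)
    return inp_over, lbls_over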
data = pd.read_csv('model2.csv', index_col=0)
print(data.columns)
# drop() returns a new frame, so assign the result back.
data = data.drop('UniqueCarrier', axis=1)

# Clean the data (preprocessing): shuffle the rows and reset the index.
data_shuffle = data.iloc[np.random.permutation(len(data))]
data = data_shuffle.reset_index(drop=True)

# Obtain the class values.
data_class = data['Class'].values

# Split the data: 75% for training, 25% for testing.
training_indices, validation_indices = train_test_split(
    data.index, stratify=data_class, train_size=0.75, test_size=0.25)

# Genetic programming begins: the optimization runs for 5 generations.
tpot = tp.TPOTClassifier(generations=5, verbosity=2)
tpot.fit(
    data.drop('Class', axis=1).loc[training_indices].values,
    data.loc[training_indices, 'Class'].values)

# Score accuracy on the validation indices.
tpot.score(
    data.drop('Class', axis=1).loc[validation_indices].values,
    data.loc[validation_indices, 'Class'].values)

# Export the best pipeline as pipeline.py.
tpot.export('pipeline.py')
# scores = cross_val_score(clf, train_data, train_label, cv=5, verbose=5)
# print(scores.mean())  # 0.8927205970809264

# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
#                                  max_depth=1, random_state=0)
# scores = cross_val_score(clf, train_data, train_label, cv=5, verbose=5)
# print(scores.mean())

# clf = autosklearn.classification.AutoSklearnClassifier()
# scores = cross_val_score(clf, train_data, train_label, cv=5, verbose=5)
# print(scores.mean())

# clf = autokeras.classifier()

clf = tpot.TPOTClassifier(verbosity=3,
                          periodic_checkpoint_folder="tpot",
                          warm_start=True)
clf.fit(train_data, train_label)
test_data, test_ID = handle_data(test_data, test_data_handled_path,
                                 need_shuffle=False)
# print(test_data)
# print(test_ID)
predict = clf.predict(test_data)
# print(predict)
result = pd.DataFrame({'ID': test_ID, 'pred': predict})
result.to_csv(result_data_path, index=False)
clf.export('tpot_pipeline.py')
# Reconstructed opening of the XGBoost-only TPOT search space: the nested
# shape and key are inferred from the closing braces and the parallel
# snippet below. The variable was originally named `dict`, shadowing the
# builtin, so it is renamed here.
xgb_config = {
    'xgboost.XGBClassifier': {
        'learning_rate': np.arange(0.0001, 0.1, 0.0001),
        'n_estimators': np.arange(1, 200, 1),
        'nthread': [6],
        'gamma': np.arange(0.00001, 0.1, 0.00001),
        # Note: subsample and the colsample_* parameters must lie in (0, 1];
        # values above 1.0 produced by these ranges will fail evaluation.
        'subsample': np.arange(0.1, 2, 0.1),
        'reg_lambda': np.arange(0.1, 200, 1),
        'reg_alpha': np.arange(1, 200, 1),
        'min_child_weight': np.arange(1, 200, 1),
        'colsample_bytree': np.arange(0.1, 2, 0.1),
        'colsample_bylevel': np.arange(0.1, 2, 0.1)
    }
}
tpot_estimator = tpot.TPOTClassifier(generations=5,
                                     population_size=100,
                                     offspring_size=250,
                                     verbosity=2,
                                     early_stop=3,
                                     random_state=1,
                                     config_dict=xgb_config,
                                     cv=5,
                                     scoring='accuracy')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.evaluated_individuals_)
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.score(X_eval, y_eval))
titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
titanic_test.info()
titanic_test['Age_imputed'] = age_imputer.transform(titanic_test[['Age']])
titanic_test['Fare'] = fare_imputer.transform(titanic_test[['Fare']])
# Reconstructed opening: the dict below is passed to TPOT as `params`.
params = {
    'n_estimators': np.arange(1, 200, 1),
    'nthread': [6],
    'gamma': np.arange(0.00001, 0.1, 0.00001),
    # As above, subsample/colsample_* values over 1.0 will fail evaluation.
    'subsample': np.arange(0.1, 2, 0.1),
    'reg_lambda': np.arange(0.1, 200, 1),
    'reg_alpha': np.arange(1, 200, 1),
    'min_child_weight': np.arange(1, 200, 1),
    'colsample_bytree': np.arange(0.1, 2, 0.1),
    'colsample_bylevel': np.arange(0.1, 2, 0.1)
}
tpot_estimator = tpot.TPOTClassifier(
    generations=5,
    population_size=100,
    offspring_size=250,
    verbosity=2,
    early_stop=3,
    config_dict={'xgboost.XGBClassifier': params},
    cv=5,
    scoring='accuracy')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator._optimized_pipeline)
print(tpot_estimator.evaluated_individuals_)
titanic_test1 = titanic[titanic_train.shape[0]:]
X_test = utils.select_features(rf_final_estimator, titanic_test1,
                               threshold='mean')