def classificar():
    """Extracts a dataframe from the spreadsheet, converts text variables
    to numbers and fills empty values with -1."""
    dataframe = pd.read_excel('data/data.xlsx')
    # dataframe.rename({'CODIFICAÇÃO': 'class'}, axis='columns', inplace=True)

    # Label-encoding the multi-level target variable
    encoder = LabelEncoder()
    classe_label = encoder.fit_transform(dataframe.iloc[:, 0])
    print(classe_label)

    # Encoding the two-level variable
    dest_autopecas = {'N': 0, 'S': 1}
    dataframe['DEST AUTOPECAS'] = [dest_autopecas[item] for item in dataframe['DEST AUTOPECAS']]

    # Filling empty values with a default value
    dataframe.fillna(-1, inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(
        dataframe[LABELS].values, np.array(classe_label), test_size=0.3)

    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=3)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_classif_pipeline.py')
def run_AutoTpot(self):
    # Running the AutoTpot pipeline
    automl = TPOTClassifier(generations=1, verbosity=2, config_dict='TPOT sparse')
    automl.fit(self.train, self.y_train)

    # TPOT produces ready-to-run, standalone Python code for the best-performing model,
    # in the form of a scikit-learn pipeline.
    # Exporting the best models
    automl.export(os.path.join(self.args.save_dir, 'tpot-sportswear.py'))

    print('The best pipeline discovered through auto-tpot is {}'.format(automl.fitted_pipeline_))
    print('Saving the best model discovered through TPOT.')
    # Dumping ensemble of the models
    joblib.dump(automl, os.path.join(self.args.checkpoint_dir, 'auto-tpot.pickle'))

    # Calculating time per prediction
    # Start time
    start = timeit.default_timer()

    # Predicting label, confidence probability on the test data set
    predictions = automl.predict(self.test)
    predictions_prob = automl.predict_proba(self.test)

    # Binary class values: rounding them to 0 or 1
    predictions = [round(value) for value in predictions]

    # End time
    end = timeit.default_timer()

    print('Time per prediction : {}'.format((end - start) / self.test.shape[0]))

    self.visualize(predictions, automl)
def cli(erv_data):
    # import the ERV expression data as a Pandas dataframe
    df = pd.read_csv(erv_data)
    class_codes = dict(enumerate(df['class'].astype("category").cat.categories))
    df["class"] = df["class"].astype("category").cat.codes

    # create the test and training data
    X_train, X_test, y_train, y_test = train_test_split(
        df.values[:, 2:], df.values[:, 1], train_size=0.75, test_size=0.25)

    # convert them all to floats
    X_train, X_test, y_train, y_test = (X_train.astype(float), X_test.astype(float),
                                        y_train.astype(float), y_test.astype(float))

    # create a pipeline
    pipeline_optimizer = TPOTClassifier(cv=2, verbosity=2, n_jobs=-1)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
    print(f"Validation Accuracy: {pipeline_optimizer.score(X_test, y_test)}")

    cm = ConfusionMatrix(
        [class_codes[y] for y in y_test],
        [class_codes[y] for y in
         [pipeline_optimizer.predict(x.reshape(1, -1))[0] for x in X_test]])
    cm.save_html("report")
def main():
    df_train = pd.read_csv('data/train_data.csv')
    df_valid = pd.read_csv('data/valid_data.csv')

    feature_cols = list(df_train.columns[:-1])
    target_col = df_train.columns[-1]

    X_train = df_train[feature_cols].values
    y_train = df_train[target_col].values
    X_valid = df_valid[feature_cols].values
    y_valid = df_valid[target_col].values

    tsne_data = np.load('data/tsne_2d_5p.npz')
    tsne_train = tsne_data['X_train']
    tsne_valid = tsne_data['X_valid']

    # concat features
    X_train_concat = np.concatenate([X_train, tsne_train], axis=1)
    X_valid_concat = np.concatenate([X_valid, tsne_valid], axis=1)

    # 'neg_log_loss' and cv are the current parameter spellings
    # ('log_loss' and num_cv_folds are outdated).
    tpot = TPOTClassifier(max_time_mins=60 * 24, population_size=100,
                          scoring='neg_log_loss', cv=3,
                          verbosity=2, random_state=67)
    tpot.fit(X_train_concat, y_train)
    print(tpot.score(X_valid_concat, y_valid))

    tpot.export('tpot_pipeline.py')
def run_tpot(zeros, ones):
    all_data, y = make_all_data(zeros, ones)
    X_train, X_test, y_train, y_test = train_test_split(all_data, y, test_size=.1)

    pca = PCA(n_components=15)
    X_train = pca.fit_transform(X_train)
    # transform (not fit_transform) the test set with the PCA fitted on the training set
    X_test = pca.transform(X_test)

    # if not os.path.exists('tpot_checkpoint'):
    #     os.mkdir('tpot_checkpoint')

    tpot = TPOTClassifier(
        n_jobs=-1,
        generations=50,
        verbosity=3,
        scoring='f1',
        # subsample=.5,
        # periodic_checkpoint_folder='tpot_checkpoint',
        max_eval_time_mins=30,
        memory='auto')

    tpot.fit(X_train, y_train)
    tpot.export('tpot_ecog_pipeline.py')
    results = tpot.predict(X_test)

    with open('tpot_metrics.txt', 'w') as out_file:
        out_file.write(sklearn.metrics.classification_report(y_test, results))
class TPot(Model):
    def __init__(self):
        print("Starting t pot!")

    def fit(self, X, y, title=None):
        # For this case, X and y are the complete datasets!!!
        self.pipeline_optimizer = TPOTClassifier(
            generations=5,
            cv=5,
            random_state=42,
            verbosity=3,
            n_jobs=8,
            max_eval_time_mins=1,  # 10
            scoring='f1',
            subsample=0.5
        )
        self.pipeline_optimizer.fit(X, y)
        if not os.path.exists("./automl"):
            os.makedirs("./automl")
        self.pipeline_optimizer.export('./automl/tpot_exported_pipeline_' + str(title) + '_.py')

    def predict(self, X):
        pass
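# A minimal, hedged usage sketch for the TPot wrapper above (not from the
# original source): it assumes the surrounding Model base class is importable,
# and uses make_classification as a stand-in dataset whose binary labels match
# the 'f1' scoring the class hard-codes.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=500, n_features=20, random_state=42)
tpot_model = TPot()
tpot_model.fit(X_demo, y_demo, title='demo')
# writes ./automl/tpot_exported_pipeline_demo_.py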
def main(**kwargs) -> None:
    # divides kwargs between `Featurizer` and `TPOTClassifier` kwargs.
    tpot_kwargs = {}
    keys = list(kwargs.keys())
    for k in keys:
        # getfullargspec replaces the removed inspect.getargspec
        if k in inspect.getfullargspec(TPOTClassifier).args:
            tpot_kwargs[k] = kwargs.pop(k)

    # loads all data into memory.
    paths = [os.path.join(LABELS_PATH, fname) for fname in os.listdir(LABELS_PATH)]
    X_raw, y = load_data(paths)
    X_raw.title.fillna('', inplace=True)
    X_raw.channel_title.fillna('', inplace=True)

    # splits data into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X_raw, y, random_state=SEED, train_size=TRAIN_SIZE,
        test_size=1 - TRAIN_SIZE, shuffle=True)

    # KLUDGE: preprocesses text deterministically (i.e. NOT part of the TPOT
    # hyperparameter optimization pipeline).
    featurizer = Featurizer(**kwargs)
    featurizer.fit(X_train)
    X_train = featurizer.transform(X_train)

    if 'verbosity' in tpot_kwargs and tpot_kwargs['verbosity'] > 0:
        print(f'Beginning hyper-parameter search with training data shape: {X_train.shape}.')

    tpot = TPOTClassifier(**tpot_kwargs)
    tpot.fit(X_train, y_train)
    if 'periodic_checkpoint_folder' in tpot_kwargs:
        tpot.export(os.path.join(tpot_kwargs['periodic_checkpoint_folder'], 'best_pipeline.py'))

    if 'verbosity' in tpot_kwargs and tpot_kwargs['verbosity'] > 0:
        X_test = featurizer.transform(X_test)
        print(f'Train set score: {tpot.score(X_train, y_train).round(4)}')
        print(f'Test set score: {tpot.score(X_test, y_test).round(4)}')

    return None
def tpot_generation(X_train, y_train, X_test, y_test):
    tpot = TPOTClassifier(generations=10, population_size=20, verbosity=2, n_jobs=4)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_quiniela_pipeline.py')
def tpotClassifier(train_data, target_value):
    classifier = TPOTClassifier()
    # drop the target column from the features so the label is not leaked into X
    X_train, X_test, y_train, y_test = train_test_split(
        train_data.drop(columns=[target_value]), train_data[target_value],
        train_size=0.75, test_size=0.25)
    classifier.fit(X_train, y_train)
    score: float = classifier.score(X_test, y_test)
    classifier.export('my_pipeline.py')
    return classifier, score
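# A hedged usage sketch for tpotClassifier() above (illustrative, not from the
# original source): the iris frame's 'target' column plays the role of
# target_value and is dropped from the features inside the function. Note the
# default TPOTClassifier() runs a full 100-generation search.
from sklearn.datasets import load_iris

iris_df = load_iris(as_frame=True).frame
fitted_clf, holdout_score = tpotClassifier(iris_df, 'target')
print(holdout_score)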
def clfWithTpot(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
    my_tpot = TPOTClassifier(generations=10, verbosity=2)
    my_tpot.fit(np.array(X_train), np.array(y_train))
    print(my_tpot.score(np.array(X_test), np.array(y_test)))
    my_tpot.export('exported_pipeline.py')
    predictions = my_tpot.predict(np.array(X_test))
    print(confusion_matrix(y_test, predictions))
def main():
    """
    Uses TPOT (Tree-based Pipeline Optimization Tool), an automated machine
    learning tool, to find and output the best machine learning model for the
    given dataset. See https://github.com/EpistasisLab/tpot

    Outputs the results to automodel.py
    """
    titanic = pd.read_csv('../data/titanic.csv')
    titanic.rename(columns={'Survived': 'class'}, inplace=True)

    for category in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:
        print("Number of levels in category '{0}': {1}".format(
            category, titanic[category].unique().size))

    # Encode values
    titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
    titanic['Embarked'] = titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Fill na
    titanic = titanic.fillna(-999)
    pd.isnull(titanic).any()

    # Encode values
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    CabinTrans = mlb.fit_transform([{str(val)} for val in titanic['Cabin'].values])

    # Drop unused columns
    titanic_new = titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'class'], axis=1)

    # Create numpy arrays
    titanic_new = np.hstack((titanic_new.values, CabinTrans))
    titanic_class = titanic['class'].values

    # Train test split
    # https://www.kdnuggets.com/2020/07/easy-guide-data-preprocessing-python.html
    # https://stackoverflow.com/questions/55525195/do-i-have-to-do-one-hot-encoding-separately-for-train-and-test-dataset
    training_indices, validation_indices = train_test_split(
        titanic.index, stratify=titanic_class, train_size=0.75, test_size=0.25)
    print(training_indices.size, validation_indices.size)

    # Train model
    tpot = TPOTClassifier(verbosity=2, max_time_mins=2,
                          max_eval_time_mins=0.04, population_size=40)
    tpot.fit(titanic_new[training_indices], titanic_class[training_indices])

    # Score
    print(tpot.score(titanic_new[validation_indices],
                     titanic.loc[validation_indices, 'class'].values))

    # Export
    tpot.export('automodel.py')
def test_export():
    """Assert that TPOT's export function throws a RuntimeError when no optimized pipeline exists."""
    tpot_obj = TPOTClassifier()
    try:
        tpot_obj.export("test_export.py")
        assert False  # Should be unreachable
    except RuntimeError:
        pass
def tpot_train(project, X, y, export_file, prediction_type, train_size=0.75,
               max_time_mins=1, max_eval_time_mins=0.04, population_size=40,
               scoring_func=None, n_jobs=1):
    print("==========train / test split for training size {}".format(train_size))
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)
    print(X_train.shape, y_train.shape)

    print("==========Start training the model...")
    print("==========max_time_mins: {}".format(max_time_mins))
    print("==========max_eval_time_mins: {}".format(max_eval_time_mins))
    print("==========population_size: {}".format(population_size))
    print("==========n_jobs: {}".format(n_jobs))

    # prediction type:
    # - regression
    # - classification
    if prediction_type == "classification":
        tpot = TPOTClassifier(verbosity=2,
                              max_time_mins=max_time_mins,
                              max_eval_time_mins=max_eval_time_mins,
                              population_size=population_size,
                              scoring=scoring_func,
                              n_jobs=n_jobs)
    else:
        tpot = TPOTRegressor(verbosity=2,
                             max_time_mins=max_time_mins,
                             max_eval_time_mins=max_eval_time_mins,
                             population_size=population_size,
                             scoring=scoring_func,
                             n_jobs=n_jobs,
                             warm_start=True)

    tpot.fit(X_train, y_train)

    try:
        holdout_score = tpot.score(X_test, y_test)
        print("==========holdout set score is {}".format(holdout_score))
    except Exception:
        print("==========Unexpected error when scoring holdout set")

    print("==========export tpot to {}".format(export_file))
    tpot.export(export_file)

    return tpot
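# A hedged usage sketch for tpot_train() above; the 'demo' project name and the
# export path are illustrative assumptions. scoring_func=None leaves TPOT on its
# default metric (accuracy for classification).
from sklearn.datasets import load_digits

digits = load_digits()
tpot_model = tpot_train('demo', digits.data, digits.target,
                        'digits_pipeline.py', 'classification',
                        max_time_mins=2, n_jobs=-1)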
def do_tpot(generations=5, population_size=10, X='', y=''):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20)
    tpot = TPOTClassifier(generations=generations, population_size=population_size,
                          verbosity=2, cv=3)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_pipeline.py')
    return tpot
def tpot_classifiers(self, X_train, y_train, X_test, y_test, save_path):
    print('Training using Tpot')
    pipeline_optimizer = TPOTClassifier(generations=10, population_size=25, cv=3,
                                        random_state=0, verbosity=2,
                                        scoring='balanced_accuracy')
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export(save_path + '.py')
    print(pipeline_optimizer.score(X_test, y_test))
def T_Pot(X_train, X_test, y_train, y_test):
    pipeline_optimizer = TPOTClassifier(generations=5, population_size=50, cv=5,
                                        random_state=42, verbosity=2,
                                        early_stop=3, n_jobs=-1)
    pipeline_optimizer.fit(X_train, y_train)
    print(pipeline_optimizer.score(X_test, y_test))
    pipeline_optimizer.export('pipelineOutput.py')
def run_main():
    df_train = pd.read_csv('./preprocessed_data.csv')
    df_test = pd.read_csv('./preprocessed_test_data.csv')
    X = np.array(df_train.drop(['Survived'], axis=1))
    y = np.array(df_train['Survived'])
    # print(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    tpot = TPOTClassifier(generations=20, verbosity=2)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))  # 0.824626865672
    tpot.export('tpot_exported_pipeline.py')
def tune(X_train, X_test, y_train, y_test):
    # Construct and fit TPOT classifier
    start_time = time.time()
    tpot = TPOTClassifier(generations=10, verbosity=2)
    tpot.fit(X_train, y_train)
    end_time = time.time()

    # Results
    print('TPOT classifier finished in %s seconds' % (end_time - start_time))
    print('Best pipeline test accuracy: %.3f' % tpot.score(X_test, y_test))

    # Save best pipeline as Python script file
    tpot.export('tpot_pipeline.py')
def tpot_search(X_train, X_test, y_train, y_test, target_column):
    pipeline_optimizer = TPOTClassifier(generations=30, population_size=30, cv=5,
                                        random_state=42, verbosity=2,
                                        config_dict='TPOT sparse')
    pipeline_optimizer.fit(X_train, y_train)
    # print(pipeline_optimizer.score(X_test, y_test))
    pipeline_optimizer.export('output/tpot_exported_pipeline_' + target_column + '.py')
def tpot(X_train, y_train, X_test=None, y_test=None,
         export_file='../results/models/tpot/exported_pipeline.py', n_jobs=1):
    # check each column name explicitly: `'node' and 'target' in cols`
    # only tested 'target' and was always truthy for 'node'
    if 'node' in X_train.columns and 'target' in X_train.columns:
        X_train = X_train.drop(columns=['node', 'target'])
    if 'node' in X_test.columns and 'target' in X_test.columns:
        X_test = X_test.drop(columns=['node', 'target'])

    # use the n_jobs argument instead of a hard-coded worker count
    tpot = TPOTClassifier(generations=5, population_size=40, cv=3,
                          verbosity=2, scoring='f1', n_jobs=n_jobs)
    tpot.fit(X_train, y_train)
    tpot.export(export_file)
    print(tpot.score(X_test, y_test))
def main():
    df = pd.read_csv("data/hl_test_clean.csv", encoding="utf8")
    df['book_date'] = pd.to_datetime(df['book_date'])
    trainSet = df[(df['book_date'] >= '2017-04-01') &
                  (df['book_date'] <= '2017-07-20')].reset_index(drop=True)
    testSet = df[(df['book_date'] >= '2017-07-20') &
                 (df['book_date'] <= '2017-08-31')].reset_index(drop=True)
    logger.info("============================Data is ready!============================")

    clf = XGBClassifier(learning_rate=0.01, max_depth=7, min_child_weight=15,
                        n_estimators=100, nthread=1, subsample=0.6500000000000001)
    myexe = MyExecutor(df, "fpd", clf)
    # leftVaris = myexe.get_result()
    # leftVaris = leftVaris[leftVaris.values > 7].keys()
    # print(leftVaris)
    leftVaris = [
        'hl_call_domesitc_cnt_2m', 'hl_contact_early_morning_cnt_5m',
        'hl_phone_silent_frequentcy', 'hl_contact_night_pct',
        'hl_transactions_total_amt_5m', 'hl_region_call_cnt_max_uniq_num_cnt',
        'hl_region_call_out_cnt_max_avg_call_in_time', 'hl_contact_morning_cnt_5m',
        'hl_region_call_in_time_max_avg_call_in_time', 'hl_transactions_total_amt_2m',
        'hl_contact_night_cnt_5m', 'hl_phone_num_used_time_months',
        'hl_region_call_cnt_max_avg_callin_time', 'hl_region_call_in_time_max_uniq_num_cnt',
        'hl_region_call_in_cnt_max_avg_call_out_time', 'hl_transactions_min_5m',
        'hl_region_call_out_time_max_avg_call_out_time'
    ]

    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()

    # AutoML stage (TPOT):
    pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=4,
                                        random_state=42, verbosity=2)
    pipeline_optimizer.fit(X_train, y_train)
    # print(pipeline_optimizer.score(X_test, y_test))
    pipeline_optimizer.export('tpot_exported_pipeline.py')

    getReport(pipeline_optimizer, trainSet, X_train, y_train, testSet, X_test, y_test)
def TPOT_Classifier():
    # x_train, y_train, x_test and y_test are expected to be defined at module level
    tpot = TPOTClassifier(
        verbosity=2,
        max_time_mins=390,
        population_size=40,
    )
    tpot.fit(x_train, y_train)
    tpot.export('tpot_assignment_pipeline.py')
    TPOT_predict = tpot.predict(x_test)
    score = tpot.score(x_test, y_test)
    print(score)
    print(y_test)
    print(TPOT_predict)
    return score
def find_best_model(X_train, X_test, y_train, y_test):
    pipeline_optimizer = TPOTClassifier(
        generations=100,
        population_size=50,
        cv=5,
        random_state=42,
        verbosity=2,
        config_dict='TPOT sparse'
    )
    pipeline_optimizer.fit(X_train, y_train)
    print(pipeline_optimizer.score(X_test, y_test))
    pipeline_optimizer.export('outputs/tpot_exported_pipeline.py')
def tpot_train(cat, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)
    tpot = TPOTClassifier(generations=15, population_size=20, verbosity=5,
                          n_jobs=-1, scoring='roc_auc')
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export(cat + '-pipeline.py')
def tpot_select_model(x_train, y_train, x_test, y_test):
    from tpot import TPOTClassifier

    # create instance
    tpot = TPOTClassifier(generations=10, population_size=50, verbosity=2, n_jobs=-1)
    # fit instance
    tpot.fit(x_train, y_train)
    # evaluate performance on test data
    print(tpot.score(x_test, y_test))
    # export the script used to create the best model
    tpot.export('tpot_exported_pipeline.py')
def classification():
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data, digits.target, train_size=0.75, test_size=0.25, random_state=42)

    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_digits_pipeline.py')
def Classifier(x_train, y_train, x_test, y_test):
    # x_test and y_test are taken as parameters (they were previously undefined here)
    tpot = TPOTClassifier(
        verbosity=2,
        max_time_mins=10,
        population_size=50,
    )
    tpot.fit(x_train, y_train)
    tpot.export('tpot_pipeline.py')
    TPOT_predict = tpot.predict(x_test)
    score = tpot.score(x_test, y_test)
    # print(score)
    # print(y_test)
    # print(TPOT_predict)
    return score
def run_tpot(X, y, target_ft, time_budget=30, include_preprocessors=None, n_jobs=1):
    print(n_jobs)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)

    # with generations=None, TPOT runs until the max_time_mins budget is exhausted
    if include_preprocessors:
        pipeline_optimizer = TPOTClassifier(max_time_mins=time_budget // 60,
                                            generations=None,
                                            use_dask=False,
                                            # template="Selector-Transformer-Classifier",
                                            n_jobs=n_jobs)
    else:
        pipeline_optimizer = TPOTClassifier(max_time_mins=time_budget // 60,
                                            generations=None,
                                            use_dask=False,
                                            template='Classifier',
                                            n_jobs=n_jobs)

    pipeline_optimizer.fit(X_train, y_train)
    y_hat = pipeline_optimizer.predict(X_test)
    acc = sklearn.metrics.accuracy_score(y_test, y_hat)
    f1_s = sklearn.metrics.f1_score(y_test, y_hat, average='weighted')

    metrs = []
    metrs.append("Accuracy score - " + str(acc))
    metrs.append("F1 score - " + str(f1_s))
    res = ["", "", "", "", f1_s, acc, "", pipeline_optimizer.export()]

    return str(metrs), res
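# A hedged usage sketch for run_tpot() above (the dataset choice is an
# assumption). time_budget appears to be in seconds and is floor-divided by 60,
# so budgets under 60 seconds would give TPOT max_time_mins=0; 120 seconds
# yields a 2-minute search.
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
metrics_str, result_row = run_tpot(cancer.data, cancer.target, target_ft='target',
                                   time_budget=120, n_jobs=-1)
print(metrics_str)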
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    tp = TPOTClassifier(verbosity=3)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_dont_overfit.py')
    time = timer(start_time)
    preds = tp.predict(X_test)

    time_out = open(name_dataset + '_' + 'tpot', "w")
    time_out.write(time)
    time_out.close()

    submission = pd.DataFrame({"id": id_test, "target": preds})
    submission.to_csv(name_dataset + '_' + 'tpot' + '_submission.csv', index=False)
def tpot_optimization_clf(count, train_path, test_path, verbose=False):
    """
    Optimize algorithms and parameters using TPOT for classification trees.

    :param count: int, number of samples to be generated.
    :param train_path: string, path to the dataset used for training.
    :param test_path: string, path to the dataset used for testing.
    :param verbose: bool, whether information about the process should be displayed.
    """
    # Generate samples.
    if verbose:
        print("Get train samples. ")
    X_train, Y_train = Sampler.generate_samples(dataset=train_path, count=count)
    if verbose:
        print("Get test samples. ")
    X_test, Y_test = Sampler.generate_samples(dataset=test_path, count=count)

    tpot_config = {
        'xgboost.XGBClassifier': {
            'max_depth': [2, 3, 4, 5],
            'learning_rate': [0.02, 0.05, 0.1, 0.15, 0.2],
            'n_estimators': [10, 20, 30, 40, 50, 100, 500],
            'objective': ["reg:linear", "multi:softmax", "multi:softprob"],
            'booster': ["gbtree", "gblinear", "dart"],
            'n_jobs': [-1]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 20, 30, 40, 50, 100, 500],
            'criterion': ["gini", "entropy"],
            'max_features': ["auto", "sqrt", "log2"],
            'max_depth': [2, 3, 4, 5],
            'n_jobs': [-1]
        }
    }

    if verbose:
        print("Start TPOT optimization. ")

    tpot = TPOTClassifier(generations=10, population_size=30,
                          verbosity=2, config_dict=tpot_config)
    tpot.fit(np.array(X_train), np.array(Y_train))
    print(tpot.score(np.array(X_test, dtype=np.float64),
                     np.array(Y_test, dtype=np.float64)))
    tpot.export('tpot_pipeline_clf.py')
def test_export():
    """Assert that TPOT's export function throws a RuntimeError when no optimized pipeline exists."""
    tpot_obj = TPOTClassifier()
    assert_raises(RuntimeError, tpot_obj.export, "test_export.py")

    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5), ZeroCount(input_matrix)), '
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
        ')'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._optimized_pipeline = pipeline
    tpot_obj.export("test_export.py")
    assert path.isfile("test_export.py")

    remove("test_export.py")  # clean up exported file
def generate_model(generations, train_X, train_y):
    tpot_generator = TPOTClassifier(generations=generations, verbosity=2)
    tpot_generator.fit(train_X, train_y)
    # generations is an int and must be converted to str before concatenation
    tpot_generator.export('tpot_model' + str(generations) + '.py')
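# A hedged usage sketch for generate_model() above; the wine dataset is an
# illustrative stand-in for the caller's training data. With generations=5
# this writes tpot_model5.py.
from sklearn.datasets import load_wine

wine = load_wine()
generate_model(5, wine.data, wine.target)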
HPI = HPI.join(benchmark['United States'])
# all in percentage change since the start of the data (1975-01-01)
HPI.dropna(inplace=True)

housing_pct = HPI.pct_change()
housing_pct.replace([np.inf, -np.inf], np.nan, inplace=True)
housing_pct['US_HPI_future'] = housing_pct['United States'].shift(-1)
housing_pct.dropna(inplace=True)


def create_labels(cur_hpi, fut_hpi):
    if fut_hpi > cur_hpi:
        return 1
    else:
        return 0


housing_pct['label'] = list(map(create_labels, housing_pct['United States'],
                                housing_pct['US_HPI_future']))
# housing_pct['ma_apply_example'] = housing_pct['M30'].rolling(window=10).apply(moving_average)
# print(housing_pct.tail())

X = np.array(housing_pct.drop(['label', 'US_HPI_future'], axis=1))
y = np.array(housing_pct['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

tpot = TPOTClassifier(generations=10, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('HPI_tpot_pipeline.py')
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')
# features = data
# tpot_data = pd.DataFrame({'class': label}, columns=['class'])
# training_features, testing_features, training_classes, testing_classes = \
#     train_test_split(features, tpot_data['class'], random_state=42)
data, label, idx_row = np.concatenate(samples), np.concatenate(label), np.arange(0, len(label), 1)

print('shuffle')
for ii in range(100):
    shuffle(idx_row)
data, label = data[idx_row, :], label[idx_row]

X_train, X_test, y_train, y_test = train_test_split(data, label, train_size=0.80)

print('model selection')
# cv replaces the outdated num_cv_folds parameter
tpot = TPOTClassifier(generations=10, population_size=25,
                      verbosity=2, random_state=373849,
                      cv=5, scoring='roc_auc')
tpot.fit(X_train, y_train)
tpot.score(X_test, y_test)
tpot.export('%s%s_tpot_exported_pipeline.py' % (folder, type_))
print('finished model selection')
"""
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import KFold

exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(lambda X: X),
        FunctionTransformer(lambda X: X)
    ),
    SelectFwe(alpha=0.05, score_func=f_classif),
# Add origin encoding
for origin_column in list(origin_dummies):
    sample_df[origin_column] = origin_dummies[origin_column]

X_train, X_test, y_train, y_test = train_test_split(sample_df, labels, train_size=0.7)
le = preprocessing.LabelEncoder()

tpot = TPOTClassifier(generations=7, population_size=15, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_cars_pipeline.py')

# tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
# Best pipeline: GradientBoostingClassifier(RobustScaler(input_matrix), GradientBoostingClassifier__learning_rate=1.0, GradientBoostingClassifier__max_depth=5, GradientBoostingClassifier__max_features=0.25, GradientBoostingClassifier__min_samples_leaf=DEFAULT, GradientBoostingClassifier__min_samples_split=17, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=0.7)
# 0.770491803279

# tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
# Best pipeline: ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=False, ExtraTreesClassifier__criterion=DEFAULT, ExtraTreesClassifier__max_features=0.45, ExtraTreesClassifier__min_samples_leaf=1, ExtraTreesClassifier__min_samples_split=7, ExtraTreesClassifier__n_estimators=DEFAULT)
# 0.762295081967

# Without MPG
# tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
# Best pipeline: ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=DEFAULT, ExtraTreesClassifier__criterion=gini, ExtraTreesClassifier__max_features=0.45, ExtraTreesClassifier__min_samples_leaf=1, ExtraTreesClassifier__min_samples_split=6, ExtraTreesClassifier__n_estimators=DEFAULT)
# 0.754098360656

# All features set