def run_example(self):
    """Fit a TPOT regressor on the churn CSV files and return its test score.

    Reads ``./data/churn-train.csv`` and ``./data/churn-test.csv``,
    integer-encodes every object-typed column, runs a TPOT search scored by
    negative mean absolute error and exports the best pipeline to
    ``tpot_iris_pipeline.py``.

    Returns:
        The value of ``tpot.score(X_test, y_test)``.
    """
    target_col = 'churn_probability'
    train = pd.read_csv("./data/churn-train.csv")
    test = pd.read_csv("./data/churn-test.csv")

    categorical_cols = train.columns[train.dtypes == object].tolist()

    # Fit one encoder per column on the training data only and reuse its
    # label -> code mapping for the test data. Re-fitting on the test file
    # would assign different codes to the same category whenever the two
    # files do not contain exactly the same set of values.
    for col in categorical_cols:
        encoder = LabelEncoder().fit(train[col])
        codes = {label: code for code, label in enumerate(encoder.classes_)}
        train[col] = encoder.transform(train[col])
        # Categories that never occurred in the training file become -1.
        test[col] = test[col].map(codes).fillna(-1).astype(int)

    X_train = train.drop(columns=[target_col]).to_numpy()
    y_train = train[target_col].to_numpy()
    X_test = test.drop(columns=[target_col]).to_numpy()
    y_test = test[target_col].to_numpy()

    tpot = TPOTRegressor(generations=5,
                         population_size=50,
                         verbosity=2,
                         random_state=42,
                         scoring='neg_mean_absolute_error',
                         cv=5)
    tpot.fit(X_train, y_train)

    # Score once and reuse the value for both the printout and the return.
    score = tpot.score(X_test, y_test)
    print(score)
    tpot.export('tpot_iris_pipeline.py')
    return score
def train_tpot(l=None):
    """Run a 100-generation TPOT regression search and export the result.

    Args:
        l: object exposing ``X_train`` and ``y_train``; when ``None`` it is
            obtained from ``get_data()``.

    Returns:
        The result of ``attributedict_from_locals('model')``.
    """
    # can also do directly from the command line
    if l is None:
        l = get_data()

    search_settings = dict(
        generations=100,
        population_size=100,
        offspring_size=None,
        mutation_rate=0.9,
        crossover_rate=0.1,
        scoring=None,
        cv=5,
        subsample=1.0,
        n_jobs=-1,
        max_time_mins=None,
        max_eval_time_mins=5,
        random_state=None,
        config_dict=None,
        warm_start=False,
        memory=_tpot_cache,
        use_dask=False,
        periodic_checkpoint_folder='tpot_periodic_checkpoint',
        early_stop=None,
        verbosity=1,
        disable_update_check=False,
    )
    # The local must stay named ``model``: it is looked up by that name below.
    model = TPOTRegressor(**search_settings)

    # Copies are passed to be safe against in-place modification.
    model.fit(l.X_train.copy(), l.y_train.copy())
    model.export('tpot_exported_pipeline.py')
    return attributedict_from_locals('model')
def train_and_pickle_best_model(target, X, y, val_X, val_y):
    """Search for a pipeline with TPOT, report validation metrics and save it.

    Args:
        target: label used in the log line and in the output file names.
        X, y: data the TPOT search is fitted on.
        val_X, val_y: data the MAE, RMSE and R^2 are computed on.

    Returns:
        Tuple ``(r2, mae, rmse)`` measured on the validation data.
    """
    print('AutoML Search for good model for {}'.format(target))

    search = TPOTRegressor(generations=10,
                           population_size=150,
                           cv=3,
                           random_state=0xDEADBEEF,
                           verbosity=3,
                           scoring='r2',
                           n_jobs=-1,
                           early_stop=5,
                           periodic_checkpoint_folder='tpot_checkpoint')
    search.fit(X, y)

    val_predictions = search.predict(val_X)
    mae = mean_absolute_error(val_y, val_predictions)
    rmse = sqrt(mean_squared_error(val_y, val_predictions))
    r2 = r2_score(val_y, val_predictions)

    report = (("TPOT mae:", mae), ("TPOT rmse:", rmse), ("TPOT R^2 score:", r2))
    for label, value in report:
        print(label, value)

    # Persist both the generated source code and the fitted pipeline object.
    export_path = 'models/tpot_exported_pipeline_{}.py'.format(target)
    model_path = 'models/{}-best-model-automl.joblib'.format(target)
    search.export(export_path)
    dump(search.fitted_pipeline_, model_path)

    return r2, mae, rmse
def train_tpot(name, X, y, gen, cores):
    """Train TPOT on a 75/25 split, print score and timing, and save the model.

    Args:
        name: text inserted into the run name.
        X, y: full data set; ``y_train`` is flattened with ``reshape(-1, )``.
        gen: number of TPOT generations.
        cores: value passed to TPOT as ``n_jobs``.
    """
    date_stamp = time.strftime('%y%m%d')
    test_name = 'gen_' + str(gen) + name + '_' + date_stamp
    print('Training with TPOT .... ', test_name)

    started = time.time()
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        test_size=0.25)
    tpot = TPOTRegressor(generations=gen,
                         population_size=50,
                         verbosity=2,
                         n_jobs=cores)
    tpot.fit(X_train, y_train.reshape(-1, ))
    print(tpot.score(X_test, y_test))
    delta_time = time.time() - started
    print('Time to train...:', delta_time)

    print('Saving the model ...')
    output_base = 'trained_models/' + test_name
    tpot.export(output_base + '.py')
    joblib.dump(tpot.fitted_pipeline_, output_base + '.pk1')
    print(test_name, ' saved ... ')
def tpot(use_dask=True):
    """Fit a TPOT regressor on ``elo/data/augmented_train.csv`` and export it.

    Args:
        use_dask: when true, connect a Dask ``Client`` to the hard-coded
            scheduler address and fit inside ``ProgressBar``/``Profiler``
            contexts; also forwarded to TPOT as ``use_dask``.

    Returns:
        The fitted ``TPOTRegressor``.
    """
    # TODO: Add some documentation...
    # TODO: Investigate why tpot crashes when using Dask (probably a RAM problem).
    if use_dask:
        client = Client("tcp://192.168.1.94:8786")
        print(client)

    tpot_reg = TPOTRegressor(generations=TPOT_GENERATIONS,
                             population_size=TPOT_POPULATION_SIZE,
                             random_state=SEED,
                             cv=CV,
                             use_dask=use_dask,
                             verbosity=2,
                             memory="auto")

    df = pd.read_csv("elo/data/augmented_train.csv")
    print(df.sample(5))

    # TODO: Find a better way to impute inf and missing values.
    # Infinities are first turned into NaN so the median fill covers them too.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(df.median())

    X = df.drop(FEATS_EXCLUDED, axis=1, errors='ignore').values
    y = df.loc[:, "target"].values

    if not use_dask:
        tpot_reg.fit(X, y)
    else:
        with ProgressBar(), Profiler():
            tpot_reg.fit(X, y)

    export_file = Path('elo/data/tpot_few_generations_augmented_dataset.py')
    tpot_reg.export(str(export_file.absolute()))
    return tpot_reg
def tpot_test(conf):
    """Run a one-hour TPOT search that predicts the next row's ``DR`` value.

    The features of each row are paired with ``DR`` of the following row
    (``shift(-1)``); the last row has no successor and is dropped.

    Args:
        conf: configuration handed to ``p.load_config``.
    """
    from tpot import TPOTRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import TimeSeriesSplit

    p.load_config(conf)
    ds = dl.load_price_data()
    ds = add_features(ds)
    X = ds[p.feature_list][:-1]
    y = ds['DR'].shift(-1)[:-1]

    # Split Train and Test.
    # shuffle=False keeps the rows in their original order: the search is
    # cross-validated with TimeSeriesSplit, which only makes sense on ordered
    # data, and a shuffled hold-out would mix later rows into the training
    # part.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        test_size=0.2,
                                                        shuffle=False)

    tpot = TPOTRegressor(n_jobs=-1,
                         verbosity=2,
                         max_time_mins=60,
                         cv=TimeSeriesSplit(n_splits=3))
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('./tpot_out.py')
def tpotregressor(self, i):
    """Fit TPOT on ``self.X`` / ``self.y`` and export the pipeline for day ``i``.

    Args:
        i: value appended (via ``str``) to the exported file name.

    Returns:
        None.
    """
    print("TPOTRegressor")
    search = TPOTRegressor(generations=50,
                           population_size=50,
                           verbosity=2,
                           random_state=42)
    search.fit(self.X, self.y)

    export_name = 'tpot_covid_pipeline_day_' + str(i) + '.py'
    search.export(export_name)
    print("\n")
    return None
def tpotRegressor(train_data, target_value):
    """Fit a default TPOT regressor on a 75/25 split of ``train_data``.

    Args:
        train_data: table containing both the features and the target column.
        target_value: name of the target column in ``train_data``.

    Returns:
        Tuple ``(regressor, score)`` with the fitted ``TPOTRegressor`` and
        its score on the held-out 25%. The best pipeline is also exported to
        ``my_pipeline.py``.
    """
    # The target column must not be part of the feature matrix; leaving it
    # in lets every model read the answer directly from its inputs.
    features = train_data.drop(columns=[target_value])
    target = train_data[target_value]

    regressor = TPOTRegressor()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        train_size=0.75,
                                                        test_size=0.25)
    regressor.fit(X_train, y_train)
    score = regressor.score(X_test, y_test)
    regressor.export('my_pipeline.py')
    return regressor, score
def auto_ml(X_train, X_test, y_train, y_test):
    """Run a TPOT search on the training data, print the test score, export.

    Args:
        X_train, y_train: data the search is fitted on.
        X_test, y_test: data the printed score is computed on.
    """
    search_options = {
        'generations': 30,
        'population_size': 200,
        'verbosity': 2,
        'periodic_checkpoint_folder': "tpot_checkpoint/",
    }
    search = TPOTRegressor(**search_options)
    search.fit(X_train, y_train)

    test_score = search.score(X_test, y_test)
    print(test_score)
    search.export('tpot_pipeline.py')
def tpot_regression(x_calib,
                    y_calib,
                    x_prod,
                    y_prod,
                    results_direct,
                    cv_folds,
                    error_metric,
                    num_jobs,
                    gens,
                    pop,
                    mins,
                    mins_per_pipeline,
                    verbose,
                    early_stop_generations,
                    tpot_config_dict,
                    model_name='tpot_best'):
    """Fit TPOT on calibration data and write model, predictions and errors.

    All outputs are written under ``results_direct``, which is used as a
    plain string prefix (so it is expected to end with a path separator).

    Args:
        x_calib, y_calib: data the TPOT search is fitted on.
        x_prod, y_prod: data used for the reported production errors.
        results_direct: output directory prefix.
        cv_folds, error_metric, num_jobs, gens, pop, mins,
            mins_per_pipeline, verbose, early_stop_generations,
            tpot_config_dict: forwarded to ``TPOTRegressor`` as ``cv``,
            ``scoring``, ``n_jobs``, ``generations``, ``population_size``,
            ``max_time_mins``, ``max_eval_time_mins``, ``verbosity``,
            ``early_stop`` and ``config_dict`` respectively.
        model_name: stem of every output file name.
    """
    checkpoint_folder = results_direct + 'checkpoint_folder/'
    # parents/exist_ok: also works when ``results_direct`` itself does not
    # exist yet, and does not fail if the folder appears between a check and
    # the creation.
    Path(checkpoint_folder).mkdir(parents=True, exist_ok=True)

    ml_model = TPOTRegressor(generations=gens,
                             population_size=pop,
                             scoring=error_metric,
                             max_time_mins=mins,
                             cv=cv_folds,
                             verbosity=verbose,
                             n_jobs=num_jobs,
                             early_stop=early_stop_generations,
                             max_eval_time_mins=mins_per_pipeline,
                             config_dict=tpot_config_dict,
                             periodic_checkpoint_folder=checkpoint_folder)
    ml_model.fit(x_calib, y_calib)

    # save entire pipeline
    output_stem = results_direct + model_name
    ml_model.export(output_stem + '.py')
    joblib.dump(ml_model.fitted_pipeline_, output_stem + '.sav')

    # for cross validation errors see the exported model py file

    # production - results and errors
    y_prod_predict = ml_model.predict(x_prod)
    np.save(output_stem + '_prod_predicted.npy', y_prod_predict)

    df_prod_errors = pd.DataFrame(index=[
        'Mean Squared Error', 'Median Absolute Error',
        'Correlation Coefficient', 'R2'
    ])
    df_prod_errors['TPOT Best'] = [
        mean_squared_error(y_prod, y_prod_predict),
        median_absolute_error(y_prod, y_prod_predict),
        # Off-diagonal entry of the 2x2 correlation matrix.
        np.corrcoef(y_prod, y_prod_predict)[0][-1],
        r2_score(y_prod, y_prod_predict)
    ]
    df_prod_errors.to_csv(output_stem + '_prod_errors.csv')
def go_tpot():
    """Run a short TPOT search on the module-level train/test data.

    Uses ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the
    enclosing scope, prints the test score and exports the best pipeline to a
    time-stamped file under ``../models/``.
    """
    from tpot import TPOTRegressor
    import datetime

    # The scorer is named 'neg_mean_absolute_error' (scorers follow a
    # greater-is-better convention); plain 'mean_absolute_error' is not an
    # accepted scoring name.
    tpot = TPOTRegressor(generations=5,
                         population_size=20,
                         verbosity=3,
                         scoring='neg_mean_absolute_error')
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))

    stamp = datetime.datetime.now().strftime('%Y.%m.%d_%H%M%S')
    tpot.export('../models/tpot_pipeline_' + stamp + '.py')
def tpotting():
    """Deprecated: fit and export one TPOT regressor per target.

    Reads the module-level ``tra_df``; the four integer targets are
    placeholders.
    """
    from tpot import TPOTRegressor

    # replace with real targets
    targets = range(4)
    for target in targets:
        checkpoint_dir = 'out/out_{}'.format(target)
        export_path = 'out/tpotted_{}.py'.format(target)

        tpot = TPOTRegressor(verbosity=2,
                             cv=5,
                             random_state=2017,
                             n_jobs=4,
                             periodic_checkpoint_folder=checkpoint_dir)
        tpot.fit(tra_df['x_cols'], tra_df[target])
        tpot.export(export_path)
def regression():
    """Fit TPOT on the Boston housing data, print the test score and export.

    NOTE(review): ``load_boston`` is not available in recent scikit-learn
    releases - confirm the pinned version before relying on this example.
    """
    housing = load_boston()
    features, prices = housing.data, housing.target

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        prices,
                                                        train_size=0.75,
                                                        test_size=0.25,
                                                        random_state=42)

    search = TPOTRegressor(generations=5,
                           population_size=50,
                           verbosity=2,
                           random_state=42)
    search.fit(X_train, y_train)

    test_score = search.score(X_test, y_test)
    print(test_score)
    search.export('tpot_boston_pipeline.py')
def regression(self, timeMax=60):
    """Run a TPOT search scored by RMSE and persist pipeline and history.

    Writes three files under ``self.savePath``: the exported pipeline, the
    pickled ``evaluated_individuals_`` and the pickled
    ``pareto_front_fitted_pipelines_``.

    Args:
        timeMax: passed to TPOT as ``max_time_mins``.
    """

    def rmse_scorer(y_true, y_pred):
        # Root of the MSE taken explicitly instead of via ``squared=False``,
        # which newer scikit-learn releases no longer accept.
        # NOTE(review): identical for a single target; for multi-output
        # targets this is the root of the averaged MSE - confirm if relevant.
        return mean_squared_error(y_true, y_pred) ** 0.5

    my_custom_scorer = make_scorer(rmse_scorer, greater_is_better=False)

    print(f"Starting regression with {self.modelName}")

    X_train, X_test, y_train, y_test = self.dataFunction(
        preprocessed=self.preprocessed,
        specifics="TPOT",
        trainSize=self.trainSize,
        nDataPoints=self.nDataPoints)

    # Change dict for prediction model
    config_copy = regressor_config.copy()
    config_copy.update(self.model)

    # TPOT automated feature engineering
    start_time = time.time()
    tpot = TPOTRegressor(generations=self.generations,
                         population_size=self.popSize,
                         verbosity=2,
                         config_dict=config_copy,
                         max_time_mins=timeMax,
                         max_eval_time_mins=30,
                         cv=4,
                         scoring=my_custom_scorer)
    tpot.fit(X_train, y_train)

    # Whole minutes elapsed.
    total_time = int(divmod(time.time() - start_time, 60)[0])
    print(tpot.evaluated_individuals_)
    print(f"Time: {total_time}")

    # prediction score
    # The scorer is registered with greater_is_better=False, so the score is
    # the negated RMSE; flip the sign back before reporting.
    predictionScore = int(-tpot.score(X_test, y_test))
    print(f"Final RMSE prediction score: {predictionScore}")

    # Export model
    tpot.export(
        f'{self.savePath}/time{total_time}_score{predictionScore}_trainSize{self.trainSize}_PIPE.py'
    )

    # Export History
    with open(f'{self.savePath}/performance_history.pkl', "wb") as handle:
        pickle.dump(tpot.evaluated_individuals_, handle)

    # Export pareto front
    with open(f'{self.savePath}/PARETO.pkl', "wb") as handle:
        pickle.dump(tpot.pareto_front_fitted_pipelines_, handle)
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name,
                  target_name):
    """Fit a default TPOT regressor, record the run time and write predictions.

    Side effects: exports ``tpot_pipeline_dont_overfit.py``, writes the
    elapsed time to ``<name_dataset>_tpot`` and the predictions to
    ``submission_<name_dataset>_tpot.csv``.

    Args:
        X_train, y_train: data the search is fitted on.
        X_test: rows to predict.
        id_test: identifiers written next to the predictions.
        name_dataset: text used in both output file names.
        id_name: header of the identifier column.
        target_name: header of the prediction column.
    """
    tp = TPOTRegressor(verbosity=2)

    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_dont_overfit.py')
    # Named ``elapsed`` so the ``time`` module name is not shadowed.
    elapsed = timer(start_time)

    preds = tp.predict(X_test)

    # Context manager guarantees the handle is closed even if write() fails.
    with open(name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({id_name: id_test, target_name: preds})
    submission.to_csv('submission_' + name_dataset + '_' + 'tpot.csv',
                      index=False)
def model_selection_and_HPO(dataframe,
                            target="job_performance",
                            test_size=0.25,
                            r_seed=123):
    """
    Pass in the dataframe that has gone through feature selection.

    Uses the TPOT regressor module from TPOT to perform model selection (MS)
    and hyperparameter optimization (HPO). As this modeling uses some element
    of stochasticity, it may provide different results every time. The longer
    you run this, the more similar the final models will look in the end.

    Finally outputs a .py file with the selected model and its
    hyperparameters, which we can import.

    Args:
        dataframe: table holding the features and the ``target`` column.
        target: name of the column to predict.
        test_size: fraction of rows held out for scoring.
        r_seed: seed used for both the split and the TPOT search.
    """
    # Only the lowercase ``tpot`` package exists; a bare ``import TPOT``
    # fails before any work is done, so it is not imported here.
    import timeit

    from sklearn.model_selection import train_test_split
    from tpot import TPOTRegressor

    # train test split
    features = dataframe.loc[:, dataframe.columns != target].values
    labels = dataframe[target].values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=test_size,
                                                        random_state=r_seed)

    # model selection and hyperparameter optimization with TPOT Regressor
    tpot_regressor = TPOTRegressor(generations=20,
                                   population_size=50,
                                   cv=10,
                                   random_state=r_seed,
                                   verbosity=2,
                                   memory='auto')

    start_time = timeit.default_timer()
    tpot_regressor.fit(X_train, y_train)
    # The prediction pass is part of the timed section; its result is unused.
    tpot_regressor.predict(X_test)
    end_time = timeit.default_timer()

    print(f"Total runtime for the Employee dataset: {end_time-start_time}s")
    print("TPOT Score: {}".format(tpot_regressor.score(X_test, y_test)))
    tpot_regressor.export('tpot_exported_pipeline.py')
def train_gpr_tpot(l=None):
    """Tune a Gaussian-process regressor with TPOT and export the pipeline.

    The search space is restricted to ``GaussianProcessRegressor`` (over a
    log-spaced ``alpha`` grid) plus ``FeatureUnion``, ``QuantileTransformer``
    and ``MinMaxScaler``.

    Args:
        l: object exposing ``X_train`` and ``y_train``; when ``None`` it is
            obtained from ``get_data()``.

    Returns:
        The result of ``attributedict_from_locals('model')``.
    """
    # with auto tuning
    if l is None:
        l = get_data()

    alpha_grid = np.logspace(-10, 1, 12)
    config_dict = {
        'sklearn.gaussian_process.GaussianProcessRegressor': {
            'alpha': alpha_grid,
        },
        'sklearn.pipeline.FeatureUnion': {},
        'sklearn.preprocessing.QuantileTransformer': {},
        'sklearn.preprocessing.MinMaxScaler': {},
        # 'competitions.MyGP': {
        #     'alpha':np.logspace(-10, 1, 12),
        #     'mu_x': np.logspace(-1, 2, 4),
        #     'mu_y': np.logspace(-1, 2, 4),
        # }
    }

    # The local must stay named ``model``: it is looked up by that name below.
    model = TPOTRegressor(
        generations=10,
        population_size=100,
        offspring_size=None,
        mutation_rate=0.9,
        crossover_rate=0.1,
        scoring=None,
        cv=5,
        subsample=1.0,
        n_jobs=-1,
        max_time_mins=None,
        max_eval_time_mins=5,
        random_state=None,
        config_dict=config_dict,
        warm_start=False,
        # memory=os.path.join(_mydir, 'tpot_cache'),
        use_dask=False,
        # periodic_checkpoint_folder='periodic_checkpoint_gpr_tpot',
        early_stop=None,
        verbosity=3,
        disable_update_check=False)

    features = l.X_train.copy()
    target = l.y_train.copy().squeeze()
    model.fit(features, target)
    model.export('tpot_gpr.py')
    return attributedict_from_locals('model')
def TPOTRegressor(ATM):
    """Fit a TPOT regressor from ``ATM`` inputs/props and report the result.

    Args:
        ATM: object providing ``inputs`` (``"X"``, ``"y"``), ``props``
            (``"generations"``, ``"population_size"``, ``"verbosity"``,
            ``"random_state"``) and the ``report`` / ``save`` methods.
    """
    # This function has the same name as the TPOT estimator class. If it is
    # defined at module level, a bare ``TPOTRegressor(...)`` call in here
    # resolves to this function and fails, so the class is imported under a
    # private alias.
    from tpot import TPOTRegressor as _TPOTEstimator

    X = ATM.inputs["X"]
    y = ATM.inputs["y"]

    tpot = _TPOTEstimator(generations=ATM.props["generations"],
                          population_size=ATM.props["population_size"],
                          verbosity=ATM.props["verbosity"],
                          random_state=ATM.props["random_state"])
    tpot.fit(X, y)

    # NOTE(review): ``payload`` and ``y_test`` are not defined in this
    # function; presumably module-level names - confirm where the hold-out
    # data is meant to come from.
    ATM.report({
        'name': "stats",
        'stats': {
            'score': tpot.score(payload.X_test, y_test)
        }
    })

    # Export once and reuse the result for both the log entry and the save.
    exported_model = tpot.export()
    ATM.report({'name': "log", 'payload': {'model': exported_model}})
    ATM.save("model.tpot", exported_model)
def show_data(dataset_train, classifier_name, params):
    """Show the data set in Streamlit, train TPOT on it and pickle the model.

    The first column of ``dataset_train`` is used as the target and the
    remaining columns as features.

    Args:
        dataset_train: table whose ``.values`` holds target and features.
        classifier_name: label shown in the UI.
        params: mapping that provides the number of generations under the
            key ``'2.1 Tune parameter: Generation (Epoch)'``.
    """
    st.write("Training dataset:", dataset_train)

    values = dataset_train.values
    X = values[:, 1:]
    y = values[:, 0]

    st.write('Shape of dataset:', X.shape, '=> ', X.shape[0], 'rows and ',
             X.shape[1], 'columns of dataset')
    st.write(f'Classifier = {classifier_name}', '=> model to train the dataset')

    generation = params['2.1 Tune parameter: Generation (Epoch)']

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        test_size=0.25,
                                                        random_state=42)
    tpot = TPOTRegressor(generations=generation,
                         population_size=50,
                         verbosity=2,
                         random_state=42)
    tpot.fit(X_train, y_train)
    tpot.export('tpot_boston_pipeline.py')

    # NOTE(review): no ``scoring`` is passed, so this presumably is TPOT's
    # default negated-MSE score, hence the abs() - confirm.
    MSE = abs(tpot.score(X_test, y_test))
    st.write("MSE (Mean Squared Error):", MSE.round(2))

    # save the model to disk
    # Only the fitted pipeline is pickled, see
    # https://github.com/EpistasisLab/tpot/issues/11#issuecomment-341421022
    # NOTE(review): ``filename`` is not defined in this function; presumably
    # a module-level name - confirm.
    # The context manager closes the file, which a bare open() never did.
    with open(filename, 'wb') as model_file:
        pickle.dump(tpot.fitted_pipeline_, model_file)
def build_model(self, X, y):
    """Optimise a TPOT pipeline on mean-imputed data and return it wrapped.

    Args:
        X: feature matrix; NaNs are replaced by the column mean before the
            search.
        y: target values.

    Returns:
        A ``Pipeline`` made of the fitted imputer followed by TPOT's
        ``fitted_pipeline_``.
    """
    cfg = self.config

    # Perform missing value imputation as scikit-learn models can't handle NaN's
    nan_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    X = nan_imputer.fit_transform(X)

    pipeline_optimizer = TPOTRegressor(
        generations=cfg.generations,
        population_size=cfg.population_size,
        offspring_size=cfg.offspring_size,
        mutation_rate=cfg.mutation_rate,
        crossover_rate=cfg.crossover_rate,
        scoring=cfg.scoring,
        cv=cfg.cv,
        subsample=cfg.subsample,
        n_jobs=-1,
        max_time_mins=cfg.max_time_mins,
        max_eval_time_mins=cfg.max_eval_time_mins,
        random_state=cfg.seed,
        config_dict=cfg.classifier_config_dict,
        warm_start=cfg.warm_start,
        memory=cfg.artifacts_directory,
        verbosity=1)

    # Fit TPOT to data
    pipeline_optimizer.fit(X, y)
    self.logger.info("Finished running TPOT optimization pipeline.")

    # Export fitted pipeline to artifacts directory
    pipeline_path = os.path.join(cfg.artifacts_directory, "TPOT_pipeline.py")
    pipeline_optimizer.export(pipeline_path)
    self.logger.info(f"Saving best pipeline to {pipeline_path}")

    # Create new pipeline which contains nan_imputer
    steps = [
        ("nan_imputer", nan_imputer),
        ("tpot_pipeline", pipeline_optimizer.fitted_pipeline_),
    ]
    return Pipeline(steps)
def ensemble_tpot(city, state, target, horizon, lookback):
    """Fit a TPOT regressor that predicts ``target`` ``horizon`` steps ahead.

    Args:
        city: city identifier passed to ``get_cluster_data`` and used in the
            exported file name and plot title.
        state: selects the pickled cluster file
            ``../analysis/clusters_<state>.pkl``.
        target: name of the column to predict.
        horizon: forecast horizon; the target is shifted by ``horizon - 1``.
        lookback: passed to ``build_lagged_features``.

    Returns:
        The value returned by ``plot_prediction``.
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city,
                                   clusters=clusters,
                                   data_types=DATA_TYPES,
                                   cols=PREDICTORS)

    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    casos_columns = ['casos_{}'.format(i) for i in group]

    data = data.drop(casos_columns, axis=1)
    data_lag = build_lagged_features(data, lookback)
    # dropna() returns a new frame; without the assignment the rows with
    # missing values were never actually removed.
    data_lag = data_lag.dropna()

    X_data = data_lag.drop(casos_est_columns, axis=1)
    # Chronological split; only the feature halves are needed because the
    # targets are rebuilt below from the shifted series.
    X_train, X_test = train_test_split(X_data,
                                       train_size=0.7,
                                       test_size=0.3,
                                       shuffle=False)

    lead = horizon - 1
    tgt_full = data_lag[target].shift(-lead)
    # Trim the tail left without a value by the shift. For horizon == 1 there
    # is nothing to trim, and slicing with [:-0] would empty the series.
    if lead:
        tgt_full = tgt_full[:-lead]
    tgt = tgt_full[:len(X_train)]
    tgtt = tgt_full[len(X_train):]

    model = TPOTRegressor(generations=20,
                          population_size=100,
                          verbosity=2,
                          n_jobs=32)
    model.fit(X_train, target=tgt)
    model.export('tpot_{}_pipeline.py'.format(city))
    print(model.score(X_test[:len(tgtt)], tgtt))

    pred = plot_prediction(X_data[:len(tgt_full)], tgt_full, model,
                           'Out_of_Sample_{}_{}'.format(horizon, city),
                           horizon)
    plt.show()
    return pred
def run_tpot(random_index):
    """Fit and export a TPOT emulator for one latitude/longitude grid cell.

    The cell is the ``random_index``-th distinct (lat, lon) pair of the
    module-level ``df_train``. Features come from the module-level
    ``X_train`` / ``X_test``; targets are the ``output`` column of the rows
    belonging to the cell.

    Args:
        random_index: position of the grid cell among the distinct pairs.

    Returns:
        A string reporting the r2 score on the test data.
    """
    grid_cells = df_train[['lat', 'lon']].drop_duplicates().values
    lat, lon = grid_cells[random_index]

    train_cell = df_train[(df_train.lat == lat) & (df_train.lon == lon)]
    test_cell = df_test[(df_test.lat == lat) & (df_test.lon == lon)]
    y_train = train_cell[output].values
    y_test = test_cell[output].values

    emulator = TPOTRegressor(generations=5,
                             population_size=50,
                             verbosity=2,
                             random_state=123,
                             use_dask=True,
                             n_jobs=-1,
                             scoring='r2',
                             config_dict=tpot_config,
                             cv=10)
    emulator.fit(X_train, y_train)

    export_path = (path + 'tpot_emulator_pipeline_' + output + '_' +
                   str(lat) + '_' + str(lon) + '.py')
    emulator.export(export_path)

    holdout_r2 = emulator.score(X_test, y_test)
    return f"test/holdout r2 = {holdout_r2:.4f}"
print(f"Failed setting training data: {e}") return return mm_training.training_df, mm_training.feature_column_list, mm_training.target_column_list feature_minutes_list = [1, 3, 5, 8, 11, 14, 18, 22, 30, 60, 120, 1440] features_df, feature_cols, target_col_list = features(feature_minutes_list) features_df = features_df[:-14] # Split for last 4.5 hours training and adjust for look ahead #X_train, y_train = features_df[-300:-20][feature_cols], features_df[-300:-20][target_col] #X_test, y_test = features_df[-10:][feature_cols], features_df[-10:][target_col] # Split for last x days training and adjust for look ahead days_training = 400 * -1440 hours_test = 120 * -60 X_train, y_train = features_df[days_training:( hours_test - 14)][feature_cols], features_df[days_training:(hours_test - 14)][target_col_list[0]] X_test, y_test = features_df[hours_test:][feature_cols], features_df[ hours_test:][target_col_list[0]] tpot = TPOTRegressor(generations=5, population_size=10, verbosity=2, n_jobs=-1) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export( f'tpot_{days_training/-1440}days_train_{hours_test/-60}hour_test_pipeline.py' )
# Rows after the training part of ``combi`` form the test set; they carry no
# usable target, so that column is removed. Dropping on the slice result
# (not in place) avoids modifying a view of ``combi``.
test = combi[train.shape[0]:].drop('Item_Outlet_Sales', axis=1)

## removing id variables
id_columns = ['Outlet_Identifier', 'Item_Type', 'Item_Identifier']
tpot_train = train.drop(id_columns, axis=1)
tpot_test = test.drop(id_columns, axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train.drop('Item_Outlet_Sales', axis=1, inplace=True)

# finally building model using tpot library
from tpot import TPOTRegressor

X_train, X_test, y_train, y_test = train_test_split(tpot_train,
                                                    target,
                                                    train_size=0.75,
                                                    test_size=0.25)
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export(data + 'tpot_boston_pipeline.py')

## predicting using tpot optimised pipeline
tpot_pred = tpot.predict(tpot_test)

# Build the submission positionally. ``test`` keeps the row labels it had
# inside ``combi``, so assigning its columns to a fresh 0..n-1 frame would
# align on labels and could leave the identifiers as NaN.
sub1 = pd.DataFrame(
    {
        'Item_Identifier': test['Item_Identifier'].values,
        'Outlet_Identifier': test['Outlet_Identifier'].values,
        'Item_Outlet_Sales': tpot_pred,
    },
    columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])
sub1.to_csv('tpot.csv', index=False)
# Voting ensemble: fit, predict (rounded to whole numbers) and report error.
ensemble2.fit(X_train2, y_train2)
predvot2 = ensemble2.predict(X_test2).round(0)
MSE6 = mse(y_test2, predvot2)
print("Average error on new number of hospitalizations per day:",
      round(MSE6**0.5, 0))
print(MSE6)
print('OK')

# TPOT search on the same split.
print("TPOTRegressor")
tpot = TPOTRegressor(generations=50,
                     population_size=50,
                     verbosity=2,
                     random_state=42)
tpot.fit(X_train2, y_train2)
print(tpot.score(X_test2, y_test2))
tpot.export('tpot_covid_pipeline.py')

# LSTM network: the feature tables get a trailing axis of length 1.
print("Neural Network")
n_train_rows, n_train_cols = X_train2.shape[0], X_train2.shape[1]
n_test_rows, n_test_cols = X_test2.shape[0], X_test2.shape[1]
X_trainNN = X_train2.values.reshape(n_train_rows, n_train_cols, 1)
y_trainNN = y_train2.values
X_testNN = X_test2.values.reshape(n_test_rows, n_test_cols, 1)
y_testNN = y_test2.values

NNmodel = Sequential()
#NNmodel.add(layers.Dense(215, input_shape=(X_trainNN.shape[0], X_trainNN.shape[1])))
first_lstm = layers.LSTM(units=22,
                         activation='tanh',
                         return_sequences=True,
                         input_shape=X_trainNN.shape[1:])
second_lstm = layers.LSTM(units=10, activation='tanh', return_sequences=False)
output_layer = layers.Dense(1, activation="linear")
NNmodel.add(first_lstm)
NNmodel.add(second_lstm)
NNmodel.add(output_layer)
return [positive_train_x, negative_train_x], [positive_train_y, negative_train_y], ["average_loan_positive", "average_loan_negative"] def split_data(X, Y): x_set, y_set, total_prefix = split_by_average_loan(X, Y) return x_set, y_set, total_prefix if __name__ == '__main__': X = pd.DataFrame() Y = pd.DataFrame() for i in range(10, 11): x = pd.DataFrame(pd.read_csv("../train/train_x_offline_{}.csv".format(i))) y = pd.DataFrame(pd.read_csv("../train/train_y_offline_{}.csv".format(i))) X = pd.concat([X, x]) Y = pd.concat([Y, y]) X.pop("uid") Y.pop("uid") X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=20, population_size=40, cv=2, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export("tpot_boston_pipeline_12.9.py")
def model_dev(train_set, matchups, spreads):
    """Fit a TPOT regressor on game-stat differentials and print spread picks.

    Side effects on the arguments: ``train_set`` has its ``result_spread``
    column renamed to ``class`` in place, and ``matchups`` has the ``week``,
    ``home_team`` and ``away_team`` columns dropped in place.

    Args:
        train_set: table with the eight differential columns and
            ``result_spread``.
        matchups: upcoming games; after the drop it is passed through the
            scaler fitted on the training features.
        spreads: values the predicted spreads are compared against.
    """
    # Create a sample set to pass into the machine learning algorithm
    feature_names = ['rush_attempt_diff', 'turn_diff', 'yards_diff',
                     'third_diff', 'sack_diff', 'sack_ydiff', 'poss_diff',
                     'p_attempt_diff']
    X = train_set[feature_names].copy()

    # Target vector: the spread result, exposed under the column name 'class'
    train_set.rename(columns={'result_spread': 'class'}, inplace=True)
    y = train_set['class']

    # Scale the sample data
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # Drop the local reference to the dataframe
    del train_set

    # Split out training and testing data sets
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.25, random_state=0)

    pipeline_optimizer = TPOTRegressor(generations=5,
                                       population_size=10,
                                       random_state=42,
                                       cv=5,
                                       verbosity=2,
                                       n_jobs=3)
    pipeline_optimizer.fit(X_train, y_train)
    print(pipeline_optimizer.score(X_test, y_test))
    pipeline_optimizer.export('NFL_ML_TPOT_Regressor.py')

    # Remove the 'week' 'home_team' and 'away_team' columns from matchups as
    # they are not used in the algorithm
    matchups.drop(['week', 'home_team', 'away_team'], axis=1, inplace=True)

    # An earlier experiment lived here as an inert string literal: a
    # LogisticRegression/RFE grid over C and feature count that converted
    # win probabilities to spreads. It was never executed.

    scaled_matchups = scaler.transform(matchups)
    predicted_spreads = pd.DataFrame(
        pipeline_optimizer.predict(scaled_matchups), columns=['results'])

    # 0 where the predicted spread exceeds the given spread, otherwise 1
    bet_vector = np.array(np.where(predicted_spreads > spreads, 0, 1))

    print(spreads)
    print(predicted_spreads)
    print(bet_vector)
from tpot import TPOTRegressor

# import data
from olist.order import Order
from olist.data import Olist

data = Olist().get_data()
training_orders = Order().get_training_data()

# Estimated wait in days: estimated delivery date minus purchase timestamp.
orders = data['olist_orders_dataset']
estimated_delivery = pd.to_datetime(orders['order_estimated_delivery_date'])
purchased_at = pd.to_datetime(orders['order_purchase_timestamp'])
orders['estimate_wait_time'] = \
    (estimated_delivery - purchased_at) / np.timedelta64(24, 'h')

# Attach the estimate to the training table by order id.
estimate_by_order = orders[['estimate_wait_time', 'order_id']]
training_orders = training_orders.merge(estimate_by_order, on='order_id')

non_feature_columns = ['order_id', 'wait_time', 'delay_vs_expected']
X = training_orders.drop(non_feature_columns, axis=1)
y = training_orders['wait_time']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_boston_pipeline.py')
# Data Extraction
df = data_extract_e('e_20190609_15.pkl')

# Data Transformation and Engineering
df = extract_queues(feature_eng(df))
dept_encoder, queue_encoder = fit_labels(df)
df = feature_transform(df, queue_encoder, dept_encoder)

# Training/Test Split
x, y = data_filter(df)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2468)

# Using TPOT AutoML, restricted to the XGBoost configuration
tpot = TPOTRegressor(n_jobs=-1,
                     verbosity=1,
                     config_dict=xgb_config.xgb_config_dict)
tpot = tpot.fit(x_train, y_train)

# Training metrics
y_pred = tpot.predict(x_train)
train_r2 = r2_score(y_train, y_pred)
print('XGB TPOT training R2 score: ', train_r2)
print('XGB TPOT training negative MSE: ', tpot.score(x_train, y_train))

# Test metrics
y_pred = tpot.predict(x_test)
test_r2 = r2_score(y_test, y_pred)
print('XGB TPOT test R2 score: ', test_r2)
print('XGB TPOT test negative MSE: ', tpot.score(x_test, y_test))

tpot.export('xgb_tpot.py')
import numpy as np
import pandas as pd
# example of tpot for the insurance regression dataset
from pandas import read_csv
from sklearn.model_selection import RepeatedKFold
from tpot import TPOTRegressor

# load dataset and discard incomplete rows
df = pd.read_csv('C:/Users/amr_r/Desktop/civil/DATASET/new3out2.csv')
df.dropna(inplace=True)
df.describe()

# first four columns are the inputs, the fifth is the value to predict
X = df.iloc[:, 0:4]
y = df.iloc[:, 4]

# define evaluation procedure: 10-fold cross-validation repeated 3 times
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search
search_options = dict(generations=10,
                      population_size=50,
                      scoring='neg_mean_absolute_error',
                      cv=cv,
                      verbosity=2,
                      random_state=1,
                      n_jobs=-1)
model = TPOTRegressor(**search_options)

# perform the search
model.fit(X, y)

# export the best model
model.export('tpot_insurance_best_model.py')
# ``scipy.stats.stats`` is a deprecated private namespace; the public
# location of pearsonr is ``scipy.stats``.
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# =============================================================================
# Load and clean the raw data
# =============================================================================
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")

train = autoclean(train_data)
test = autoclean(test_data)

# =============================================================================
# Hold-out split and TPOT search
# =============================================================================
# train_test_split takes the features first and the target second. The
# arguments used to be swapped, which made TPOT try to predict the whole
# feature table from SalePrice.
features = train.drop('SalePrice', axis=1)
sale_price = train.SalePrice
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    sale_price,
                                                    train_size=0.75,
                                                    test_size=0.25)

tpot = TPOTRegressor(generations=5,
                     population_size=20,
                     verbosity=2,
                     config_dict='TPOT light')
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')
plt.show()

# %% tpot
testSL = to_supervised(test, n_input, n_outputs)
trainSL = to_supervised(train, n_input, n_outputs)

# Flatten the last two axes of each input array in place, so every sample
# becomes a single row.
for windows in (testSL[0], trainSL[0]):
    windows.shape = (windows.shape[0], windows.shape[1] * windows.shape[2])

X_train, y_train = trainSL
X_test, y_test = testSL

tpot = TPOTRegressor(generations=20,
                     population_size=100,
                     verbosity=2,
                     random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export("tpot_boston_pipeline.py")

# %% Plot
predictions = tpot.predict(X_test)
# First column of the last len(predictions) rows of the test data.
actual = np.array(test)[-predictions.shape[0]:, 0]

plt.plot(np.squeeze(predictions), label="Predictions")
plt.plot(actual, label="dlGDP_csa")
plt.title("dlGDP forecasts")
plt.ylabel("dlGDP_csa")
plt.xlabel("Quarter")
plt.legend(loc="upper left")
plt.show()