def Model(Encoding, Scores, Run_name, step_size, loop_dict, var_dict, round_data, ElasticNet_dict, l1_ratios, All_data):
    """Run 5-fold cross-validation: amplitude-tune the encoding scores on each
    training split, then fit ElasticNetCV on the tuned features and record the
    held-out Pearson correlation for each fold.

    Parameters (types inferred from usage — confirm against callers):
        Encoding: DataFrame of encoded features; last column is the target 'pMeas'.
        Scores: dict mapping encoded tokens to (initially randomized) numeric scores.
        Run_name: key under which results are stored in All_data and passed to tuning.
        step_size: step size forwarded to Amplitude_Tuning.
        loop_dict, var_dict, round_data, ElasticNet_dict, All_data: shared result
            dicts that are MUTATED IN PLACE as a side effect of this function.
        l1_ratios: l1_ratio grid for ElasticNetCV.

    Returns:
        float: mean Pearson correlation across the 5 CV folds.

    Side effects:
        Mutates the dicts listed above and writes "All Data.npy" to the CWD.
    """
    Pearson_correlations = []
    Data = Encoding.copy()  # copy, so the caller's Encoding does not change
    Data_sets = CV_split(Data, 5)  # split into 5 CV folds ("The Big 5")
    for cv_round in range(len(Data_sets)):
        score_dict = Scores.copy()  # reset to the randomized scores at the start of each fold
        Test_set = Data_sets[cv_round]
        Train_set = exclude(Data_sets, cv_round)  # keeps every fold except the held-out one
        Train_set = pd.concat(Train_set)  # all training folds into one dataframe
        X = Train_set.iloc[:, :Train_set.shape[1] - 1]  # features: every column but the last
        X['Intercept'] = 1  # add intercept column (NOTE(review): X is a slice of Train_set; pandas may emit SettingWithCopyWarning here)
        y = pd.DataFrame(Train_set['pMeas'])  # targets — presumably the last column; verify schema
        AM_EndOfLoopError = []
        AM_EndOfLoopError.append(Get_Error(X, y, score_dict))  # the error before AM tuning
        """AM Tuning Looping Starts Here and Adds a value to End of Loop Error"""
        Loop_num = 1
        AM_EndOfLoopError.append(Amplitude_Tuning(X, y, step_size, score_dict, Loop_num, Run_name, cv_round, loop_dict, var_dict))
        round_data[cv_round] = loop_dict
        # Keep tuning while the latest relative error improvement exceeds 0.1%
        # (the ratio is negative when error decreased).
        while ((AM_EndOfLoopError[-1] - AM_EndOfLoopError[-2]) / (AM_EndOfLoopError[-2])) < -0.001:
            Loop_num += 1
            AM_EndOfLoopError.append(Amplitude_Tuning(X, y, step_size, score_dict, Loop_num, Run_name, cv_round, loop_dict, var_dict))
            # NOTE(review): every fold stores the SAME loop_dict object, so all
            # entries of round_data alias one dict and end up identical after the
            # last fold — confirm this is intended (a deep copy may be wanted).
            round_data[cv_round] = loop_dict
        loop_dict['AM Time Series Data'] = AM_EndOfLoopError
        loop_dict['Final Scores'] = score_dict
        """ AM Tuning is now Finished for the CV_split, Elastic Net is Next """
        # NOTE(review): normalize= was deprecated in scikit-learn 0.24 and removed
        # in 1.2 — this call requires an older sklearn (or a StandardScaler pipeline).
        EN = ElasticNetCV(l1_ratio=l1_ratios, cv=5, copy_X=True, normalize=True, random_state=23)
        X_train = X.copy()
        X_train.replace(score_dict, inplace=True)  # map encoded tokens to their tuned numeric scores
        y_train = y.copy()
        X_test = Test_set.iloc[:, :Test_set.shape[1] - 1]
        X_test.replace(score_dict, inplace=True)
        X_test['Intercept'] = 1
        y_test = pd.DataFrame(Test_set['pMeas'])
        EN.fit(X_train, y_train)
        y_pred = pd.DataFrame(EN.predict(X_test))
        # Pearson correlation between held-out targets and predictions
        Pearson_correlations.append(np.corrcoef(y_test.T, y_pred.T)[0][1])
        """Save Everything """
        ElasticNet_dict["y_pred"] = y_pred
        ElasticNet_dict['y_test'] = y_test
        ElasticNet_dict['Alpha'] = EN.alpha_
        ElasticNet_dict['l1_ratio'] = EN.l1_ratio_
        ElasticNet_dict['Parameters'] = EN.get_params()
        ElasticNet_dict["AlphaSpace"] = EN.alphas_
        loop_dict['ElasticNet'] = ElasticNet_dict
        round_data[cv_round] = loop_dict
    All_data[Run_name] = round_data
    np.save("All Data.npy", All_data)  # pickles the nested dict (np.save allows pickle by default)
    return np.mean(Pearson_correlations)
def train(self, cvs, init_params=None, FS=False, inner_jobs=1):
    """Train the estimator selected by ``self.model_type``.

    For an 'elasticnet' model type, fit sklearn's ElasticNetCV directly;
    otherwise run an evolutionary hyper-parameter search
    (EvolutionaryAlgorithmSearchCV) over a model-type-specific grid, then
    refit the best configuration on all of the data.

    Parameters:
        cvs: list with one CV split — cvs[0] is the 6-tuple
            (X_train, y_train, X_val, y_val, X_test, y_test).
        init_params: optional list of initial parameter sets for the
            evolutionary search. Default is an empty list (was a mutable
            default argument ``[]`` — fixed to a ``None`` sentinel).
        FS: feature-selection mode; for 'rf' uses a small, fast grid.
        inner_jobs: unused — kept for backward compatibility with callers.

    Returns:
        dict: ``self.to_dict()``.

    Raises:
        ValueError: if ``self.model_type`` matches no known estimator
            (previously this fell through and crashed later with NameError).
    """
    if init_params is None:  # fix: avoid shared mutable default
        init_params = []
    print('training with deap...')
    # Stack train/val/test into one design matrix for the final refit.
    X = np.vstack((cvs[0][0], cvs[0][2], cvs[0][4]))
    if len(cvs[0][1].shape) == 1 and len(cvs[0][5].shape) == 1:
        y = np.hstack((cvs[0][1], cvs[0][3], cvs[0][5]))
    else:
        y = np.vstack((cvs[0][1], cvs[0][3], cvs[0][5])).ravel()
    self.D, self.N = X.shape
    model_type = self.model_type.lower()  # hoisted; identical to str.lower(...)
    if 'elasticnet' in model_type:
        X_train = cvs[0][0]
        y_train = cvs[0][1].reshape(-1, 1)
        X_val = cvs[0][2]
        y_val = cvs[0][3].reshape(-1, 1)
        X_test = cvs[0][4]
        y_test = cvs[0][5].reshape(-1, 1)
        # NOTE(review): the test fold is stacked into the training data, so the
        # accuracy computed below is measured on data the model was fit on —
        # confirm this leakage is intentional.
        X_train = np.vstack((X_train, X_val, X_test))
        y_train = np.vstack((y_train, y_val, y_test))
        model = ElasticNetCV(cv=5, max_iter=4000)
        model.fit(X_train, y_train.ravel())
        self.best_params = model.get_params()
        ypred = model.predict(X_test).ravel()
        if self.rated is None:
            # relative (MAPE-style) error when no rated capacity is given
            self.accuracy = np.mean(np.abs(ypred - y_test.ravel()) / y_test.ravel())
        else:
            self.accuracy = np.mean(np.abs(ypred - y_test.ravel()))
        self.acc_test = self.accuracy
        self.model = model
        self.logger.info('Best params')
        self.logger.info(self.best_params)
        self.logger.info('Final mae %s', str(self.acc_test))
        self.logger.info('Final rms %s', str(self.accuracy))
        self.logger.info('finish train for model %s', self.model_type)
        self.istrained = True
        self.save(self.model_dir)
        return self.to_dict()
    # Build the search grid and base estimator for the evolutionary search.
    if 'xgb' in model_type:
        params = {'learning_rate': np.logspace(-5, -1, num=6, base=10),
                  'max_depth': np.unique(np.linspace(1, 150, num=50).astype('int')),
                  'colsample_bytree': np.linspace(0.4, 1.0, num=60),
                  'colsample_bynode': np.linspace(0.4, 1.0, num=60),
                  'subsample': np.linspace(0.2, 1.0, num=6),
                  'gamma': np.linspace(0.001, 2, num=20),
                  'reg_alpha': np.linspace(0, 1.0, num=12)}
        model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
        ngen = self.static_data['sklearn']['gen']
        npop = self.static_data['sklearn']['pop']
    elif 'rf' in model_type:
        if FS:
            # small grid + few generations for fast feature-selection runs
            params = {
                'max_depth': [1, 2, 3, 5, 10, 16, 24, 36, 52, 76, 96, 128, 150],
            }
            model = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42, max_features=2 / 3)
            ngen = 2
            npop = 4
        else:
            params = {
                'max_depth': np.unique(np.linspace(1, 130, num=50).astype('int')),
                'max_features': ['auto', 'sqrt', 'log2', None, 0.8, 0.6, 0.4],
                'min_samples_leaf': np.unique(np.linspace(1, 20, num=20).astype('int')),
                'min_samples_split': np.unique(np.linspace(2, 100, num=20).astype('int')),
            }
            model = RandomForestRegressor(n_estimators=500, random_state=42)
            ngen = self.static_data['sklearn']['gen']
            npop = self.static_data['sklearn']['pop']
    elif model_type == 'svm':
        params = {'C': np.logspace(-2, 3, num=100, base=10),
                  'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                  'gamma': list(np.linspace(0.001, 2, num=100)) + ['scale', 'auto']}
        model = SVR(max_iter=1000000)
        ngen = self.static_data['sklearn']['gen']
        npop = self.static_data['sklearn']['pop']
    elif model_type == 'nusvm':
        params = {'nu': np.linspace(0.01, 0.99, num=10),
                  'C': np.logspace(-1, 5, num=100, base=10),
                  'gamma': np.linspace(0.01, 10, num=100)}
        model = NuSVR(max_iter=1000000)
        ngen = self.static_data['sklearn']['gen']
        npop = self.static_data['sklearn']['pop']
    elif 'mlp' in model_type:
        if not self.is_combine:
            params = {'hidden_layer_sizes': np.linspace(4, 800, num=50).astype('int'),
                      'alpha': np.linspace(1e-5, 1e-1, num=4),
                      }
        else:
            params = {'hidden_layer_sizes': np.linspace(4, 250, num=50).astype('int'),
                      'activation': ['identity', 'tanh', 'relu'],
                      'alpha': np.linspace(1e-5, 1e-1, num=4),
                      }
        model = MLPRegressor(max_iter=1000, early_stopping=True)
        ngen = 5
        npop = self.static_data['sklearn']['pop']
    else:
        # fix: an unrecognized model_type previously fell through and raised
        # NameError on the undefined `model`/`params` below.
        raise ValueError('unknown model_type: %s' % self.model_type)
    if self.path_group is not None:  # fix: was "not self.path_group is None"
        # derive the CPU budget from the group's bookkeeping files
        ncpus = joblib.load(os.path.join(self.path_group, 'total_cpus.pickle'))
        gpu_status = joblib.load(os.path.join(self.path_group, 'gpu_status.pickle'))
        njobs = int(ncpus - gpu_status)
        cpu_status = njobs
        joblib.dump(cpu_status, os.path.join(self.path_group, 'cpu_status.pickle'))
    else:
        njobs = self.njobs
    cv = EvolutionaryAlgorithmSearchCV(estimator=model, params=params,
                                       scoring='neg_root_mean_squared_error', cv=3,
                                       rated=self.rated, verbose=1,
                                       population_size=npop,
                                       gene_mutation_prob=0.8, gene_crossover_prob=0.8,
                                       tournament_size=3, generations_number=ngen,
                                       refit=False, init_params=init_params,
                                       n_jobs=njobs, path_group=self.path_group)
    cv.fit(cvs)
    self.best_params = cv.best_params_
    self.accuracy, self.acc_test = self.fit_model1(model, self.best_params, cvs)
    self.model = model
    self.model.set_params(**self.best_params)
    self.model.fit(X, y.ravel())  # final refit on all stacked data
    self.logger.info('Best params')
    self.logger.info(self.best_params)
    self.logger.info('Final mae %s', str(self.acc_test))
    self.logger.info('Final rms %s', str(self.accuracy))
    self.logger.info('finish train for model %s', self.model_type)
    self.istrained = True
    self.save(self.model_dir)
    return self.to_dict()
class ElasticNet:
    """Wrapper for SciKitLearn linear_model.ElasticNetCV to help with model optimization.

    On construction this splits the samples into train/test sets, fits
    ElasticNetCV, collects model statistics, and writes the model plus a text
    report to disk (the whole pipeline runs from ``__init__`` via ``run``).
    """

    def __init__(self, x='numpy_array', y='predictor', sample_labels=None, test_split=.2,
                 sk_elastic_net_kwargs=None, regression_site_labels=None, test_samples=None,
                 output_name=None, output_directory=None):
        """Store inputs and immediately run the fitting pipeline.

        Args:
            x: feature rows (iterable of arrays), one row per sample.
            y: outcome values, aligned with ``x``.
            sample_labels: list of sample identifiers, aligned with ``x``.
            test_split: fraction of samples held out for testing (0 disables).
            sk_elastic_net_kwargs: dict of kwargs forwarded to ElasticNetCV.
            regression_site_labels: list of feature/site names, aligned with coef_.
            test_samples: explicit list of sample labels to hold out (overrides
                the random split).
            output_name: base name for the saved model and report files.
            output_directory: directory prefix for output (concatenated as-is).
        """
        assert isinstance(sk_elastic_net_kwargs, dict)
        assert isinstance(sample_labels, list)
        assert isinstance(regression_site_labels, list)
        self.x = x
        self.y = y
        self.test_split = test_split
        self.en_kwargs = sk_elastic_net_kwargs
        # test_container / train_container layout: [array, outcomes, labels]
        self.test_container = [[], [], []]
        self.train_container = [[], [], []]
        self.en_model = None
        self.sample_labels = sample_labels
        self.regression_site_labels = regression_site_labels
        self.model_stats = None
        self.input_test_samples = test_samples
        self.output_name = output_name
        self.output_directory = output_directory
        self.run()

    def run(self):
        """Drive the pipeline: split, fit, score, write output."""
        self.set_test_samples()
        if self.input_test_samples:
            # reported split value when explicit test samples were supplied
            self.test_split = 0.1
        if self.test_split == 0:
            # no held-out data: score the model on the training set itself
            self.test_container = self.train_container
        self.fit_model()
        self.get_model_stats()
        self.model_output()

    def set_test_samples(self):
        """Partition (x, y, labels) into test and train containers.

        Uses the explicit ``test_samples`` list when given, otherwise draws a
        random sample of size ``round(len(y) * test_split)``.
        """
        if self.input_test_samples:
            test_samples = self.input_test_samples
        else:
            test_size = int(round(len(self.y) * self.test_split, 0))
            test_samples = random.sample(self.sample_labels, test_size)
        # container layout: [array, outcomes, labels]
        for count, info in enumerate(zip(self.x, self.y, self.sample_labels)):
            if info[2] in test_samples:
                self.test_container[0].append(info[0])
                self.test_container[1].append(info[1])
                self.test_container[2].append(info[2])
            else:
                self.train_container[0].append(info[0])
                self.train_container[1].append(info[1])
                self.train_container[2].append(info[2])
        # convert feature/outcome lists to arrays (labels stay plain lists)
        self.test_container[0] = np.asarray(self.test_container[0])
        self.test_container[1] = np.asarray(self.test_container[1])
        self.train_container[0] = np.asarray(self.train_container[0])
        self.train_container[1] = np.asarray(self.train_container[1])

    def fit_model(self):
        """Fit ElasticNetCV on the training container."""
        self.en_model = ElasticNetCV(**self.en_kwargs).fit(self.train_container[0], self.train_container[1])

    def get_model_stats(self):
        """Collect (non-zero-coefficient sites, test R^2, test predictions)."""
        regression_sites = []
        for site in zip(self.regression_site_labels, list(self.en_model.coef_)):
            if not math.isclose(site[1], 0):  # keep sites the model actually uses
                regression_sites.append(site[0])
        model_score = self.en_model.score(self.test_container[0], self.test_container[1])
        predicted_values = self.en_model.predict(self.test_container[0])  # fix: was 'predited_values'
        self.model_stats = (regression_sites, model_score, predicted_values)

    def model_output(self):
        """Dump the fitted model and write a tab-separated text report."""
        kwarg_pair = ['Test Split:%s\n' % str(self.test_split)]
        for key, value in self.en_model.get_params().items():
            kwarg_pair.append('%s:%s' % (key, str(value)))
        output_path = '%s%s' % (self.output_directory, self.output_name)
        joblib.dump(self.en_model, output_path + '.model')
        # fix: context manager guarantees the report file is closed even if a
        # write raises (previously open()/close() with no try/finally).
        with open(output_path + '.model_info.txt', 'w') as out:
            out.write('%s\n' % self.output_name)
            out.write('Model Score (R^2) = %s\n' % str(self.model_stats[1]))
            out.write('%s\n' % '\t'.join(kwarg_pair))
            out.write('Test Samples \t%s\n' % '\t'.join(stc(self.test_container[2])))
            out.write('Test Samples Predicted Values \t%s\n' % '\t'.join(stc(self.model_stats[2])))
            out.write('Test Samples Actual Values \t%s\n' % '\t'.join(stc(self.test_container[1])))
            out.write('Training Samples \t%s\n' % '\t'.join(stc(self.train_container[2])))
            out.write('Training Samples Actual Values \t%s\n' % '\t'.join(stc(self.train_container[1])))
            out.write('Regression Sites \t%s\n' % '\t'.join(stc(self.model_stats[0])))
y = pd.read_csv(y_train_path, index_col=False) y = y.iloc[:, 0] #%% # feature selection affected linear model greatly so lets automate this process # by using instead of the normal LinearRegression library lets use ElasticNet # that combines L1 and L2 regularization. This works as a kind of automatic feature # selection # lets start by optimizing the parameters in crossvalidation. There is a separate # function for this package that does this more efficiently glmnet = ElasticNetCV(cv=70, random_state=seed) glmnet.fit(X, y) glmnet_best_params = glmnet.get_params() #%% # Defining the method for crossvalidation. We crossvalidate each individual row crossvalidation = KFold(n_splits=70, shuffle=True, random_state=seed) # Defining list of scoring methods scoring = ["neg_mean_squared_error", "neg_mean_absolute_error"] #%% glmnet_model = ElasticNet() glmnet_best_params_matching = { key: glmnet_best_params[key] for key in glmnet_model.get_params().keys() if key in glmnet_best_params