def test_main():
    """Run a hyperopt search over the nnet objective using a MongoDB-backed
    trials store, then scatter-plot the recorded losses per hyper-parameter.

    NOTE(review): relies on module-level globals (test_build_space, TEST_LOSS,
    TEST_N_FLT, TEST_LR, TEST_NONLIN, TEST_DROPOUT, np) defined elsewhere in
    this module -- confirm they exist before running.
    """
    from hyperopt import fmin, tpe, rand, anneal, mix, partial
    from hyperopt.mongoexp import MongoTrials

    trials = MongoTrials('mongo://131.220.7.92/test/jobs', exp_key='test_nnet')
    # note that n_startup_jobs is related to gamma, the fraction of the "good"
    # jobs. If gamma=.25, the default, then after the startup phase of 20 jobs,
    # 5 are used to build the model.
    from cuvnet.model_selection.test_nnet_objective import objective
    best = fmin(fn=objective,
                space=test_build_space(),
                trials=trials,
                algo=partial(mix.suggest,
                             p_suggest=[
                                 (.0, rand.suggest),
                                 (1., anneal.suggest),
                                 (0., partial(tpe.suggest,
                                              prior_weight=1.0,  # default is 1.0
                                              n_startup_jobs=20))]),  # default is 20
                max_evals=200)
    # bug fix: the original used a Python 2 print statement, which is a syntax
    # error under Python 3 (other parts of this file already use f-strings).
    print("best: ", best, min(TEST_LOSS))

    import matplotlib.pyplot as plt
    fig, (ax0, ax1, ax2, ax3) = plt.subplots(4, 1)
    c = np.arange(len(TEST_LOSS))  # color points by trial order
    ax0.scatter(TEST_N_FLT, TEST_LOSS, c=c)
    ax0.set_title("hidden layer size")
    ax1.scatter(np.log(TEST_LR), TEST_LOSS, c=c)
    ax1.set_title("learnrate")
    im = ax2.scatter([{"tanh": 0, "rectified_linear": 1}[n] for n in TEST_NONLIN],
                     TEST_LOSS, c=c)
    ax2.set_title("tanh/relu")
    im = ax3.scatter(TEST_DROPOUT, TEST_LOSS, c=c)
    ax3.set_title("dropout")
    fig.colorbar(im)
    plt.show()
def slm_visitor_lfw_partial(
    max_n_per_class,
    maybe_test_view2=False,  # -- this still takes too much memory
    assume_promising=False,
    foobar_trace_target=None,
):
    """Curry hpconvnet.lfw.slm_visitor_lfw with the given settings.

    Re-decorates the visitor so the resulting callable can be handed
    directly to fmin().
    """
    coerced = None if max_n_per_class is None else int(max_n_per_class)
    return hyperopt.partial(
        hpconvnet.lfw.slm_visitor_lfw,
        max_n_per_class=coerced,
        maybe_test_view2=maybe_test_view2,
        assume_promising=assume_promising,
        foobar_trace_target=foobar_trace_target,
    )
def small_random_run():
    """Smoke test exercising many code paths on a tiny random search.

    Some jobs will fail, some should succeed; the data is loaded and some
    SVMs are fit. Classifier performance is expected to be poor (70% error?)
    because only 10% of the data and a few random architectures are tried.
    Expected running time on CPU: ~10 mins.
    """
    space = hpconvnet.cifar10.build_search_space(
        max_n_features=4500,                  # -- smaller than normal
        bagging_fraction=0.5,                 # -- normal
        n_unsup=2000,                         # -- smaller than normal
        abort_on_rows_larger_than=50 * 1000,  # -- smaller
    )
    eval_fn = hyperopt.partial(
        hpconvnet.cifar10.uslm_eval,
        data_fraction=0.1)  # -- smaller than normal
    run_trials = Trials()
    hyperopt.fmin(
        fn=eval_fn,
        space=space,
        algo=hyperopt.rand.suggest,
        max_evals=10,
        trials=run_trials,
    )
def ks_best(self, space):
    """Tune the model with TPE over *space*, refit the best configuration and
    collect KS/AUC metrics on the train, test and future datasets.

    :param space: user-facing search-space spec, converted via self.f_spacehp
    :return: (best_params, best_model, best_results) where best_results maps
        '{train,test,future}_{auc,ks}' to a score, or None when scoring fails
        (e.g. a dataset is absent from self.data_sets).
    """
    space_4model = self.f_spacehp(space)
    test4best = fmin(self.ks, space_4model,
                     algo=partial(tpe.suggest, n_startup_jobs=1),
                     max_evals=100, trials=self.trials)
    best_params = self.f_bestparam(test4best, space)
    best_model = self.model(**best_params, random_state=self.random_state)
    best_model.fit(self.data_sets['train_x'], self.data_sets['train_y'])

    def _safe_score(scorer, x_key, y_key):
        # Best-effort scoring: a missing dataset or scorer failure yields None.
        # The original used six duplicated bare ``except:`` blocks; narrowed to
        # Exception so KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            return scorer(self.data_sets[x_key], self.data_sets[y_key],
                          best_model)
        except Exception:
            return None

    best_results = {
        'train_auc': _safe_score(auc_results, 'train_x', 'train_y'),
        'test_auc': _safe_score(auc_results, 'test_x', 'test_y'),
        'future_auc': _safe_score(auc_results, 'data_future', 'target_future'),
        'train_ks': _safe_score(ks_results, 'train_x', 'train_y'),
        'test_ks': _safe_score(ks_results, 'test_x', 'test_y'),
        'future_ks': _safe_score(ks_results, 'data_future', 'target_future'),
    }
    return best_params, best_model, best_results
def tune_hyper_parameter(stock_name, df, config):
    """Run a TPE search over config['param_grid'] and persist the best
    parameters to <model_dir>/<stock_name>_param.txt as JSON."""
    # Tune on the most recent 365 days only
    df = df[-365:]
    max_evals = config.get('bayer_max_evals', 1000)
    search_space = config.get('param_grid')  # Hyperparameter grid
    trials = Trials()
    # Bind the data frame into the objective and optimize with TPE
    best = fmin(fn=partial(objective, df=df),
                space=search_space,
                algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)
    param_file_path = os.path.join(config['model_dir'],
                                   '%s_param.txt' % (stock_name))
    with open(param_file_path, 'w') as outfile:
        json.dump(best, outfile)
def train_model(total_data, bus_data, user_data):
    """Train an XGBoost purchase-propensity classifier.

    Pipeline: derive the binary label, down-sample negatives, drop
    uninformative / high-missing columns, label-encode categorical features,
    tune XGBoost with hyperopt TPE, retrain on the best parameters and
    persist the model plus the encoding dictionaries.

    NOTE(review): relies on module-level names (category_columns,
    continues_columns, model_path, data_path, print_to_log, pd, xgboost,
    metrics, hp, tpe, partial, Trials, fmin, STATUS_OK, pickle, joblib) --
    confirm they are defined in this module.

    :param total_data: joined customer/business frame used for training
    :param bus_data: business feature frame (encoded and saved as side effect)
    :param user_data: employee feature frame (encoded and saved as side effect)
    """
    # -- derive the label and drop identifying / leaky columns
    print_to_log('==========数据预处理==============')
    total_data['label'] = total_data['product_amount'].apply(
        lambda x: 1 if x >= 0 else 0)
    total_data.drop(['customer_id', 'login_name', 'product_amount',
                     'salary_score', 'followStart', 'followEnd', 'content',
                     'post'], axis=1, inplace=True)
    total_data_po = total_data.loc[total_data['label'] == 1]
    total_data_ne = total_data.loc[total_data['label'] == 0]
    # Keep a fixed negative:positive ratio (5:1 here -- the original comment
    # claimed 10:1, but the code multiplies by 5)
    samples = total_data_po.shape[0] * 5
    total_data_ne_sample = total_data_ne.sample(samples)
    # Rebuild the training frame and shuffle it
    total_data_sample = pd.concat((total_data_ne_sample, total_data_po), axis=0)
    total_data_sample = total_data_sample.sample(frac=1)

    # Drop a categorical column when a single value covers >= 95% of rows
    remove = set()
    miss_value = []
    print_to_log('处理类别型变量')
    for k in category_columns:
        if k in total_data_sample.keys():
            counts = total_data_sample[k].value_counts()
            total_num = sum(counts)
            for v in counts:
                if v / total_num >= 0.95:
                    print_to_log(k)
                    print_to_log(counts)
                    remove.add(k)
        else:
            miss_value.append(k)

    # Drop a continuous column when it is constant, or when its min-max
    # scaled variance falls below 0.003
    print_to_log('处理连续型变量')
    for k in continues_columns:
        if k in total_data_sample.keys():
            total_data_sample[k] = total_data_sample[k].astype('float')
            button = total_data_sample[k].min()
            top = total_data_sample[k].max()
            if button == top:
                remove.add(k)
                print_to_log(k, 0)
            else:
                var = total_data_sample[k].apply(
                    lambda x: (x - button) / (top - button)).var()
                if var < 0.003:
                    remove.add(k)
                    print_to_log(k, var)
        else:
            miss_value.append(k)

    # Drop columns whose missing-rate exceeds 80%
    print_to_log('处理缺失率较大的变量')
    for k in total_data_sample.keys():
        miss = total_data[k].isna().sum() / total_data.shape[0]
        if miss > 0.8:
            remove.add(k)
            print_to_log(k, ':', miss)

    # Drop business locality / lifecycle columns
    remove.add('placeCode')
    remove.add('businessOperate')
    remove.add('businessStage')
    remove.add('businessStatus')

    # Record which removed columns belong to which frame
    user_remove = []
    bus_remove = []
    for k in remove:
        if k in user_data.keys():
            user_remove.append(k)
        else:
            bus_remove.append(k)
    total_data_sample.drop(list(remove), axis=1, inplace=True)
    user_data.drop(user_remove, axis=1, inplace=True)
    bus_data.drop(bus_remove, axis=1, inplace=True)

    # Add a cold-start row of neutral values for unseen users
    def cold_start_decode(k):
        if k in continues_columns:
            return 0
        elif k in category_columns:
            return 'None'
        elif k == 'customer_id':
            return -1
        else:
            return 'None'

    cold_start = {k: cold_start_decode(k) for k in user_data.keys()}
    # Bug fix: the original called user_data.append(...) and discarded the
    # returned frame, so the cold-start row was never actually added.
    # DataFrame.append is also removed in pandas >= 2.0, hence pd.concat.
    user_data = pd.concat([user_data, pd.DataFrame([cold_start])],
                          ignore_index=True)

    if len(miss_value) > 0:
        print_to_log('数据字段缺失,请检查数据库', miss_value)

    # Fill continuous NaNs with 0 and categorical NaNs with the string 'None'
    def fillna(df):
        for k in df.keys():
            if k in continues_columns:
                df[k] = df[k].fillna(0)
            else:
                df[k] = df[k].fillna('None')

    fillna(total_data_sample)
    fillna(user_data)
    fillna(bus_data)

    # Record min/max of continuous columns and label-encode categorical ones,
    # keeping the dictionaries so the encoding can be replayed at serving time
    continues_dict = {}
    category_dict = {}
    for k in bus_data.keys():
        if k in continues_columns:
            continues_dict[k] = (bus_data[k].max(), bus_data[k].min())
        elif k in category_columns:
            bus_data[k] = bus_data[k].apply(lambda x: str(x).split('.')[0])
            category_dict[k] = list(bus_data[k].unique())
            if 'None' not in category_dict[k]:
                category_dict[k].append('None')
            bus_data[k] = bus_data[k].apply(
                lambda x: category_dict[k].index(x))
    for k in user_data.keys():
        if k in continues_columns:
            continues_dict[k] = (user_data[k].max(), user_data[k].min())
        elif k in category_columns:
            user_data[k] = user_data[k].apply(lambda x: str(x).split('.')[0])
            category_dict[k] = list(user_data[k].unique())
            if 'None' not in category_dict[k]:
                category_dict[k].append('None')
            user_data[k] = user_data[k].apply(
                lambda x: category_dict[k].index(x))

    # Label-encode the categorical columns of the training frame with the
    # dictionaries built above (one-hot variant left disabled upstream)
    for k in total_data_sample.keys():
        if k in category_dict.keys():
            total_data_sample[k] = total_data_sample[k].apply(
                lambda x: category_dict[k].index(str(x).split('.')[0]))

    # -- 80/20 train/test split (frame was already shuffled above)
    label = total_data_sample['label']
    total_data_sample.drop('label', axis=1, inplace=True)
    lenth = total_data_sample.shape[0]
    train_data = total_data_sample[:int(0.8 * lenth)]
    train_label = label[:int(0.8 * lenth)]
    test_data = total_data_sample[int(0.8 * lenth):]
    test_label = label[int(0.8 * lenth):]
    print_to_log('==========模型训练==============')
    print(train_data.info())
    print(train_data.head())

    def objective(space):
        # hyperopt objective: minimise 1 - F1 on the held-out split
        model = xgboost.XGBClassifier(
            max_depth=int(space['max_depth']),
            n_estimators=int(space['n_estimators']),
            subsample=space['subsample'],
            colsample_bytree=space['colsample_bytree'],
            learning_rate=space['learning_rate'],
            reg_alpha=space['reg_alpha'],
            nthread=4
        )
        model.fit(train_data, train_label)
        score = metrics.f1_score(test_label, model.predict(test_data))
        print_to_log('score: {}'.format(score))
        return {'loss': 1 - score, 'status': STATUS_OK}

    space = {
        'max_depth': hp.quniform('max_depth', 2, 20, 1),
        'n_estimators': hp.quniform('n_estimators', 100, 500, 1),
        'subsample': hp.uniform('subsample', 0.8, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
        'reg_alpha': hp.uniform('reg_alpha', 0.1, 1),
    }
    algo = partial(tpe.suggest, n_startup_jobs=4)
    trials = Trials()
    best = fmin(fn=objective, space=space, algo=algo, max_evals=20,
                trials=trials)
    print_to_log(best)

    # Retrain with the tuned parameters and report hold-out metrics
    model = xgboost.XGBClassifier(
        max_depth=int(best['max_depth']),
        n_estimators=int(best['n_estimators']),
        subsample=best['subsample'],
        colsample_bytree=best['colsample_bytree'],
        learning_rate=best['learning_rate'],
        reg_alpha=best['reg_alpha'],
        nthread=4)
    model.fit(train_data, train_label)
    pred = model.predict(test_data)
    print_to_log('==========训练完成==============')
    print_to_log('模型得分: ')
    print_to_log('recall:', metrics.recall_score(test_label, pred))
    print_to_log('precision:', metrics.precision_score(test_label, pred))
    print_to_log('f1_score:', metrics.f1_score(test_label, pred))
    print_to_log('auc_score:', metrics.roc_auc_score(test_label, pred))
    print_to_log(len(total_data_sample.keys()))

    # Persist model plus encoders; best-effort -- failures are only logged
    try:
        with open(model_path + 'remove.pk', 'wb') as f:
            pickle.dump(remove, f)
        with open(model_path + 'continues_dict.pk', 'wb') as f:
            pickle.dump(continues_dict, f)
        with open(model_path + 'category_dict.pk', 'wb') as f:
            pickle.dump(category_dict, f)
        joblib.dump(model, model_path + 'model.m')
        user_data.to_csv(data_path + 'employee_feature.csv', encoding='utf-8',
                         index=0, sep=',')
        bus_data.to_csv(data_path + 'business_feature.csv', encoding='utf-8',
                        index=0, sep=',')
        print_to_log('所有数据保存完毕')
    except Exception as e:
        print_to_log(e, level=4)
        print_to_log('保存文件失败,检查文件路径')
# import os # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152 # os.environ["CUDA_VISIBLE_DEVICES"] = "" from params_select import * from objective import * from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, partial, rand, space_eval from loss import weighted_categorical_crossentropy3 if __name__ == '__main__': params = {'ma': 5, 'std_window': 20, 'vol_window': 15} construct_feature_func = partial(construct_features1, params=params, test=False) data_set, reverse_func = get_data( file_name="E:\market_data/cs_market.csv", stks=zz500[:50], construct_feature_func=construct_feature_func, split_dates=["2016-01-01", "2017-01-01"]) performance_func = performance_factory(reverse_func, performance_types=[ 'Y0', 'Y', 'returns', 'cum_returns', 'annual_return', 'sharpe_ratio' ]) function = "test_weight" identity = str(uuid.uuid1()) namespace = function + '_' + identity
def best_model(self):
    """Search self.paras.hyper_opt with TPE (20 evaluations) and return the
    best point found by fmin."""
    suggest_algo = partial(tpe.suggest, n_startup_jobs=1)
    winner = fmin(self.GBM,
                  space=self.paras.hyper_opt,
                  algo=suggest_algo,
                  max_evals=20)
    print("best", winner)
    return winner
# shape 1 corresponds to (1, 256), 2 to (2, 128), and 4 to (4, 64) parameter_space = { 'shape': hp.choice('shape', [1, 2, 4]), 'units': hp.choice('units', [16, 32, 64, 128, 256, 512]), 'layers': hp.choice('layers', [1, 2, 3]), 'dense': hp.choice('dense', [32, 64, 128, 256, 512]) } trials = Trials() # number of models that will be built and evaluated using the provided choices max_evals = 225 algo = partial( tpe.suggest, n_EI_candidates=1000, gamma=0.2, n_startup_jobs=int(0.1 * max_evals), ) fmin(train_network, trials=trials, space=parameter_space, algo=algo, max_evals=max_evals, show_progressbar=False) df.to_csv('parameters.csv') best = get_best() print('\n-------------------------------------\n') print( 'Hyper-parameter space exploration ended. \nRetraining the best again on the full dataset.'
import sys if len(sys.argv) > 1: model_type = sys.argv[1] max_evals = int(sys.argv[2]) else: model_type = 'lgb' max_evals = 2 logger.debug( f'Try to search paras base on model:{model_type}, max_evals:{max_evals}' ) from functools import partial optimize_fun_ex = partial(optimize_fun, model_type=model_type) trials = Trials() space = get_search_space(model_type) best = fmin(optimize_fun_ex, space, algo=tpe.suggest, max_evals=max_evals, trials=trials) #logger.debug(f"Best: {best}") att_message = [ trials.trial_attachments(trial)['message'] for trial in trials.trials ] for score, para, misc in zip(
_counter += 1 trial_id = _counter trainer = AsymmetricSelfPlay(model_builder, model_params, env_params, eval_env_params, args.train_episodes, args.eval_episodes, args.num_evals, switch_freq, args.path + f'/{trial_id}', args.seed, args.processes) trainer.run() best_win_rate = -max(max(player_wr) for player_wr in trainer.win_rates) return best_win_rate algo = partial(tpe.suggest, n_startup_jobs=max(0, args.num_warmup_trials - trials_so_far)) best_param = fmin(wrapper, hyperparameter_space, algo=algo, max_evals=args.num_trials, trials=trials, rstate=random_state) save_run(trials, random_state, args.path) loss = [x['result']['loss'] for x in trials.trials] print("") print("##### Results")
def suggest(self):
    """Return the next parameter suggestion

    >>> import json
    >>> json_str='{"seed":0,"lib":"hyperopt","algo":"tpe","scope":{"x":["uniform",-10,10],"y":["uniform",-10,10]},'
    >>> json_str+='"max_evals":1,'
    >>> json_str+='"results":{"losses":[3.4620,3.192,28.963,19.64,20.458],'
    >>> json_str+='"statuses":["ok","ok","ok","ok","ok"],'
    >>> json_str+='"vals":{"y":[-0.16774,0.3122,-2.416,0.27455,-3.2827],'
    >>> json_str+='"x":[1.857,1.760,4.785,-4.498,2.837]}}}'
    >>> exec=ExecutorFactory.get_executor(json_str)
    >>> reval=json.loads(exec.suggest())
    >>> reval["alog"] == 'tpe'
    True
    >>> reval["scope"]["x"][0] == 4.30378732744839
    True
    >>> reval["scope"]["y"][0] == 0.9762700785464951
    True
    """
    # NOTE(review): the doctest above expects a JSON string keyed
    # "alog"/"scope", but this method returns a plain dict keyed
    # algo/statuses/vals -- the doctest looks stale; confirm before relying
    # on it.
    _logger = getLogger(__name__)
    # Number of trial ids to request; defaults to max_evals from the request
    id_qnt = int(self.json_loaded[COMMON_MAXEVALS])
    additional_args = []
    executed_alog = self.json_loaded[COMMON_ALGO]
    if executed_alog == self.HYP_ALGO_TPE:
        algo = tpe.suggest
        # TPE is invoked one trial at a time and takes two extra positional
        # arguments (prior weight, n_startup_jobs)
        id_qnt = 1
        additional_args.append(tpe._default_prior_weight)
        additional_args.append(5)  # n_startup_jobs
    elif executed_alog == self.HYP_ALGO_ANNEAL:
        algo = anneal.suggest
    elif executed_alog == self.HYP_ALGO_RAND:
        algo = rand.suggest
    elif executed_alog == self.HYP_ALGO_MIX:
        # Fixed mixture: 10% random, 20% annealing, 70% TPE
        algo = partial(mix.suggest, p_suggest=[
            (.1, rand.suggest),
            (.2, anneal.suggest),
            (.7, tpe.suggest),
        ])
    else:
        _logger.warning('unknown algo define. use tpe')
        algo = tpe.suggest
    new_ids = self.trials.new_trial_ids(id_qnt)
    # hyperopt suggest signature: (new_ids, domain, trials, seed, *extra)
    args = [new_ids, self.domain, self.trials, self.rand_seed
            ] + additional_args
    rval_docs = algo(*args)
    statuses = []
    vals = {}
    # Flatten the per-trial documents into a status list and a merged
    # name -> values mapping
    for i in range(len(new_ids)):
        statuses.append(
            rval_docs[i][self.HYP_OUT_RESULT][self.HYP_OUT_STATUS])
        vals = self.merge_dict_valuelist(
            vals, rval_docs[i][self.HYP_OUT_MISC][self.HYP_OUT_VALS])
    results = dict(algo=executed_alog, statuses=statuses, vals=vals)
    return results
def train_model(self):
    """Load (or build) the dataset, publish the run settings to module
    globals, explore the hyper-parameter space with hyperopt, then retrain
    the best configuration on the full dataset."""
    # Dataset selection mirrors the CLI options: -d wins, then scale-down,
    # then scale-up (unsupported here), else the default dataset.
    if self.data_dir:
        load_dataset(self, self.data_dir)
    elif self.scale_down:
        make_new_dataset(self)
    elif self.scale_up:
        raise SystemExit(
            'Please refer documentation. Requires you to prepare the dataset on your own and then use -d option.'
        )
    else:
        load_dataset(self)

    # train_network only takes one and only one argument, so the run
    # configuration is handed over through module-level globals.
    global percent, block_size, scenario, gpu, output, verbose, new_model, no_of_classes
    percent = self.percent
    block_size = self.block_size
    scenario = self.scenario
    gpu = self.gpus
    output = self.output
    new_model = self.new_model
    if self.scale_down:
        no_of_classes = len(list(open(self.scale_down, 'r')))
    if self.v:
        verbose = 0
    elif self.vv:
        verbose = 1
    elif self.vvv:
        verbose = 2

    search_space = {
        'layers': hp.choice('layers', [1, 2, 3]),
        'embed_size': hp.choice('embed_size', [16, 32, 48, 64]),
        'filter': hp.choice('filter', [16, 32, 64, 128]),
        'kernel': hp.choice('kernel', [3, 11, 19, 27, 35]),
        'pool': hp.choice('pool', [2, 4, 6, 8]),
        'dense': hp.choice('dense', [16, 32, 64, 128, 256]),
    }
    trial_log = Trials()

    def _tpe_algo():
        # TPE tuned for a wide space: many EI candidates, aggressive gamma,
        # 10% of the budget spent on random start-up trials.
        return partial(
            tpe.suggest,
            n_EI_candidates=1000,
            gamma=0.2,
            n_startup_jobs=int(0.1 * self.max_evals),
        )

    requested = self.algo.lower()
    if requested == 'tpe':
        suggest_fn = _tpe_algo()
    elif requested == 'rand':
        suggest_fn = rand.suggest
    else:
        print(
            'Warning! The requested hyper-parameter algorithm is not supported. Using TPE.'
        )
        suggest_fn = _tpe_algo()

    fmin(train_network,
         trials=trial_log,
         space=search_space,
         algo=suggest_fn,
         max_evals=self.max_evals,
         show_progressbar=False)
    df.to_csv(os.path.join(self.output, 'parameters.csv'))
    best = get_best()
    print('\n-------------------------------------\n')
    print(
        'Hyper-parameter space exploration ended. \nRetraining the best again on the full dataset.'
    )
    percent = 1  # retrain on 100% of the data
    train_network(best)
    print('The best model has been retrained and saved as {}.'.format(
        self.new_model))
# best : {'gamma': 0.4, 'learning_rate': 0.05740649534056902, 'max_depth': 5, 'min_child_weight': 6, 'n_estimators': 166, 'subsample': 0.6} # best param after transform : # {'gamma': 0.04000000000000001, 'learning_rate': 0.05114812990681138, 'max_depth': 10, 'min_child_weight': 7, 'n_estimators': 316, 'subsample': 0.56} # rmse of the best xgboost: 6136.126337046346 import matplotlib # Force matplotlib to not use any Xwindows backend. matplotlib.use('Agg') from hyperopt import fmin, tpe, hp, partial import numpy as np from sklearn.externals import joblib from sklearn.model_selection import train_test_split, cross_val_score from sklearn.metrics import mean_squared_error, zero_one_loss from sklearn.metrics import log_loss import xgboost as xgb import pandas as pd from xgboost import plot_importance from matplotlib import pyplot as plt totalcount = 50 Devide = "edge" #----------------------------------------------01---------------------------------------------------------- attribute = pd.read_csv(str(totalcount) + '-mix-two-data-1.csv') label = pd.read_csv(str(totalcount) + '-mix-two-label.csv') #print(waferA.info()) #print(waferL.info()) #mapping_type={'Center':0,'Donut':1,'Edge-Loc':2,'Edge-Ring':3,'Loc':4,'Random':5,'Scratch':6,'Near-full':7,'none':8} label = label['totalLatency'] attribute = attribute.loc[:, ~attribute.columns.str.contains('^Unnamed')] X = attribute y = label # ############################################################
def Bayesian_optimize_poly(df_minmax, force):
    '''
    Bayesian (hyperopt TPE) optimisation of the polynomial-regression degree.
    Tutorial: https://github.com/FontTian/hyperopt-doc-zh/wiki/FMin

    :param df_minmax: min-max normalised DataFrame with 'x/d', 'y/d' and
        force columns
    :param force: name of the target force column (radial or longitudinal)
    :return: best hyper-parameters found for the polynomial regression
    '''
    space = {"param_degree": hp.randint("param_degree", 5, 15)}  # search domain

    # Leave-one-group-out reference values: the 4 most frequent x/d values
    # and the 7 most frequent y/d values each define one held-out fold.
    ref_list = (
        [pd.DataFrame(df_minmax['x/d'].value_counts()).iloc[i].name
         for i in range(4)]
        + [pd.DataFrame(df_minmax['y/d'].value_counts()).iloc[i].name
           for i in range(7)]
    )

    def _fold_mse(model, col, ref):
        # Hold out all rows whose *col* equals *ref*; fit on the remainder
        # and return the MSE on the held-out rows.
        df_train = df_minmax[df_minmax[col] != ref]
        df_hold = df_minmax[df_minmax[col] == ref]
        X_data = np.array(df_train[['x/d', 'y/d']]).astype(np.float32)
        y_data = np.array(df_train[[force]]).flatten().astype(np.float32)
        X_test = np.array(df_hold[['x/d', 'y/d']]).astype(np.float32)
        y_test = np.array(df_hold[[force]]).flatten().astype(np.float32)
        model.fit(X_data, y_data)
        return mean_squared_error(y_test, model.predict(X_test))

    def Polynomia_func(argsDict):
        model = Pipeline([
            ('poly', PolynomialFeatures(degree=argsDict["param_degree"])),
            ('linear', LinearRegression())
        ])
        # The first 4 refs are x/d groups, the remaining 7 are y/d groups.
        # (The original duplicated the whole fold body for each case; the
        # two identical branches are collapsed into _fold_mse.)
        mse_list = [
            _fold_mse(model, 'x/d' if i < 4 else 'y/d', ref_list[i])
            for i in range(len(ref_list))
        ]
        return np.mean(mse_list)

    trials = Trials()
    algo = partial(tpe.suggest, n_startup_jobs=1)
    best = fmin(Polynomia_func, space, algo=algo, max_evals=100, trials=trials)
    return best
def Bayesian_optimize_nn(df_minmax, force):
    '''
    Bayesian (hyperopt TPE) optimisation of a small Keras MLP.

    :param df_minmax: min-max normalised DataFrame with 'x/d', 'y/d' and
        force columns
    :param force: name of the target force column (radial or longitudinal)
    :return: best hyper-parameters found for the neural-network model
    '''
    space = {
        'units1': hp.choice('units1', [16, 64, 128, 320, 512]),
        'units2': hp.choice('units2', [16, 64, 128, 320, 512]),
        'units3': hp.choice('units3', [16, 64, 128, 320, 512]),
        'lr': hp.choice('lr', [0.01, 0.001, 0.0001]),
        'activation': hp.choice('activation',
                                ['relu', 'sigmoid', 'tanh', 'linear']),
        'loss': hp.choice('loss', [losses.logcosh, losses.mse, losses.mae,
                                   losses.mape])
    }
    # Leave-one-group-out reference values: the 4 most frequent x/d values
    # and the 7 most frequent y/d values each define one held-out fold.
    ref_list = (
        [pd.DataFrame(df_minmax['x/d'].value_counts()).iloc[i].name
         for i in range(4)]
        + [pd.DataFrame(df_minmax['y/d'].value_counts()).iloc[i].name
           for i in range(7)]
    )

    def _fold_mse(final_model, col, ref):
        # Hold out rows with col == ref, train for 30 epochs on the rest and
        # return the MSE on the held-out rows.
        df_train = df_minmax[df_minmax[col] != ref]
        df_hold = df_minmax[df_minmax[col] == ref]
        X_data = np.array(df_train[['x/d', 'y/d']]).astype(np.float32)
        y_data = np.array(df_train[[force]]).flatten().astype(np.float32)
        X_test = np.array(df_hold[['x/d', 'y/d']]).astype(np.float32)
        y_test = np.array(df_hold[[force]]).flatten().astype(np.float32)
        final_model.fit(X_data, y_data,
                        epochs=30,
                        batch_size=256,
                        verbose=0,
                        validation_data=(X_test, y_test),
                        shuffle=True)
        return mean_squared_error(y_test, final_model.predict(X_test))

    def experiment(params):
        # 3-hidden-layer MLP over the (x/d, y/d) inputs
        main_input = Input(shape=(2, ), name='main_input')
        x = Dense(params['units1'], activation=params['activation'])(main_input)
        x = Dense(params['units2'], activation=params['activation'])(x)
        x = Dense(params['units3'], activation=params['activation'])(x)
        output = Dense(1, activation="linear", name="out")(x)
        final_model = Model(inputs=[main_input], outputs=[output])
        final_model.compile(optimizer=Adam(lr=params['lr']),
                            loss=params['loss'])
        # The first 4 refs are x/d groups, the remaining 7 are y/d groups.
        # (The original duplicated the whole fold body; collapsed into
        # _fold_mse. The unused ``history`` local was dropped.)
        mse_list = [
            _fold_mse(final_model, 'x/d' if i < 4 else 'y/d', ref_list[i])
            for i in range(len(ref_list))
        ]
        mse = np.mean(mse_list)
        print('mse', mse)
        return mse

    algo = partial(tpe.suggest, n_startup_jobs=1)
    best = fmin(experiment, space, algo=algo, max_evals=200)
    return best
def pipeline(path):
    """Train one LightGBM model for the genomic locus stored at *path*.

    Loads the .npy matrix (features + last-column label), skips unreadable,
    single-class or low-frequency loci, tunes LightGBM with hyperopt TPE and
    saves the trained model as <save_path>/<name>.lgb.

    :param path: path to the locus .npy file
    :return: 0 when the locus is skipped; otherwise None (model is saved as
        a side effect)
    """
    max_evals = 30
    _, name, _, _ = tool.splitPath(path)
    logger.info(f'开始训练位点: {name}')
    print(f'开始训练位点: {name}')
    data = np.load(path)
    try:
        X, Y = data[:, :-1], data[:, -1]
    except Exception:
        # bug fix: was a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit
        logger.info(f'位点: {name} 文件读取错误')
        print(f'位点: {name} 文件读取错误')
        return 0
    if len(np.unique(Y)) == 1:
        logger.info(f'位点: {name} 只有一种类标签')
        print(f'位点: {name} 只有一种类标签')
        return 0
    # Majority/minority class counts; skip loci with minor ratio below 1%.
    # NOTE(review): assumes the labels are exactly 0 and 1 -- confirm.
    tmp = dict(Counter(Y.tolist()))
    if tmp[0] > tmp[1]:
        ma, mi = tmp[0], tmp[1]
    else:
        ma, mi = tmp[1], tmp[0]
    if mi / ma < 0.01:
        logger.info(f'位点: {name} 为低频位点')
        print(f'位点: {name} 为低频位点')
        return 0
    space = {
        "num_leaves": hp.randint("num_leaves", 5),  # [0, upper)
        "max_depth": hp.choice("max_depth",
                               [-1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
        "learning_rate": hp.uniform("learning_rate", 0.001, 2),  # uniform on 0.001-2
        "n_estimators": hp.randint("n_estimators", 5),  # [0, upper)
        "min_child_weight": hp.uniform("min_child_weight", 0.001, 0.01),
        "min_child_samples": hp.randint("min_child_samples", 10),  # [0, upper)
        "subsample": hp.randint("subsample", 4),
        "colsample_bytree": hp.choice("colsample_bytree",
                                      [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]),
        "reg_alpha": hp.choice("reg_alpha", [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1]),
        "reg_lambda": hp.choice("reg_lambda",
                                [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]),
        # smuggle the data path into the objective as a constant choice
        "path": hp.choice('path', [path])
    }
    star = time.time()
    algo = partial(tpe.suggest, n_startup_jobs=1)  # optimisation algorithm
    # max_evals caps the number of models tried; larger values explore more
    best = fmin(LGB, space, algo=algo, max_evals=max_evals)
    best = RECOVERLGB(best)
    TRAINLGB(X, Y, best, name, save_path + name + '.lgb', logger)
    end = time.time()
    times = end - star
    logger.info(f'位点: {name} 用时为: {times}')
'activation': hp.choice('activation', ['identity', 'logistic', 'tanh', 'relu']), 'solver': hp.choice('solver', ['lbfgs', 'sgd', 'adam']), 'batch_size': hp.uniform('batch_size', 1, 50), 'early_stopping': hp.choice('early_stopping', [True, False]), } space_SVM = { 'C': hp.uniform('C', 0.1, 50), 'kernel': hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']), 'degree': hp.uniform('degree', 1, 10), 'coef0': hp.uniform('coef0', 0, 10), } space_LR = { 'penalty': hp.choice('penalty', ['l1', 'l2']), 'C': hp.uniform('C', 0.1, 20), 'intercept_scaling': hp.randint('intercept_scaling', 100), 'solver': hp.choice('solver', ['liblinear', 'saga']), 'warm_start': hp.choice('warm_start', [True, False]), } algo = partial(tpe.suggest) trials = Trials() best = fmin(percept, space, algo=algo, max_evals=200, trials=trials) # print(best) print(space_eval(space, best)) print(percept(space_eval(space, best))) print("test")
def ks_fmin(self, space4model):
    """Minimise self.ks over *space4model* with TPE (100 evaluations),
    recording trials on self.trials, and return the best point."""
    suggest = partial(tpe.suggest, n_startup_jobs=1)
    return fmin(self.ks,
                space4model,
                algo=suggest,
                max_evals=100,
                trials=self.trials)
def Hyperopt_get_best_parameters(Metrics='roc_auc', evals_num=30):
    """Tune LogisticRegression (C, penalty) with hyperopt TPE using 5-fold
    out-of-fold scores, then compare the tuned vs default cross-val score.

    :param Metrics: scoring metric -- 'roc_auc', 'accuracy' or 'f1'
    :param evals_num: number of TPE evaluations
    :return: dict of best params, with 'penalty' decoded back to its string
    NOTE(review): reads module-level X_train / y_train -- confirm they exist.
    """
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, partial

    penalty_list = ['l1', 'l2']
    parameter_space = {
        'C': hp.uniform('C', 0, 1),
        'penalty': hp.choice('penalty', penalty_list),
    }

    # evaluation counter shared with the nested objective
    count = 0

    def fuction_model(params):
        # Out-of-fold evaluation of one hyper-parameter point.
        # bug fix: the original declared ``global count`` which raises
        # NameError unless a module-level ``count`` exists; the counter lives
        # in the enclosing scope, so ``nonlocal`` is correct.
        # (Two unused inner helpers from the original were removed.)
        nonlocal count
        folds = KFold(n_splits=5, shuffle=True, random_state=546789)
        train_preds = np.zeros(X_train.shape[0])
        train_class = np.zeros(X_train.shape[0])
        # the passenger id column must be excluded from the features too
        feats = [
            f for f in X_train.columns
            if f not in ['Survived', 'PassengerId']
        ]
        for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X_train)):
            trn_x, trn_y = X_train[feats].iloc[trn_idx], y_train.iloc[trn_idx]
            val_x, val_y = X_train[feats].iloc[val_idx], y_train.iloc[val_idx]
            clf = LogisticRegression(**params, random_state=123)
            clf.fit(trn_x, trn_y)
            train_preds[val_idx] = clf.predict_proba(val_x)[:, 1]
            train_class[val_idx] = clf.predict(val_x)
            del clf, trn_x, trn_y, val_x, val_y
            gc.collect()
        count = count + 1
        if Metrics == 'roc_auc':
            score = roc_auc_score(y_train, train_preds)
        elif Metrics == 'accuracy':
            score = accuracy_score(y_train, train_class)
        elif Metrics == 'f1':
            score = f1_score(y_train, train_class)
        print("第%s次,%s score为:%f" % (str(count), Metrics, score))
        return -score

    algo = partial(tpe.suggest, n_startup_jobs=20)
    trials = Trials()
    # max_evals: number of optimisation iterations
    best = fmin(fuction_model,
                parameter_space,
                algo=algo,
                max_evals=evals_num,
                trials=trials)
    # hp.choice returns the option index -- map it back to the penalty string
    best["penalty"] = penalty_list[best['penalty']]
    print('best:\n', best)
    clf = LogisticRegression(**best, random_state=123)
    phsorce = cross_val_score(clf, X_train, y_train, cv=5,
                              scoring=Metrics).mean()
    print('贝叶斯优化参数得分:', phsorce)
    clf = LogisticRegression(random_state=123)
    nosorce = cross_val_score(clf, X_train, y_train, cv=5,
                              scoring=Metrics).mean()
    print('自己调参数得分:', nosorce)
    return best
return score(pred, valid_y) def score(pred, y): ''' 给最后测试结果打分,根据不同的标准,这里需要每次都改 ''' metric = rmse(y, pred) print(metric) return metric if __name__ == '__main__': train_x, valid_x, train_y, valid_y = get_train_dataset() param_space_reg_skl_lasso = { 'alpha': hp.loguniform("alpha", numpy.log(0.00001), numpy.log(0.1)), 'random_state': skl_random_seed, "max_evals": lasso_max_evals, } best = fmin(objective, param_space_reg_skl_lasso, algo=partial(tpe.suggest, n_startup_jobs=1), max_evals=100, trials=Trials()) print(best) print(objective(best))
def lgbTraining(x_train, y_train, p):
    """Tune a LightGBM regressor with hyperopt TPE and return (MAE, best).

    :param x_train: feature DataFrame (``.values`` is used for the split)
    :param y_train: target Series/DataFrame
    :param p: rated power -- used to filter out near-zero targets
        (y <= 0.03 * p) when scoring and to normalise the final MAE
    :return: (relative MAE of the best model, best raw hyperopt params)
    """
    train_x, valid_x, train_y, valid_y = train_test_split(x_train.values,
                                                          y_train.values,
                                                          test_size=0.3,
                                                          random_state=42)
    train = lgb.Dataset(train_x, train_y)
    valid = lgb.Dataset(valid_x, valid_y, reference=train)
    # Custom hyperopt search space: raw integer draws that are rescaled to
    # the real parameter ranges by argsDict_tranform below
    space = {
        "max_depth": hp.randint("max_depth", 15),
        "num_trees": hp.randint("num_trees", 20),
        'learning_rate': hp.randint('learning_rate', 20),
        "num_leaves": hp.randint("num_leaves", 10),
        "lambda_l1": hp.randint("lambda_l1", 6)
    }

    def argsDict_tranform(argsDict, isPrint=False):
        # Map the raw randint draws onto the intended parameter ranges.
        # NOTE(review): mutates argsDict in place as well as returning it.
        argsDict["max_depth"] = argsDict["max_depth"] + 10
        argsDict["num_trees"] = argsDict["num_trees"] * 5 + 100
        argsDict["learning_rate"] = argsDict["learning_rate"] * 0.01 + 0.01
        argsDict["num_leaves"] = argsDict["num_leaves"] * 3 + 10
        argsDict["lambda_l1"] = argsDict["lambda_l1"] * 0.1
        if isPrint:
            print(argsDict)
        else:
            pass
        return argsDict

    def lightgbm_factory(argsDict):
        # hyperopt objective: train with early stopping on the validation
        # split and score with the filtered MAE from get_transformer_score
        argsDict = argsDict_tranform(argsDict)
        params = {
            'nthread': -1,  # number of threads
            'max_depth': argsDict['max_depth'],  # maximum tree depth
            'num_trees': argsDict['num_trees'],  # number of trees
            'learning_rate': argsDict['learning_rate'],  # learning rate
            'num_leaves': argsDict['num_leaves'],  # leaves per tree
            'lambda_l1': argsDict["lambda_l1"],  # L1 regularisation
            'lambda_l2': 0,  # L2 regularisation
            'objective': 'regression',
            'bagging_seed': 100  # random seed (LightGBM default is 100)
        }
        params['metric'] = ['mae']
        model_lgb = lgb.train(params,
                              train,
                              num_boost_round=20000,
                              valid_sets=[valid],
                              early_stopping_rounds=100)
        return get_transformer_score(model_lgb)

    # Keep only rows whose actual power exceeds 3% of rated power p
    valid_y_new = valid_y[valid_y > 0.03 * p]
    valid_y_new_index = np.argwhere(valid_y > 0.03 * p)

    def get_transformer_score(transformer):
        # MAE of the best iteration restricted to the filtered rows.
        # NOTE(review): np.argwhere yields 2-D indices, so prediction_new is
        # shaped (n, 1) while valid_y_new is (n,) -- confirm sklearn's MAE
        # treats this as intended.
        model = transformer
        prediction = model.predict(valid_x, num_iteration=model.best_iteration)
        prediction_new = prediction[valid_y_new_index]
        return mean_absolute_error(valid_y_new, prediction_new)

    # Run the TPE search; max_evals caps the number of models trained --
    # larger values make finding the optimum more likely
    algo = partial(tpe.suggest, n_startup_jobs=1)
    best = fmin(lightgbm_factory, space, algo=algo, max_evals=100,
                pass_expr_memo_ctrl=None)
    # Retrain once with the best raw params to get the final MAE,
    # normalised by rated power p
    MAE = lightgbm_factory(best) / p
    return MAE, best