# Load the training split; the one-hot variant is kept for reference:
# X_train, y_train, _, _, _, _ = clean_data.one_hot_categorical(shift)
X_train, y_train, _, _, _, _ = clean_data.fancy(shift=200)
# Fixed: original used the Python-2 `print x, y` statement form.
print(X_train.shape, y_train.shape)

# Warm-start the Bayesian optimizer from previously evaluated points.
previous_points = pd.read_csv('params/lgbt_params.csv')

# Search space for the LightGBM hyperparameters.
lgbtBO = BayesianOptimization(
    lgbt_evaluate, {
        'num_leaves': (100, 300),
        'min_data_in_leaf': (4, 20),
        'feature_fraction': (0.2, 1),
        'bagging_fraction': (0.9, 1),
        # 'bagging_freq': (90, 110)
    })
lgbtBO.initialize_df(previous_points)
lgbtBO.maximize(init_points=init_points, n_iter=num_iter)

# Save results.
param_cache_path = 'params'
# Fixed: replaces `try: os.mkdir(...) except: pass` — the bare except
# silently swallowed every error, not just "directory already exists".
os.makedirs(param_cache_path, exist_ok=True)
file_name = 'params/lgbt_params_1.csv'
lgbtBO.points_to_csv(file_name)
def hyper_train(self,
                num_epoch,
                epoch_tail,
                bo_num_iter,
                bo_kappa,
                bo_min_rand_num,
                bo_results_filename,
                synch_file_list=None,
                sync_period=5):
    """
    A point of entry for multiple training procedure.

    Parameters:
    ----------
    num_epoch : int
        maximal number of training epochs
    epoch_tail : int
        number of epochs for overfitting detection
    bo_num_iter : int
        number of attempts for bayesian optimization
    bo_kappa : float
        kappa parameter for bayesian optimization
    bo_min_rand_num : int
        minimal number of random attempts for overfitting detection
    bo_results_filename : str
        name of file for results of bayesian optimization
    synch_file_list : list of str
        names of files for synchronization of several instances of hyper
        optimizers (default: no synchronization)
    sync_period : int
        number of attempts between synchronizations of several instances
        of hyper optimizers
    """
    # Fixed: `synch_file_list=[]` was a mutable default argument shared
    # across calls; `None` sentinel is backward-compatible.
    if synch_file_list is None:
        synch_file_list = []

    # The search space is the union of the bounds declared by the model,
    # the optimizer, and the data source.
    param_bounds = dict()
    param_bounds.update(self.model.param_bounds)
    param_bounds.update(self.optimizer.param_bounds)
    param_bounds.update(self.data_source.param_bounds)

    self.num_epoch = num_epoch
    self.epoch_tail = epoch_tail

    # A zero-length log file counts as "no previous history".
    hyper_log_file_exist = os.path.exists(
        self.saver.hyper_log_filename) and (os.path.getsize(
            self.saver.hyper_log_filename) > 0)
    self.hyper_log_file = open(self.saver.hyper_log_filename, "a")

    bo = BayesianOptimization(self._hyper_train_target,
                              param_bounds,
                              verbose=1)
    # At least one random probe per search dimension.
    bo_init_points = max(bo_min_rand_num, len(param_bounds.keys()))
    if hyper_log_file_exist:
        try:
            df = pd.read_csv(self.saver.hyper_log_filename, sep="\t")
            if df.isnull().values.any():
                # A partially written log cannot be used to warm-start.
                raise ValueError("hyper log contains NaN values")
            if len(df.index) > 0:
                # Warm-start from the logged history and resume counters.
                bo.initialize_df(df)
                self.iter = df['iter'].max()
                global_best_row_id = df['target'].idxmax()
                self.global_best_value = df['target'].loc[
                    global_best_row_id]
                self.global_best_iter = df['iter'].loc[global_best_row_id]
                # Already-logged iterations count toward the random budget.
                bo_init_points = max(0, bo_init_points - self.iter)
            else:
                self._init_iter()
        except Exception as e:
            # Fixed: was a bare `except:` (would also trap KeyboardInterrupt)
            # that discarded the underlying error; chain it instead.
            raise Exception(
                "Error: Hyperparameter optimization's file can't be properly read: {}"
                .format(self.saver.hyper_log_filename)) from e
    else:
        self._init_iter()

    self.hyper_log_file.write("\t".join(["iter", "target", "time"] +
                                        sorted(param_bounds)) + "\n")
    self.hyper_log_file.flush()
    try:
        if len(synch_file_list) == 0:
            # Single-instance mode: one uninterrupted optimization run.
            bo.maximize(init_points=bo_init_points,
                        n_iter=bo_num_iter,
                        kappa=bo_kappa)
        else:
            # Multi-instance mode: spend the random budget first, then
            # periodically merge all instances' logs and restart the
            # optimizer on the merged history.
            if bo_init_points > 0:
                bo.maximize(init_points=bo_init_points,
                            n_iter=0,
                            kappa=bo_kappa)
            for i in range(0, bo_num_iter, sync_period):
                df = self._update_df(self.saver.hyper_log_filename, df=None)
                for synch_filename in synch_file_list:
                    df = self._update_df(synch_filename, df)
                bo = BayesianOptimization(self._hyper_train_target,
                                          param_bounds,
                                          verbose=1)
                bo.initialize_df(df)
                bo.maximize(init_points=0,
                            n_iter=sync_period,
                            kappa=bo_kappa)
    except KeyboardInterrupt:
        # Fall through to the saving code below so partial results survive
        # a manual interrupt.
        print("KeyboardInterrupt: saving...")
    bo.points_to_csv(
        os.path.join(self.saver.project_dirname, bo_results_filename))
    self.hyper_log_file.close()
    print(bo.res['max'])
    print("Results: global_best_value={}, global_best_iter={}".format(
        self.global_best_value, self.global_best_iter))