# NOTE(review): fragment of a larger script — the enclosing function header is
# not visible here, and the indentation is inconsistent (L1 at column 0, the
# rest indented), which suggests a broken extraction. Python 2 syntax
# (`print` statement on the next lines) — confirm target interpreter.
# Load training data only; the remaining splits are discarded.
X_train, y_train, _, _, _, _ = clean_data.fancy(shift=200)
    # X_train, y_train, _, _, _, _ = clean_data.one_hot_categorical(shift)
    print X_train.shape, y_train.shape

    # Warm-start the optimizer from previously evaluated points, if any.
    previous_points = pd.read_csv('params/lgbt_params.csv')

    # Bayesian optimization over LightGBM hyperparameters; bounds below
    # define the search space for each tunable parameter.
    lgbtBO = BayesianOptimization(
        lgbt_evaluate,
        {
            'num_leaves': (100, 300),
            'min_data_in_leaf': (4, 20),
            'feature_fraction': (0.2, 1),
            'bagging_fraction': (0.9, 1),
            # 'bagging_freq': (90, 110)
        })

    # Seed the optimizer with the previously observed (params, target) rows.
    lgbtBO.initialize_df(previous_points)

    # `init_points` / `num_iter` are defined outside this fragment.
    lgbtBO.maximize(init_points=init_points, n_iter=num_iter)

    # Save results
    param_cache_path = 'params'
    try:
        os.mkdir(param_cache_path)
    except:
        # NOTE(review): bare except silently ignores all errors, not just
        # "directory already exists" — consider os.makedirs(..., exist_ok=True).
        pass

    file_name = 'params/lgbt_params_1.csv'
    lgbtBO.points_to_csv(file_name)
Beispiel #2
0
    def hyper_train(self,
                    num_epoch,
                    epoch_tail,
                    bo_num_iter,
                    bo_kappa,
                    bo_min_rand_num,
                    bo_results_filename,
                    synch_file_list=None,
                    sync_period=5):
        """
        A point of entry for multiple training procedure.

        Runs Bayesian optimization over the hyperparameter bounds declared
        by the model, optimizer and data source, resuming from an existing
        hyper-log file when one is present, and optionally synchronizing
        progress with other optimizer instances through shared log files.

        Parameters:
        ----------
        num_epoch : int
            maximal number of training epochs
        epoch_tail : int
            number of epochs for overfitting detection
        bo_num_iter : int
            number of attempts for bayesian optimization
        bo_kappa : float
            kappa parameter for bayesian optimization
        bo_min_rand_num : int
            minimal number of random attempts for overfitting detection
        bo_results_filename : str
            name of file for results of bayesian optimization
        synch_file_list : list of str, optional
            names of files for synchronization of several instances of hyper
            optimizers; empty/None disables synchronization
        sync_period : int
            number of attempts between synchronizations of several instances
            of hyper optimizers

        Raises:
        ------
        Exception
            if an existing hyper-log file cannot be properly read.
        """
        # Fix for the former mutable default argument ([]): a None sentinel
        # avoids one shared list object across calls.
        if synch_file_list is None:
            synch_file_list = []

        # Merge the search bounds contributed by every tunable component.
        param_bounds = dict()
        param_bounds.update(self.model.param_bounds)
        param_bounds.update(self.optimizer.param_bounds)
        param_bounds.update(self.data_source.param_bounds)

        self.num_epoch = num_epoch
        self.epoch_tail = epoch_tail

        # A non-empty log file means a previous run can be resumed.
        hyper_log_file_exist = os.path.exists(
            self.saver.hyper_log_filename) and (os.path.getsize(
                self.saver.hyper_log_filename) > 0)

        self.hyper_log_file = open(self.saver.hyper_log_filename, "a")
        try:
            bo = BayesianOptimization(self._hyper_train_target,
                                      param_bounds,
                                      verbose=1)
            # At least one random probe per search dimension.
            bo_init_points = max(bo_min_rand_num, len(param_bounds.keys()))

            if hyper_log_file_exist:
                try:
                    df = pd.read_csv(self.saver.hyper_log_filename, sep="\t")

                    # NaNs indicate a corrupted or truncated log line.
                    if df.isnull().values.any():
                        raise ValueError("hyper log contains NaN values")

                    if len(df.index) > 0:
                        bo.initialize_df(df)

                        # Restore iteration counter and the best result so far.
                        self.iter = df['iter'].max()
                        global_best_row_id = df['target'].idxmax()
                        self.global_best_value = df['target'].loc[
                            global_best_row_id]
                        self.global_best_iter = df['iter'].loc[
                            global_best_row_id]

                        # Random init points already consumed by the previous
                        # run need not be repeated.
                        bo_init_points = max(0, bo_init_points - self.iter)
                    else:
                        self._init_iter()
                except Exception as err:
                    # Narrowed from a bare `except:` (which also swallowed
                    # KeyboardInterrupt); chain the cause for debuggability.
                    raise Exception(
                        "Error: Hyperparameter optimization's file can't be properly read: {}"
                        .format(self.saver.hyper_log_filename)) from err
            else:
                self._init_iter()
                # Fresh log: emit the tab-separated header row.
                self.hyper_log_file.write(
                    "\t".join(["iter", "target", "time"] +
                              sorted(param_bounds)) + "\n")
                self.hyper_log_file.flush()

            try:
                if len(synch_file_list) == 0:
                    # Single-instance mode: run the full optimization at once.
                    bo.maximize(init_points=bo_init_points,
                                n_iter=bo_num_iter,
                                kappa=bo_kappa)
                else:
                    # Multi-instance mode: alternate short optimization bursts
                    # with merges of the logs written by the other instances.
                    if bo_init_points > 0:
                        bo.maximize(init_points=bo_init_points,
                                    n_iter=0,
                                    kappa=bo_kappa)
                    for i in range(0, bo_num_iter, sync_period):
                        df = self._update_df(self.saver.hyper_log_filename,
                                             df=None)
                        for synch_filename in synch_file_list:
                            df = self._update_df(synch_filename, df)
                        # Rebuild the optimizer from the merged history so it
                        # sees points evaluated by the other instances.
                        bo = BayesianOptimization(self._hyper_train_target,
                                                  param_bounds,
                                                  verbose=1)
                        bo.initialize_df(df)
                        bo.maximize(init_points=0,
                                    n_iter=sync_period,
                                    kappa=bo_kappa)
            except KeyboardInterrupt:
                # Manual stop is allowed; results gathered so far get saved.
                print("KeyboardInterrupt: saving...")
            bo.points_to_csv(
                os.path.join(self.saver.project_dirname, bo_results_filename))
        finally:
            # Close the log even when reading/optimization raises (the
            # original leaked the handle on the error path).
            self.hyper_log_file.close()

        print(bo.res['max'])
        print("Results: global_best_value={}, global_best_iter={}".format(
            self.global_best_value, self.global_best_iter))