# Example no. 1
def xgb_randomcv(reg,
                 params,
                 x_train,
                 y_train,
                 x_test,
                 n_iters=10,
                 cv=3,
                 random_state=0):
    """Randomized hyper-parameter search for an XGBoost regressor.

    Draws ``n_iters`` parameter settings from ``params``, refits ``reg``
    with each setting via cross-validated prediction, and collects the
    per-setting results.

    Parameters
    ----------
    reg : estimator
        XGBoost regressor following the scikit-learn estimator API.
    params : dict
        Parameter name -> list/distribution of candidate values.
    x_train, y_train : array-like
        Training features and targets.
    x_test : array-like
        Features to predict for each sampled setting.
    n_iters : int
        Number of parameter settings to sample.
    cv : int
        Number of cross-validation folds.
    random_state : int
        Seeds both the parameter sampler and the CV fold split.

    Returns
    -------
    tuple
        (test predictions, train predictions, MAE scores, best tree
        counts, sampled parameter settings) — one entry per setting.
    """
    np.random.seed(random_state)
    seed1 = np.random.randint(10000)  # seed for parameter sampling
    seed2 = np.random.randint(10000)  # fold seed, shared across all settings
    param_list = list(model_selection.ParameterSampler(params, n_iters, seed1))
    y_test_pred_list = []
    y_train_pred_list = []
    mae_list = []
    ntree_list = []
    for p in param_list:
        # set_params validates parameter names and supports nested
        # "component__param" keys, unlike the previous setattr loop which
        # silently accepted typos.
        reg.set_params(**p)
        y_test_pred_, y_train_pred_, mae_, ntree_ = \
            cv_predict_xgb(reg, x_train, y_train, x_test, cv, seed2)
        y_test_pred_list.append(y_test_pred_)
        y_train_pred_list.append(y_train_pred_)
        mae_list.append(mae_)
        ntree_list.append(ntree_)

    return y_test_pred_list, y_train_pred_list, mae_list, ntree_list, param_list
def xgb_randomcv(reg,
                 params,
                 x_train,
                 y_train,
                 x_test,
                 n_iters=10,
                 cv=3,
                 random_state=0):
    """Random-search CV for an XGBoost regressor.

    Samples ``n_iters`` settings from ``params``, applies each to ``reg``
    and evaluates it with :func:`cv_predict_xgb`, accumulating the
    predictions, objective values and best tree counts per setting.

    Returns
    -------
    tuple
        (test predictions, train predictions, objective values, best
        tree counts, sampled parameter settings).
    """
    np.random.seed(random_state)
    sampler_seed = np.random.randint(10000)  # drawn first: parameter sampling
    fold_seed = np.random.randint(10000)     # drawn second: shared CV folds
    param_list = list(
        model_selection.ParameterSampler(params, n_iters, sampler_seed))

    test_preds, train_preds, objectives, best_rounds = [], [], [], []
    for setting in param_list:
        reg.set_params(**setting)
        test_p, train_p, objective, rounds = cv_predict_xgb(
            reg, x_train, y_train, x_test, cv, fold_seed)
        test_preds.append(test_p)
        train_preds.append(train_p)
        objectives.append(objective)
        best_rounds.append(rounds)

    return test_preds, train_preds, objectives, best_rounds, param_list
 def _get_param_iterator(self):
     """Build the ParameterSampler used to draw candidate settings."""
     sampler = model_selection.ParameterSampler(
         self.param_distributions,
         self.n_iter,
         random_state=self.random_state)
     return sampler
    # Load the train/test CSVs and concatenate them so that categorical
    # levels are encoded consistently across both splits.
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    train.drop(['id', 'loss'], axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    train_test = pd.concat((train, test))
    values = set()  # NOTE(review): rebound inside the loop below; this initial set is unused
    cols = list(train.columns)
    # Columns whose name contains 'cat' are treated as categorical.
    cols_cat = [i for i in cols if 'cat' in i]
    for c in cols_cat:
        values = list(np.unique(train_test[c].values))
        # Map each category label to its code via `encode` (defined elsewhere
        # in this file — presumably a label->number scheme; TODO confirm).
        to_replace = {v: encode(v) for v in values}
        train_test[c].replace(to_replace, inplace=True)
        print(c, list(np.unique(train_test[c].values)))

    save_data('train_test_encoded.pkl', train_test)

    #%% parameter list
    # Candidate hyper-parameter grid for the XGBoost random search.
    params = {}
    params['max_depth'] = [9, 12, 15, 18, 21, 24]
    #params['max_depth'] = [1, 2]
    params['learning_rate'] = [0.01]
    params['subsample'] = [0.2, 0.3, 0.4, 0.5, 0.6, 0.8]
    params['colsample_bytree'] = [0.2, 0.4, 0.6, 0.8]
    params['min_child_weight'] = [1, 3, 5]
    #params['base_score'] = [1, 2, 4, 8]
    params['alpha'] = [1, 2, 5]
    params['gamma'] = [1, 3, 5, 10]

    # Draw 100 settings with a fixed seed (0) and persist them for later runs.
    parameter_list = list(model_selection.ParameterSampler(params, 100, 0))
    save_data('parameterList.pkl', parameter_list)
# Example no. 5
    def do_search(self,
                  param_distributions,
                  record,
                  budget=100,
                  n_runs=3,
                  n_folds=5,
                  verbose=False):
        """Randomly search *param_distributions*, caching scores in *record*.

        Samples ``budget`` configurations, evaluates each unseen one with
        ``n_runs`` repetitions of ``n_folds``-fold cross-validation, and
        stores the mean score in ``record``. Already-recorded configurations
        are skipped. Returns ``(record.bestConfig, record.bestScore)``.
        """
        total_steps = budget * n_runs * n_folds
        with tqdm.tqdm_notebook(total=total_steps) as progress:

            sampled = list(
                model_selection.ParameterSampler(param_distributions, budget,
                                                 record.randomSeed))
            for raw_config in sampled:

                config = self.__get_rounded_config(raw_config)
                score = record.getScore(config)

                # Already evaluated: just report and advance the bar.
                if (score is not None):
                    if verbose:
                        print(config, score)
                    progress.update(n_runs * n_folds)
                    continue

                run_scores = []
                for _ in range(0, n_runs):

                    folds = self.dataset.build_folds(n_folds)
                    gold = []
                    pred = []

                    for train_index, test_index in folds:

                        X_train = self.dataset.items[train_index]
                        X_test = self.dataset.items[test_index]

                        # 'c' marks a classification dataset (labels);
                        # otherwise regression (scores).
                        if self.dataset.type == 'c':
                            targets = self.dataset.labels
                        else:
                            targets = self.dataset.scores
                        y_train = targets[train_index]
                        y_test = targets[test_index]

                        gold = np.concatenate((gold, y_test))

                        alg = self.initialize_algorithm(config)
                        alg.fit(X_train, y_train)
                        pred = np.concatenate((pred, alg.predict(X_test)))

                        progress.update(1)

                    run_scores.append(
                        helpers.get_score(gold, pred, self.scorer,
                                          self.scorer_config))

                score = np.mean(run_scores)
                record.update(config, score)

                if verbose:
                    print(config, score)

        return record.bestConfig, record.bestScore