Esempio n. 1
0
def with_params(pipeline, params):
    """Apply XGBoost-model parameters to ``pipeline`` and return it.

    Every key of ``params`` is prepended with the prefix returned (as
    element 0) by ``pipeline_utils.find_xgbmodel_param_prefix(pipeline)`` and
    forwarded to ``pipeline.set_params``.  Afterwards the parameters of the
    pipeline's final estimator are printed.

    Args:
        pipeline: object exposing ``set_params``; it is updated through that
            method and handed back to the caller.
        params: mapping of un-prefixed parameter name to the value to set.

    Returns:
        The same ``pipeline`` object that was passed in.
    """
    if params:
        # The prefix depends only on the pipeline, so look it up once
        # instead of once per parameter.
        prefix = pipeline_utils.find_xgbmodel_param_prefix(pipeline)[0]
        pipeline.set_params(
            **{prefix + name: params[name] for name in params})
    print(pipeline_utils.get_final_estimator(pipeline).get_params())
    return pipeline
Esempio n. 2
0
    def fit(self, X, y):
        """Fit each (union, estimator) pair with early stopping on a hold-out.

        One train/hold-out split of ``X.index`` is drawn with
        ``train_test_split`` (``test_size=self.hold_out_size``,
        ``random_state=self.rng``).  For every pair in a deep copy of
        ``self.union_estimator_tuples`` the union is fitted on the training
        rows, the transformed hold-out rows are passed to the estimator as
        ``eval_set``, and the ``best_iteration`` of the fitted (final)
        estimator is appended to ``self._best_its``.

        Args:
            X: feature table exposing ``.index`` and label-based ``.loc``
                (NOTE(review): presumably a pandas DataFrame -- confirm).
            y: target exposing ``.loc`` for the same labels as ``X.index``
                (NOTE(review): presumably a pandas Series -- confirm).

        Returns:
            self
        """
        train_labels, holdout_labels = train_test_split(
            X.index, test_size=self.hold_out_size, random_state=self.rng)

        self._best_its = []
        # Work on copies so the estimators stored on the instance stay unfitted.
        for union, estimator in deepcopy(self.union_estimator_tuples):
            xgb_estimator_path, _ = pipeline_utils.find_xgbmodel_param_prefix(
                estimator)

            # The split holds index *labels*, so select with `.loc`; the old
            # `.ix` indexer was removed from pandas.  Slicing inside the loop
            # hands every union its own fresh copy of the rows.
            X_train, X_test = X.loc[train_labels], X.loc[holdout_labels]
            y_train, y_test = y.loc[train_labels], y.loc[holdout_labels]
            X_train = union.fit_transform(X_train, y_train)
            X_test = union.transform(X_test)
            eval_set = [(X_test, y_test)]

            # Fit parameters are routed to the XGBoost model through the
            # prefix reported by pipeline_utils for this estimator.
            fit_params = {
                xgb_estimator_path + 'eval_set': eval_set,
                xgb_estimator_path + 'early_stopping_rounds':
                self.early_stopping_rounds,
                xgb_estimator_path + 'eval_metric': self.eval_metric,
                xgb_estimator_path + 'maximize_score': self.maximize_score,
                xgb_estimator_path + 'verbose': self.verbose_eval
            }

            estimator.fit(X_train, y_train, **fit_params)

            if isinstance(estimator, Pipeline):
                final_ = pipeline_utils.get_final_estimator(estimator)
            else:
                final_ = estimator
            self._best_its.append(final_.best_iteration)
        return self
                              hold_out_size=0.25,
                              maximize_score=True,
                              eval_metric=xgb_normalized_gini,
                              random_state=2,
                              early_stopping_rounds=80)
    early_stop.fit(dataset, target)
    best_iteration = early_stop.best_iteration_[0]

    from kaggle_tools.utils import pipeline_utils
    pipeline = overall_pipeline()
    pipeline.set_params(
        **{
            pipeline_utils.find_xgbmodel_param_prefix(pipeline)[0] + 'n_estimators':
            best_iteration
        })
    print(pipeline_utils.get_final_estimator(pipeline).get_params())

    cv = KFold(len(target), n_folds=4, random_state=2, shuffle=False)
    submission = SqrtHazardSubmission(pipeline, 'XGB_Direct_Mean', cv=cv)

    submission.fit(dataset,
                   target,
                   perform_cv=True,
                   scoring=scorer_normalized_gini,
                   n_jobs=2,
                   verbose=3)

    print('fitted. time:', time.time() - before)

    original_test_set = pd.read_csv(settings.TEST_FILE)
    test_set = pd.DataFrame(data=catconversion.transform(original_test_set),