Code Example #1
    def retrain(self, model_bytes_arr, train_df, train_y_df, test_df,
                test_y_df):
        # Deserialize each pickled payload back into a CatBoost model
        models = []
        for model_bytes in model_bytes_arr:
            models.append(load_model_pickle(model_bytes)['model'])

        # Merge the individual boosters into a single ensemble model
        model = sum_models(models)
        train_metrics = predict_and_score(model, train_df, train_y_df,
                                          'Probability')
        test_metrics = predict_and_score(model, test_df, test_y_df,
                                         'Probability')

        ret = {
            "train": train_metrics,
            "test": test_metrics,
            "model": save_model_pickle(model, list(train_df.columns))
        }

        return ret
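
The helpers load_model_pickle, predict_and_score, and save_model_pickle are project-specific and not shown here; the merging step itself is catboost.sum_models. Below is a minimal, self-contained sketch of that step on synthetic data (names and shapes are illustrative, not from the original project):

    import numpy as np
    from catboost import CatBoostClassifier, sum_models

    rng = np.random.default_rng(42)
    X1, X2 = rng.normal(size=(100, 5)), rng.normal(size=(100, 5))
    y1, y2 = rng.integers(0, 2, 100), rng.integers(0, 2, 100)

    # Two boosters fitted on separate batches of data
    model_a = CatBoostClassifier(iterations=20, verbose=False).fit(X1, y1)
    model_b = CatBoostClassifier(iterations=20, verbose=False).fit(X2, y2)

    # One model whose raw prediction is the sum of the parts
    # (with weights=None, each input model contributes with weight 1.0)
    merged = sum_models([model_a, model_b])
    probabilities = merged.predict(X1, prediction_type='Probability')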
Code Example #2
    def fit(self,
            df,
            evals=None,
            early_stopping_rounds=None,
            verbose_eval=None,
            plot=False,
            progress=None,
            **kwargs):
        '''Fit the CatBoostModel given a DataFrame.
        This method accepts all keyword arguments of the catboost.train method.

        :param df: A vaex DataFrame containing the features and target on which to train the model.
        :param evals: A list of DataFrames to be evaluated during training.
            This allows the user to monitor performance on the validation sets.
        :param int early_stopping_rounds: Activates early stopping.
        :param bool verbose_eval: Requires at least one item in *evals*.
            If *verbose_eval* is True, the evaluation metric on the validation set is printed at each boosting stage.
        :param bool plot: If True, display an interactive widget in the Jupyter
            notebook showing how the train and validation sets score on each boosting iteration.
        :param progress: If True, display a progress bar when the training is done in batches.
        '''
        self.pool_params['feature_names'] = self.features
        if evals is not None:
            # Convert each evaluation DataFrame into a catboost Pool
            for i, item in enumerate(evals):
                data = item[self.features].values
                target_data = item[self.target].to_numpy()
                evals[i] = catboost.Pool(data=data,
                                         label=target_data,
                                         **self.pool_params)

        # This does the actual training/fitting of the catboost model
        if self.batch_size is None:
            data = df[self.features].values
            target_data = df[self.target].to_numpy()
            dtrain = catboost.Pool(data=data,
                                   label=target_data,
                                   **self.pool_params)
            model = catboost.train(params=self.params,
                                   dtrain=dtrain,
                                   num_boost_round=self.num_boost_round,
                                   evals=evals,
                                   early_stopping_rounds=early_stopping_rounds,
                                   verbose_eval=verbose_eval,
                                   plot=plot,
                                   **kwargs)
            self.booster = model
            self.evals_result_ = [model.evals_result_]
            self.feature_importances_ = list(model.feature_importances_)
        else:
            models = []
            self.evals_result_ = []

            # Set up progressbar
            n_samples = len(df)
            progressbar = vaex.utils.progressbars(progress)

            column_names = self.features + [self.target]
            iterator = df[column_names].to_pandas_df(
                chunk_size=self.batch_size)
            for i1, i2, chunk in iterator:
                progressbar(i1 / n_samples)
                data = chunk[self.features].values
                target_data = chunk[self.target].values
                dtrain = catboost.Pool(data=data,
                                       label=target_data,
                                       **self.pool_params)
                model = catboost.train(
                    params=self.params,
                    dtrain=dtrain,
                    num_boost_round=self.num_boost_round,
                    evals=evals,
                    early_stopping_rounds=early_stopping_rounds,
                    verbose_eval=verbose_eval,
                    plot=plot,
                    **kwargs)
                self.evals_result_.append(model.evals_result_)
                models.append(model)
            progressbar(1.0)

            # Weights are key when summing models
            if self.batch_weights is None:
                batch_weights = [1 / len(models)] * len(models)
            elif len(self.batch_weights) != len(models):
                raise ValueError(
                    "'batch_weights' must be the same length as the number of models."
                )
            else:
                batch_weights = self.batch_weights

            # Sum the models
            self.booster = catboost.sum_models(
                models,
                weights=batch_weights,
                ctr_merge_policy=self.ctr_merge_policy)
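
This fit method appears to be vaex's CatBoost wrapper: with a non-None batch_size it trains one booster per chunk and then merges them with catboost.sum_models. A hedged usage sketch, assuming the class above is vaex.ml.catboost.CatBoostModel and that batch_size is a constructor argument (both inferred from the code, not verified):

    import numpy as np
    import vaex
    import vaex.ml.catboost

    rng = np.random.default_rng(1)
    df = vaex.from_arrays(x1=rng.normal(size=300),
                          x2=rng.normal(size=300),
                          y=rng.integers(0, 2, 300))

    booster = vaex.ml.catboost.CatBoostModel(
        features=['x1', 'x2'],
        target='y',
        num_boost_round=20,
        params={'loss_function': 'Logloss'},
        batch_size=100)  # 300 rows -> three chunk boosters, summed at the end
    booster.fit(df, progress=True)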
Code Example #3
    # Enforce this as a monotonicity constraint to reduce overfitting & improve interpretability.
    # This seems to reduce accuracy, but I feel it's a necessary constraint unless
    # we can find an explanation for why a worse score could increase the likelihood
    # of an annotation.
    'monotone_constraints': {
        i: 0 if '_fdr' in f else 1 for i, f in enumerate(features)
    },
    'verbose': True,
    # 'task_type': 'GPU',
}
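
#%% Illustrative cell (not part of the original snippet): what the
# monotone_constraints mapping above produces. Feature names are hypothetical;
# FDR-derived scores get 0 (unconstrained), everything else gets 1
# (the prediction must be non-decreasing in that feature).
example_features = ['chaos', 'spatial', 'spectral', 'msm_fdr']
print({i: 0 if '_fdr' in f else 1 for i, f in enumerate(example_features)})
# -> {0: 1, 1: 1, 2: 1, 3: 0}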

#%% Evaluate with cross-validation if desired
splits = get_cv_splits(metrics_df.ds_id.unique(), 5)
results = cv_train(train_metrics_df, splits, features, cb_params)
# Sum the fold models to make an ensemble - sometimes useful for debugging
ens_model = sum_models(results.model.to_list())
#%% Make final model from all data
final_params = {
    **cb_params,
    'iterations': 1000,
    'loss_function': 'PairLogit:max_pairs=10000',  # Reduce max_pairs if CatBoost complains
    'use_best_model': False,  # Must be disabled when eval set is None
    # CatBoost quantizes all inputs into bins, and border_count determines their granularity.
    # 254 is the default, higher gives slightly better accuracy but is slower to train
    'border_count': 1024,
}
final_model = train_catboost_model(metrics_df, metrics_df.ds_id.unique(), None,
                                   features, final_params)
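
The final_params above switch to CatBoost's pairwise ranking loss. PairLogit needs group information (a group_id, or explicit pairs) in the training Pool so CatBoost knows which rows may be compared, a detail presumably handled inside the train_catboost_model helper here. A minimal sketch of such a setup on synthetic data (grouping and shapes are hypothetical):

    import numpy as np
    from catboost import Pool, train

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 4))
    y = rng.random(200)
    group_id = np.repeat(np.arange(20), 10)  # 20 contiguous query groups

    pool = Pool(X, label=y, group_id=group_id)
    ranker = train(pool=pool, params={
        'loss_function': 'PairLogit:max_pairs=10000',
        'iterations': 100,
        'border_count': 254,  # default; 1024 trades training speed for accuracy
        'verbose': False,
    })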

Code Example #4
    def append_gbdt_model(self, new_gbdt_model, weights):
        # First batch: nothing to merge yet, so just adopt the new model
        if self.gbdt_model is None:
            return new_gbdt_model
        # Otherwise fold the new booster into the running ensemble
        return sum_models([self.gbdt_model, new_gbdt_model], weights=weights)
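
A hedged usage sketch of this incremental pattern, assuming self.gbdt_model holds the running ensemble and the surrounding class trains one booster per batch (the weighting split below is hypothetical):

    # Keep 80% of the existing ensemble's weight and give 20% to the
    # booster trained on the newest batch of data.
    self.gbdt_model = self.append_gbdt_model(new_model, weights=[0.8, 0.2])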