def retrain(self, model_bytes_arr, train_df, train_y_df, test_df, test_y_df):
    # Unpickle each partial model and blend them into a single booster
    models = []
    for model_bytes in model_bytes_arr:
        models.append(load_model_pickle(model_bytes)['model'])
    model = sum_models(models)

    # Score the combined model on both splits
    train_metrics = predict_and_score(model, train_df, train_y_df, 'Probability')
    test_metrics = predict_and_score(model, test_df, test_y_df, 'Probability')

    return {
        "train": train_metrics,
        "test": test_metrics,
        "model": save_model_pickle(model, list(train_df.columns)),
    }
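# Hypothetical sketch of the pickle helpers used above (not the project's actual
# implementations): a fitted CatBoost model pickles cleanly, so each worker can ship
# its booster as bytes and the driver can rebuild the objects and sum them.
import pickle

def save_model_pickle(model, feature_names):
    # Bundle the booster with the feature names it was trained on
    return pickle.dumps({'model': model, 'features': feature_names})

def load_model_pickle(model_bytes):
    # Returns the dict written by save_model_pickle, e.g. payload['model']
    return pickle.loads(model_bytes)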
def fit(self, df, evals=None, early_stopping_rounds=None, verbose_eval=None, plot=False, progress=None, **kwargs):
    '''Fit the CatBoostModel model given a DataFrame.
    This method accepts all keyword arguments of the catboost.train method.

    :param df: A vaex DataFrame containing the features and target on which to train the model.
    :param evals: A list of DataFrames to be evaluated during training.
        This allows the user to monitor performance on the validation sets.
    :param int early_stopping_rounds: Activates early stopping.
    :param bool verbose_eval: Requires at least one item in *evals*.
        If *verbose_eval* is True, the evaluation metric on the validation set is printed at each boosting stage.
    :param bool plot: If True, display an interactive widget in the Jupyter notebook
        showing how the train and validation sets score on each boosting iteration.
    :param progress: If True, display a progressbar when the training is done in batches.
    '''
    self.pool_params['feature_names'] = self.features

    if evals is not None:
        for i, item in enumerate(evals):
            data = item[self.features].values
            target_data = item[self.target].to_numpy()
            evals[i] = catboost.Pool(data=data, label=target_data, **self.pool_params)

    # This does the actual training/fitting of the catboost model
    if self.batch_size is None:
        data = df[self.features].values
        target_data = df[self.target].to_numpy()
        dtrain = catboost.Pool(data=data, label=target_data, **self.pool_params)
        model = catboost.train(params=self.params,
                               dtrain=dtrain,
                               num_boost_round=self.num_boost_round,
                               evals=evals,
                               early_stopping_rounds=early_stopping_rounds,
                               verbose_eval=verbose_eval,
                               plot=plot,
                               **kwargs)
        self.booster = model
        self.evals_result_ = [model.evals_result_]
        self.feature_importances_ = list(model.feature_importances_)
    else:
        models = []
        # Set up progressbar
        n_samples = len(df)
        progressbar = vaex.utils.progressbars(progress)

        column_names = self.features + [self.target]
        iterator = df[column_names].to_pandas_df(chunk_size=self.batch_size)
        for i1, i2, chunk in iterator:
            progressbar(i1 / n_samples)
            data = chunk[self.features].values
            target_data = chunk[self.target].values
            dtrain = catboost.Pool(data=data, label=target_data, **self.pool_params)
            model = catboost.train(params=self.params,
                                   dtrain=dtrain,
                                   num_boost_round=self.num_boost_round,
                                   evals=evals,
                                   early_stopping_rounds=early_stopping_rounds,
                                   verbose_eval=verbose_eval,
                                   plot=plot,
                                   **kwargs)
            self.evals_result_.append(model.evals_result_)
            models.append(model)
        progressbar(1.0)

        # Weights are key when summing models
        if self.batch_weights is None or len(self.batch_weights) == 0:
            batch_weights = [1 / len(models)] * len(models)
        elif len(self.batch_weights) != len(models):
            raise ValueError("'batch_weights' must be the same length as the number of models.")
        else:
            batch_weights = self.batch_weights

        # Sum the per-batch models into a single booster
        self.booster = catboost.sum_models(models,
                                           weights=batch_weights,
                                           ctr_merge_policy=self.ctr_merge_policy)
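# Illustrative sketch, separate from the class above: catboost.sum_models blends
# boosters trained on different chunks by taking a weighted sum of their raw
# scores, which is why the batch path defaults to uniform weights of 1/n_models.
# Data and parameters below are made up.
import numpy as np
import catboost

rng = np.random.default_rng(0)
X = rng.random((200, 3))
y = (X[:, 0] > 0.5).astype(int)

# Train one small model per "batch" of rows
m1 = catboost.CatBoostClassifier(iterations=20, verbose=False).fit(X[:100], y[:100])
m2 = catboost.CatBoostClassifier(iterations=20, verbose=False).fit(X[100:], y[100:])

# Equal weights average the two boosters' raw scores
ensemble = catboost.sum_models([m1, m2], weights=[0.5, 0.5])
print(ensemble.predict(X[:5], prediction_type='RawFormulaVal'))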
    # Enforce this as a monotonicity constraint to reduce overfitting & improve interpretability.
    # This seems to reduce accuracy, but I feel it's a necessary constraint unless
    # we can find an explanation for why a worse score could increase the likelihood
    # of an annotation.
    'monotone_constraints': {i: 0 if '_fdr' in f else 1 for i, f in enumerate(features)},
    'verbose': True,
    # 'task_type': 'GPU',
}

#%% Evaluate with cross-validation if desired
splits = get_cv_splits(metrics_df.ds_id.unique(), 5)
results = cv_train(train_metrics_df, splits, features, cb_params)
# Sum to make an ensemble model - sometimes it's interesting for debugging
ens_model = sum_models(results.model.to_list())

#%% Make final model from all data
final_params = {
    **cb_params,
    'iterations': 1000,
    'loss_function': 'PairLogit:max_pairs=10000',  # Reduce max_pairs if CatBoost complains
    'use_best_model': False,  # Must be disabled when eval set is None
    # CatBoost quantizes all inputs into bins, and border_count determines their granularity.
    # 254 is the default; higher gives slightly better accuracy but is slower to train
    'border_count': 1024,
}
final_model = train_catboost_model(metrics_df, metrics_df.ds_id.unique(), None, features, final_params)
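# Hypothetical sketch, not the project's train_catboost_model: 'PairLogit' is a
# pairwise ranking loss, so CatBoost typically needs a group_id (or explicit pairs)
# so that pairs are only formed within one dataset (ds_id here). The 'target'
# column name is an assumption for illustration.
import catboost

def train_pairlogit_model(df, features, params, target='target'):
    df = df.sort_values('ds_id')  # rows of a group must be contiguous for CatBoost
    pool = catboost.Pool(
        data=df[features],
        label=df[target],
        group_id=df['ds_id'],  # pairs are sampled within each dataset
    )
    model = catboost.CatBoost(params)
    model.fit(pool)
    return model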
def append_gbdt_model(self, new_gbdt_model, weights):
    if self.gbdt_model is None:
        return new_gbdt_model
    return sum_models([self.gbdt_model, new_gbdt_model], weights=weights)
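# Illustrative sketch of how such a method can be driven for incremental updates:
# each newly trained booster is blended into the running ensemble with a fixed
# decay. The _Learner class, data, and weights below are made up for the example.
import numpy as np
import catboost

class _Learner:  # minimal stand-in for the class owning append_gbdt_model
    gbdt_model = None

    def append_gbdt_model(self, new_gbdt_model, weights):
        if self.gbdt_model is None:
            return new_gbdt_model
        return catboost.sum_models([self.gbdt_model, new_gbdt_model], weights=weights)

rng = np.random.default_rng(0)
learner = _Learner()
for _ in range(3):  # three incoming data batches
    X = rng.random((100, 4))
    y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + rng.normal(0, 0.1, 100)
    new_model = catboost.CatBoostRegressor(iterations=30, verbose=False).fit(X, y)
    # Keep 70% of the weight on the accumulated ensemble, 30% on the new booster
    learner.gbdt_model = learner.append_gbdt_model(new_model, weights=[0.7, 0.3])

print(learner.gbdt_model.predict(rng.random((3, 4))))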