def fit(self):
    """Fit one model (and preprocessor) per cross-validation fold.

    Seeds NumPy's global random state with ``self.random_state`` and then
    fits every fold produced by ``self.yield_cv()``. When ``self.n_jobs_``
    is not 1 the folds are dispatched to a ``multiprocessing.Pool`` of that
    size; otherwise they are fitted one after another in this process.
    Each result is stored in ``self.models`` and ``self.preprocessors``
    under its fold index. ``self.fitted`` is False while fitting and is set
    to True only after every fold has been stored.

    Returns:
        self, so calls can be chained.
    """
    t0 = u.t()
    self.fitted = False
    np.random.seed(self.random_state)
    if self.n_jobs_ != 1:
        pool = multiprocessing.Pool(self.n_jobs_)
        try:
            # NOTE(review): `_fit` is presumably a module-level (picklable)
            # counterpart of KFoldPredBase.fit_static -- confirm.
            fold_results = pool.map(_fit, list(self.yield_cv()))
        except BaseException:
            # Do not leave workers fitting in the background after a
            # failure or an interrupt; stop them right away.
            pool.terminate()
            raise
        else:
            pool.close()
        finally:
            # close()/terminate() only signal the workers; join() waits for
            # them to exit so no worker process outlives this call.
            pool.join()
        for fold, model, preprocessor in fold_results:
            self.models[fold] = model
            self.preprocessors[fold] = preprocessor
    else:
        # Serial path: store each fold as soon as it is fitted.
        for args in self.yield_cv():
            fold, model, preprocessor = KFoldPredBase.fit_static(args)
            self.models[fold] = model
            self.preprocessors[fold] = preprocessor
    self.fitted = True
    if self.verbose:
        logd("Fit ALL CVs in %2.2f seconds" % u.td(t0))
    return self
def fit(self, external_cols=None):
    """Fit one predictor per model, gather their CV predictions, then fit the ensemble.

    For every entry of ``self.models`` a predictor is built with
    ``self.base_predictor``, fitted, and asked for its predictions. The
    predictor's ``cv_col`` column is renamed to the model's name and
    collected in ``self.predictions_cv``: the first predictor contributes
    its whole prediction frame, later ones contribute only their renamed
    column, merged on the index. Each predictor is then fitted via
    ``fit_test()`` and stored in ``self.predictors`` under the model name.
    Finally ``self.fit_ensemble`` is called.

    Args:
        external_cols: Passed through unchanged to ``fit_ensemble``.
    """
    self.predictors = {}
    self.predictions_cv = pd.DataFrame()

    for position, estimator in enumerate(self.models):
        label = stringify2(estimator, position)
        if self.verbose:
            logd(label)
        started = u.t()

        predictor = self.base_predictor(
            self.X,
            self.y,
            estimator,
            scoring=self.scoring,
            n_folds=self.n_folds,
            random_state=self.random_state,
            shuffle=self.shuffle,
            n_jobs=self.predictors_n_jobs,
            preprocessor=self.preprocessor,
            verbose=self.verbose,
        )
        predictor.fit()
        fold_predictions = predictor.predict()

        # The prediction column is published under the model's name.
        renaming = {predictor.cv_col: label}
        if len(self.predictions_cv):
            # Later predictors only add their own prediction column.
            own_column = fold_predictions[[predictor.cv_col]].rename(columns=renaming)
            # TODO assert index is not duplicate
            self.predictions_cv = self.predictions_cv.merge(
                own_column, left_index=True, right_index=True
            )
        else:
            # Nothing collected yet: keep every column of this frame.
            self.predictions_cv = fold_predictions.rename(columns=renaming)

        predictor.fit_test()
        self.predictors[label] = predictor
        if self.verbose:
            logd("Fit %s in %2.2f seconds" % (label, u.td(started)))

    self.fit_ensemble(external_cols=external_cols)
def fit_test(self):
    """Fit ``self.model`` on ``self.X`` / ``self.y``.

    When ``self.preprocessor`` is truthy, ``self.X`` is first passed through
    its ``fit_transform``; any falsy preprocessor means the data is used as
    is. ``self.fitted_test`` is False while fitting and becomes True once
    the model has been fitted. With ``self.verbose`` set, the elapsed time
    and the shape of the fitted data are logged.
    """
    started = u.t()
    self.fitted_test = False

    if self.preprocessor:
        features = self.preprocessor.fit_transform(self.X)
    else:
        features = self.X

    self.model.fit(features, self.y)
    self.fitted_test = True

    if not self.verbose:
        return
    logd("Fit Test in %2.2f seconds | %s" % (u.td(started), features.shape))
def fit_ensemble(self, external_cols=None):
    """Fit the ensemble model on the collected CV predictions.

    The feature matrix is the ``self.cols`` columns of
    ``self.predictions_cv``, optionally extended with ``external_cols``
    (each added as ``ADD_<name>``). Features are standardised with a fresh
    ``StandardScaler`` kept in ``self.ensemble_scaler``. The target is the
    ``self.true_col`` column. If ``self.ensemble_grid_params`` is set, the
    model is chosen through ``u.get_best_model``; otherwise
    ``self.ensemble_model`` is fitted directly. The ensemble's predictions
    on the same features are written to an ``ENS`` column, and
    ``self.predictions_cv`` is reduced to ``self.cols + ["ENS", true_col]``.

    Args:
        external_cols: Optional extra features; anything that is not
            already a ``pandas.DataFrame`` is wrapped in one.
    """
    t0 = u.t()
    if self.predictions_cv is not None:
        # Work on an explicit copy: adding the ADD_* columns to a bare
        # column subset is chained assignment (SettingWithCopyWarning on
        # pandas without copy-on-write).
        _x = self.predictions_cv[self.cols].copy()
    else:
        # NOTE(review): with predictions_cv None the target lookup below
        # still fails, so this fallback does not make the call succeed.
        _x = pd.DataFrame()
    if external_cols is not None:
        if not isinstance(external_cols, pd.DataFrame):
            external_cols = pd.DataFrame(external_cols)
        # NOTE(review): column assignment aligns on the index; presumably
        # callers pass data indexed like predictions_cv -- confirm.
        for col in external_cols.columns:
            _x["ADD_%s" % col] = external_cols[col]
    _y = self.predictions_cv[self.true_col]
    self.ensemble_scaler = StandardScaler()
    x = self.ensemble_scaler.fit_transform(_x)
    if self.ensemble_grid_params:
        scorer = make_scorer(self.scoring, greater_is_better=self.score_greater_is_better)
        # Only the first element of the returned pair is kept.
        self.ensemble_model, _ = u.get_best_model(
            self.ensemble_model, self.ensemble_grid_params, x, _y,
            scoring=scorer, cv=self.n_folds, refit=True)
    else:
        self.ensemble_model.fit(x, _y)
    if self.verbose:
        logd("Fit Ensemble in %2.2f seconds" % u.td(t0))
    self.predictions_cv["ENS"] = self.ensemble_model.predict(x)
    # The ADD_* features lived only in _x, so they are not part of the result.
    self.predictions_cv = self.predictions_cv[self.cols + ["ENS", self.true_col]]