def generate_standard_deviation_data(self, model: RegressorMixin) -> RegressorMixin:
    """Compute per-horizon standard deviation data and attach it to ``model``.

    For every distinct value in ``self.validation_data.horizon`` the matching
    rows are selected, ``model.predict`` is called on columns ``1:-1`` and the
    result is passed, together with column ``0``, to
    ``self._calculate_standard_deviation``. The returned frames are labelled
    with their horizon and concatenated into ``self.standard_deviation``,
    which is also stored on ``model.standard_deviation``.

    NOTE(review): column 0 is presumably the realised target and the last
    column presumably the horizon label -- confirm against the code that
    builds ``validation_data``.

    Parameters
    ----------
    model : RegressorMixin
        Fitted model exposing ``predict``.

    Returns
    -------
    RegressorMixin
        The same ``model`` object, with ``standard_deviation`` set. If no
        horizon could be evaluated the attribute is an empty DataFrame.
    """
    self.standard_deviation = pd.DataFrame()
    per_horizon_frames = []

    # Loop over horizons and ask for a prediction for each specific horizon
    for horizon in self.validation_data.horizon.unique():
        # Make subset for this specific horizon
        sub_val = self.validation_data[self.validation_data.horizon == horizon]

        try:
            predicted = model.predict(sub_val.iloc[:, 1:-1])
        except Exception as e:
            print("Could not get prediction from new model!", e)
            # Skip this horizon: without a fresh prediction we would otherwise
            # reuse None or the previous horizon's predictions.
            continue

        # Calculate confidence interval for this horizon
        confidence_interval_horizon = self._calculate_standard_deviation(
            sub_val.iloc[:, 0], predicted
        )
        confidence_interval_horizon["horizon"] = horizon  # Label with respective horizon
        per_horizon_frames.append(confidence_interval_horizon)

    # Concatenate once instead of re-copying the growing frame every iteration;
    # pd.concat raises on an empty list, so keep the empty frame in that case.
    if per_horizon_frames:
        self.standard_deviation = pd.concat(per_horizon_frames)

    model.standard_deviation = self.standard_deviation
    return model
def _fit_and_predict_oof_model(
    self,
    estimator: RegressorMixin,
    X: ArrayLike,
    y: ArrayLike,
    train_index: ArrayLike,
    val_index: ArrayLike,
    sample_weight: Optional[ArrayLike] = None,
) -> Tuple[RegressorMixin, NDArray, ArrayLike]:
    """
    Train one out-of-fold estimator and predict on its held-out fold.

    Parameters
    ----------
    estimator : RegressorMixin
        Estimator to train.

    X : ArrayLike of shape (n_samples, n_features)
        Input data.

    y : ArrayLike of shape (n_samples,)
        Input labels.

    train_index : ArrayLike of shape (n_samples_train)
        Indices of the rows used for fitting.

    val_index : ArrayLike of shape (n_samples_val)
        Indices of the rows used for prediction.

    sample_weight : Optional[ArrayLike] of shape (n_samples,)
        Sample weights; only the entries selected by ``train_index`` are
        forwarded to the fit. If None, no weights are passed.
        By default ``None``.

    Returns
    -------
    Tuple[RegressorMixin, NDArray, ArrayLike]

    - [0]: RegressorMixin, the estimator returned by ``fit_estimator``
    - [1]: NDArray of shape (n_samples_val,), predictions on the
      validation fold (an empty array when the fold has no samples)
    - [2]: ArrayLike of shape (n_samples_val,), the validation indices,
      returned unchanged
    """
    fold_X_train = _safe_indexing(X, train_index)
    fold_y_train = _safe_indexing(y, train_index)
    fold_X_val = _safe_indexing(X, val_index)

    # Weights are appended as a fourth positional argument only when given,
    # so the unweighted call keeps its three-argument form.
    fit_args = [estimator, fold_X_train, fold_y_train]
    if sample_weight is not None:
        fit_args.append(_safe_indexing(sample_weight, train_index))
    estimator = fit_estimator(*fit_args)

    # An empty validation fold yields an empty prediction array instead of
    # calling predict on zero rows.
    has_val_rows = _num_samples(fold_X_val) > 0
    y_pred = estimator.predict(fold_X_val) if has_val_rows else np.array([])

    return estimator, y_pred, val_index
def produce_submission(
    model: RegressorMixin,
    output_path: str = '../data/submissions/nearest-neighbors.csv',
):
    """Predict on the test data and write an ``Id``/``SalePrice`` CSV.

    The test data comes from ``load_test_data()``; its index supplies the
    ``Id`` column and ``model.predict`` supplies ``SalePrice``, rounded to
    two decimals before writing.

    Parameters
    ----------
    model : RegressorMixin
        Fitted model exposing ``predict``.
    output_path : str
        Destination of the CSV file. Defaults to the path that was
        previously hard-coded.
    """
    td = load_test_data()
    out = model.predict(td)

    submission_data = pd.DataFrame([td.index, out]).T
    submission_data.columns = ['Id', 'SalePrice']
    submission_data = submission_data.astype({'Id': int, 'SalePrice': float})

    # Series.round returns a new Series; assign it back so the rounding
    # actually reaches the written file.
    submission_data['SalePrice'] = submission_data['SalePrice'].round(decimals=2)

    submission_data.to_csv(output_path, header=['Id', 'SalePrice'], index=False)
def bootstrap_regressor(
    f: RegressorMixin,
    X,  # numpy array
    y,  # numpy array
    num_samples: int = 100,
    random_state: int = random.randint(0, 2 ** 32 - 1),
) -> List[float]:
    """
    Take the regressor ``f``, and compute its bootstrapped mean squared error
    over the dataset ``X``, ``y``.

    Predictions are computed once on the full dataset; each of the
    ``num_samples`` trials then resamples the (prediction, truth) pairs and
    scores the resample with ``mean_squared_error``.

    Parameters
    ----------
    f : RegressorMixin
        Fitted regressor exposing ``predict``.
    X, y
        Features and true targets.
    num_samples : int
        Number of bootstrap resamples to score.
    random_state : int
        Base seed; trial ``i`` uses ``random_state + i`` (wrapped into the
        32-bit seed range). The default is drawn once, when the function is
        defined, so all calls relying on it within one process share a seed.

    Returns
    -------
    List[float]
        One mean squared error per bootstrap resample.
    """
    # Integer seeds above 2**32 - 1 are rejected by NumPy's RandomState, which
    # the resampler seeds from; wrap so a large base seed cannot overflow.
    seed_space = 2 ** 32

    dist: List[float] = []
    y_pred = f.predict(X)  # type:ignore
    # do the bootstrap:
    for trial in range(num_samples):
        sample_pred, sample_truth = resample(
            y_pred, y, random_state=(trial + random_state) % seed_space
        )  # type:ignore
        score = mean_squared_error(y_true=sample_truth, y_pred=sample_pred)  # type:ignore
        dist.append(score)
    return dist
def bootstrap_mae(
    f: RegressorMixin,
    X,  # numpy array
    y,  # numpy array
    num_samples: int = 100,
    random_state: int = random.randint(0, 2 ** 32 - 1),
) -> List[float]:
    """
    Take the regressor ``f``, and compute its bootstrapped mean absolute error
    over the dataset ``X``, ``y``.

    Predictions are computed once on the full dataset; each of the
    ``num_samples`` trials then resamples the (prediction, truth) pairs and
    scores the resample with ``mean_absolute_error``.

    Parameters
    ----------
    f : RegressorMixin
        Fitted regressor exposing ``predict``.
    X, y
        Features and true targets.
    num_samples : int
        Number of bootstrap resamples to score.
    random_state : int
        Base seed; trial ``i`` uses ``random_state + i`` (wrapped into the
        32-bit seed range). The default is drawn once, when the function is
        defined, so all calls relying on it within one process share a seed.

    Returns
    -------
    List[float]
        One mean absolute error per bootstrap resample.
    """
    # Integer seeds above 2**32 - 1 are rejected by NumPy's RandomState, which
    # the resampler seeds from; wrap so a large base seed cannot overflow.
    seed_space = 2 ** 32

    dist: List[float] = []
    y_pred = f.predict(X)  # type:ignore  # predict is not declared on RegressorMixin
    # do the bootstrap:
    for trial in range(num_samples):
        sample_pred, sample_truth = resample(
            y_pred, y, random_state=(trial + random_state) % seed_space
        )  # type:ignore
        score = mean_absolute_error(y_true=sample_truth, y_pred=sample_pred)  # type:ignore
        dist.append(score)
    return dist