def generate_standard_deviation_data(
        self, model: RegressorMixin) -> RegressorMixin:
    """Compute per-horizon standard-deviation data and attach it to *model*.

    For every distinct value of ``self.validation_data.horizon`` the matching
    rows are selected, the model predicts on columns ``1:-1`` of that subset
    and ``self._calculate_standard_deviation`` is called with the first
    column of the subset and the prediction. Each result is labelled with a
    ``"horizon"`` column and all results are concatenated.

    A horizon whose prediction raises is reported on stdout and skipped, so
    that a stale prediction from a previous horizon (or ``None``) is never
    passed on to the standard-deviation computation.

    :param model: object exposing ``predict``; it receives a
        ``standard_deviation`` attribute.
    :return: the same *model* object.
    """
    self.standard_deviation = pd.DataFrame()
    per_horizon = []

    # Loop over horizons and ask for a prediction for each specific horizon
    for horizon in self.validation_data.horizon.unique():
        # Subset of the validation data for this specific horizon
        sub_val = self.validation_data[self.validation_data.horizon == horizon]
        try:
            predicted = model.predict(sub_val.iloc[:, 1:-1])
        except Exception as e:
            print("Could not get prediction from new model!", e)
            # Nothing valid to evaluate for this horizon
            continue

        confidence_interval_horizon = self._calculate_standard_deviation(
            sub_val.iloc[:, 0], predicted)
        # Label with respective horizon
        confidence_interval_horizon["horizon"] = horizon
        per_horizon.append(confidence_interval_horizon)

    # Concatenate once instead of growing the frame inside the loop
    if per_horizon:
        self.standard_deviation = pd.concat(per_horizon)

    model.standard_deviation = self.standard_deviation
    return model
def __init__(self, binner=None, estimator=None, n_jobs=None, verbose=False):
    """
    @param      binner      transformer or predictor which creates the buckets
    @param      estimator   predictor trained on every bucket
    @param      n_jobs      number of parallel jobs (for training and predicting)
    @param      verbose     boolean or use ``'tqdm'`` to use :epkg:`tqdm`
                            to fit the estimators

    Accepted values for *binner*:

    - ``'tree'`` or ``None``: replaced here by a
      :epkg:`sklearn:tree:DecisionTreeRegressor` with ``min_samples_leaf=2``
    - ``'bins'``: the model :epkg:`sklearn:preprocessing:KBinsDiscretizer`
    - any instantiated model

    Accepted values for *estimator*:

    - ``None``: replaced here by a
      :epkg:`sklearn:linear_model:LinearRegression`
    - any instantiated model
    """
    # Resolve the defaults before delegating to the parent constructors.
    estimator = LinearRegression() if estimator is None else estimator
    if binner in (None, 'tree'):
        binner = DecisionTreeRegressor(min_samples_leaf=2)

    RegressorMixin.__init__(self)
    PiecewiseEstimator.__init__(
        self,
        binner=binner,
        estimator=estimator,
        n_jobs=n_jobs,
        verbose=verbose,
    )
def __init__(self, num_inputs, mxseed=0, epochs=5000, net_type=1):
    """Initialise both base classes and store the constructor arguments.

    ``self.net`` starts as ``None``; the four arguments are stored unchanged
    under attributes of the same name.
    """
    for base in (BaseEstimator, RegressorMixin):
        base.__init__(self)

    self.net = None
    self.num_inputs, self.mxseed = num_inputs, mxseed
    self.epochs, self.net_type = epochs, net_type
def valscore(self, Xn, yn, scoring):
    """Score the model on ``(Xn, yn)`` through ``RegressorMixin.score``.

    :param Xn: input data forwarded to ``RegressorMixin.score``.
    :param yn: target values forwarded to ``RegressorMixin.score``.
    :param scoring: ``'weighted'`` to pass ``self.caseweights_`` as
        ``sample_weight``, ``'normal'`` for an unweighted score.
    :return: the value returned by ``RegressorMixin.score``.
    :raises ValueError: if *scoring* is neither ``'weighted'`` nor
        ``'normal'``.
    """
    if scoring == 'weighted':
        return RegressorMixin.score(self, Xn, yn,
                                    sample_weight=self.caseweights_)
    if scoring == 'normal':
        return RegressorMixin.score(self, Xn, yn)
    # The exception used to be built but never raised, so an invalid flag
    # silently returned None.
    raise ValueError('Scoring flag must be set to "weighted" or "normal".')
def valscore(self,Xn,yn,scoring):
    """Score the model on new data after checking its column count.

    *Xn* is first passed through ``_predict_check_input``; the third element
    of its result replaces *Xn*. The number of columns must match
    ``self.X.shape[1]``.

    :param scoring: ``'weighted'`` (uses ``self.caseweights_`` as
        ``sample_weight``) or ``'normal'``.
    :return: the value returned by ``RegressorMixin.score``.
    :raises ValueError: on a column-count mismatch or an unknown flag.
    """
    _, _, Xn = _predict_check_input(Xn)
    _, n_columns = Xn.shape
    if n_columns != self.X.shape[1]:
        raise ValueError('New data must have seame number of columns as the ones the model has been trained with')

    if scoring == 'weighted':
        weights = self.caseweights_
        return RegressorMixin.score(self, Xn, yn, sample_weight=weights)
    if scoring == 'normal':
        return RegressorMixin.score(self, Xn, yn)
    raise ValueError('Scoring flag must be set to "weighted" or "normal".')
def __init__(self, model='SIR', t=0, max_iter=100,
             learning_rate_init=0.1, lr_schedule='constant',
             momentum=0.9, power_t=0.5, early_th=None,
             min_threshold='auto', max_threshold='auto',
             verbose=False, init=None):
    """Store the hyper-parameters and resolve the ``'auto'`` thresholds.

    *init* may be ``None``, a dictionary, or another ``EpidemicRegressor``;
    in the last case a copy of its ``coef_`` is used (or ``None`` if it has
    no ``coef_``). When the resulting *init* is not ``None`` it is also
    stored as ``self.coef_``.

    When *min_threshold* / *max_threshold* is the string ``'auto'`` it is
    replaced by a value depending on ``model.upper()``: a scalar for
    ``'SIR'`` and ``'SIRD'``, an array built from ``CovidSIRDc.P0`` for
    ``'SIRC'`` and ``'SIRDC'``. For any other model name the value
    ``'auto'`` is left untouched.

    :raises TypeError: if *init* is neither ``None``, a dictionary nor an
        ``EpidemicRegressor``.
    """
    if init is not None:
        if isinstance(init, EpidemicRegressor):
            if hasattr(init, 'coef_'):
                init = init.coef_.copy()
            else:
                init = None  # pragma: no cover
        elif not isinstance(init, dict):
            raise TypeError(
                f"init must be a dictionary not {type(init)}.")

    BaseEstimator.__init__(self)
    RegressorMixin.__init__(self)
    self.t = t
    self.model = model
    self.max_iter = max_iter
    self.learning_rate_init = learning_rate_init
    self.lr_schedule = lr_schedule
    self.momentum = momentum
    self.power_t = power_t
    self.early_th = early_th
    self.verbose = verbose

    model_name = model.upper()

    # The isinstance guard keeps the comparison well defined when a
    # threshold is supplied as an array (``array == 'auto'`` is not a plain
    # boolean).
    if isinstance(min_threshold, str) and min_threshold == 'auto':
        if model_name in ('SIR', 'SIRD'):
            min_threshold = 0.0001
        elif model_name in ('SIRC', ):
            pmin = dict(beta=0.001, nu=0.0001, mu=0.0001,
                        a=-1., b=0., c=0.)
            min_threshold = numpy.array(
                [pmin[k[0]] for k in CovidSIRDc.P0])
        # A one-element tuple: the former ``('SIRDC')`` was a plain string,
        # which turned ``in`` into a substring test.
        elif model_name in ('SIRDC', ):
            pmin = dict(beta=0.001, nu=0.001, mu=0.001,
                        a=-1., b=0., c=0.)
            min_threshold = numpy.array(
                [pmin[k[0]] for k in CovidSIRDc.P0])

    if isinstance(max_threshold, str) and max_threshold == 'auto':
        if model_name in ('SIR', 'SIRD'):
            max_threshold = 1.
        elif model_name in ('SIRC', 'SIRDC'):
            pmax = dict(beta=1., nu=0.5, mu=0.5,
                        a=0., b=4., c=2.)
            max_threshold = numpy.array(
                [pmax[k[0]] for k in CovidSIRDc.P0])

    self.min_threshold = min_threshold
    self.max_threshold = max_threshold
    # Called after the thresholds are set and before ``init`` is stored.
    self._get_model()
    self.init = init
    if init is not None:
        self.coef_ = init
def __init__(self, force_positive=False, **kwargs):
    """
    Every entry of *kwargs* is stored as an attribute of the instance; they
    are expected to be parameters for :epkg:`sklearn:decomposition:NMF`.
    The parameter *force_positive* removes all negative predictions and
    replaces them by zero.
    """
    for base in (BaseEstimator, RegressorMixin, MultiOutputMixin):
        base.__init__(self)

    for name in kwargs:
        setattr(self, name, kwargs[name])
    self.force_positive = force_positive
def valscore(self,Xn,yn,scoring):
    """Score the model on new data, accepting pandas or array inputs.

    A DataFrame *Xn* is converted with ``to_numpy()``; a DataFrame or Series
    *yn* is converted with ``to_numpy().T.astype('float64')``. Subclasses of
    these pandas types are converted as well. The number of columns of *Xn*
    must match ``self.X.shape[1]``.

    :param scoring: ``'weighted'`` (uses ``self.caseweights_`` as
        ``sample_weight``) or ``'normal'``.
    :return: the value returned by ``RegressorMixin.score``.
    :raises ValueError: on a column-count mismatch or an unknown flag.
    """
    # isinstance instead of an exact type comparison, so that subclasses of
    # the pandas containers are converted too.
    if isinstance(Xn, ps.core.frame.DataFrame):
        Xn = Xn.to_numpy()
    if isinstance(yn, (ps.core.frame.DataFrame, ps.core.series.Series)):
        yn = yn.to_numpy().T.astype('float64')

    (n, p) = Xn.shape
    if p != self.X.shape[1]:
        raise ValueError('New data must have seame number of columns as the ones the model has been trained with')

    if scoring == 'weighted':
        return RegressorMixin.score(self, Xn, yn,
                                    sample_weight=self.caseweights_)
    if scoring == 'normal':
        return RegressorMixin.score(self, Xn, yn)
    raise ValueError('Scoring flag must be set to "weighted" or "normal".')
def _fit_and_predict_oof_model(
    self,
    estimator: RegressorMixin,
    X: ArrayLike,
    y: ArrayLike,
    train_index: ArrayLike,
    val_index: ArrayLike,
    sample_weight: Optional[ArrayLike] = None,
) -> Tuple[RegressorMixin, NDArray, ArrayLike]:
    """
    Train one out-of-fold model on the training indices and predict on the
    validation indices.

    Parameters
    ----------
    estimator : RegressorMixin
        Estimator to train.
    X : ArrayLike of shape (n_samples, n_features)
        Input data.
    y : ArrayLike of shape (n_samples,)
        Input labels.
    train_index : ArrayLike of shape (n_samples_train)
        Training data indices.
    val_index : ArrayLike of shape (n_samples_val)
        Validation data indices.
    sample_weight : Optional[ArrayLike] of shape (n_samples,)
        Sample weights. When ``None`` the estimator is fitted without
        weights. By default ``None``.

    Returns
    -------
    Tuple[RegressorMixin, NDArray, ArrayLike]

    - [0]: RegressorMixin, fitted estimator
    - [1]: NDArray of shape (n_samples_val,), estimator predictions on the
      validation fold (an empty array when the fold has no sample).
    - [2]: ArrayLike of shape (n_samples_val,), validation data indices.
    """
    X_train = _safe_indexing(X, train_index)
    y_train = _safe_indexing(y, train_index)
    X_val = _safe_indexing(X, val_index)

    # Weights are forwarded positionally, and only when they were provided.
    fit_args = [X_train, y_train]
    if sample_weight is not None:
        fit_args.append(_safe_indexing(sample_weight, train_index))
    estimator = fit_estimator(estimator, *fit_args)

    has_validation_rows = _num_samples(X_val) > 0
    y_pred = estimator.predict(X_val) if has_validation_rows else np.array([])
    return estimator, y_pred, val_index
def __init__(self, rf_estimator=None, lasso_estimator=None):
    """
    @param  rf_estimator    random forest estimator, a default-constructed
                            ``RandomForestRegressor`` when ``None``
    @param  lasso_estimator Lasso estimator, a default-constructed ``Lasso``
                            when ``None``
    """
    for base in (BaseEstimator, RegressorMixin):
        base.__init__(self)

    self.rf_estimator = (
        RandomForestRegressor() if rf_estimator is None else rf_estimator)
    self.lasso_estimator = (
        Lasso() if lasso_estimator is None else lasso_estimator)
def produce_submission(
        model: RegressorMixin,
        output_path: str = '../data/submissions/nearest-neighbors.csv'):
    """Predict on the test data and write an ``Id``/``SalePrice`` CSV file.

    The test data comes from ``load_test_data()``; its index provides the
    ``Id`` column and ``model.predict`` provides ``SalePrice``, which is
    rounded to two decimals before being written.

    :param model: fitted object exposing ``predict``.
    :param output_path: destination of the CSV file; defaults to the
        previously hard-coded location.
    """
    td = load_test_data()
    out = model.predict(td)

    submission_data = pd.DataFrame([td.index, out]).T
    submission_data.columns = ['Id', 'SalePrice']
    submission_data = submission_data.astype({'Id': int, 'SalePrice': float})
    # ``Series.round`` returns a new Series; the result used to be discarded,
    # so the prices were written unrounded.
    submission_data['SalePrice'] = (
        submission_data['SalePrice'].round(decimals=2))

    submission_data.to_csv(output_path,
                           header=['Id', 'SalePrice'],
                           index=False)
def _train(train_data: DataFrame, regressor: RegressorMixin,
           clusterer: Clustering, do_cv=False) -> dict:
    """Fit one regressor per non-empty cluster of *train_data*.

    The data is split with ``clusterer.cluster_data``. For each non-empty
    cluster the ``'label'`` column is the target and the remaining columns
    are the features. After a cluster has been handled, *regressor* is
    replaced by a clone so that every cluster gets its own model object.

    :param train_data: training data containing a ``'label'`` column.
    :param regressor: estimator fitted on the first non-empty cluster;
        clones of it are used for the following ones.
    :param clusterer: object exposing ``cluster_data`` and ``n_clusters``.
    :param do_cv: when true, run a 10-fold ``cross_validate`` per cluster
        and keep the estimator with the highest test score instead of a
        single ``fit``.
    :return: dictionary holding the clusterer and the per-cluster models.
    """
    models = dict()
    train_data = clusterer.cluster_data(train_data)

    for cluster in range(clusterer.n_clusters):
        cluster_train_df = train_data[cluster]
        if cluster_train_df.empty:
            continue

        # ``columns=`` replaces the positional ``axis`` argument
        # (``drop('label', 1)``), which pandas 2.0 no longer accepts.
        features = cluster_train_df.drop(columns='label')
        targets = cluster_train_df['label'].values.ravel()

        if do_cv:
            cross_validation_result = cross_validate(
                regressor, features, targets,
                return_estimator=True,
                cv=10  # TODO (for Chiara): check whether you want 10-fold CV
            )
            validation_scores = cross_validation_result['test_score']
            regressors = cross_validation_result['estimator']
            # TODO (for Chiara): check whether you want the max, the min or
            # something in between.
            # Highest score wins; on ties the last fold is kept, as before.
            best_index = max(
                range(len(validation_scores)),
                key=lambda i: (validation_scores[i], i))
            regressor = regressors[best_index]
        else:
            regressor.fit(features, targets)

        models[cluster] = regressor
        try:
            regressor = clone(regressor)
        except TypeError:
            regressor = clone(regressor, safe=False)

    return {
        ModelType.CLUSTERER.value: clusterer,
        ModelType.REGRESSOR.value: models
    }
def __init__(self, estimator=None, n_estimators=10, n_jobs=None,
             alpha=1., verbose=False):
    """
    @param      estimator       predictor trained on every bucket, mandatory
    @param      n_estimators    number of estimators to train
    @param      n_jobs          number of parallel jobs (for training and
                                predicting)
    @param      alpha           proportion of samples resampled for each
                                training
    @param      verbose         boolean or use ``'tqdm'`` to use :epkg:`tqdm`
                                to fit the estimators

    Raises ``ValueError`` when *estimator* is ``None``.
    """
    for base in (BaseEstimator, RegressorMixin):
        base.__init__(self)

    if estimator is None:
        raise ValueError("estimator cannot be null.")

    self.estimator, self.n_estimators = estimator, n_estimators
    self.n_jobs, self.alpha, self.verbose = n_jobs, alpha, verbose
def _train(train_data: DataFrame, regressor: RegressorMixin,
           clusterer: Clustering) -> dict:
    """Fit one regressor per non-empty cluster of *train_data*.

    The data is split with ``clusterer.cluster_data``. For each non-empty
    cluster the regressor is fitted on every column except ``'label'``,
    with ``'label'`` as the target. After each fit, *regressor* is replaced
    by a clone so that every cluster gets its own model object.

    :param train_data: training data containing a ``'label'`` column.
    :param regressor: estimator fitted on the first non-empty cluster;
        clones of it are used for the following ones.
    :param clusterer: object exposing ``cluster_data`` and ``n_clusters``.
    :return: dictionary holding the clusterer and the per-cluster models.
    """
    models = dict()
    train_data = clusterer.cluster_data(train_data)

    for cluster in range(clusterer.n_clusters):
        cluster_train_df = train_data[cluster]
        if cluster_train_df.empty:
            continue

        # ``columns=`` replaces the positional ``axis`` argument
        # (``drop('label', 1)``), which pandas 2.0 no longer accepts.
        features = cluster_train_df.drop(columns='label')
        targets = cluster_train_df['label'].values.ravel()
        regressor.fit(features, targets)

        models[cluster] = regressor
        try:
            regressor = clone(regressor)
        except TypeError:
            regressor = clone(regressor, safe=False)

    return {'clusterer': clusterer, PredictiveModels.REGRESSION.value: models}
def _cv_estimate(model: RegressorMixin,
                 train_data: pd.DataFrame,
                 features: List[str],
                 y: str,
                 n_splits: int) -> Tuple[pd.Series, List[RegressorMixin]]:
    """Produce out-of-fold predictions and the model fitted on each fold.

    :param model: estimator used as a template; a fresh copy is fitted for
        every fold.
    :param train_data: frame containing the *features* columns and the
        target column *y*.
    :param features: names of the feature columns.
    :param y: name of the target column.
    :param n_splits: number of ``KFold`` splits.
    :return: a Series indexed like *train_data* holding the prediction made
        for each row by the model that did not see it, and the list of
        per-fold fitted models.

    NOTE(review): *model* itself is no longer fitted in place; callers that
    relied on that side effect should use one of the returned models.
    """
    import copy

    cv = KFold(n_splits=n_splits)
    models = []
    cv_pred = pd.Series(np.nan, index=train_data.index)

    for train, test in cv.split(train_data):
        # Fit a copy: fitting *model* directly made every entry of
        # ``models`` the same object, refitted on the last fold.
        fold_model = copy.deepcopy(model)
        m = fold_model.fit(train_data[features].iloc[train],
                           train_data[y].iloc[train])
        cv_pred.iloc[test] = m.predict(train_data[features].iloc[test])
        models.append(m)

    return cv_pred, models
def bootstrap_mae(
    f: RegressorMixin,
    X,  # numpy array
    y,  # numpy array
    num_samples: int = 100,
    random_state: int = random.randint(0, 2 ** 32 - 1),
) -> List[float]:
    """
    Take the regressor ``f`` and compute its bootstrapped mean absolute
    error over the dataset ``X``, ``y``.

    The predictions are computed once; ``num_samples`` resamples of the
    (prediction, truth) pairs are then drawn, the ``trial``-th one being
    seeded from ``random_state + trial``.

    Note that the default ``random_state`` is drawn once, when the function
    is defined, and is therefore shared by all calls that omit it.

    Returns the list of ``num_samples`` MAE values.
    """
    seed_space = 2 ** 32  # seeds are kept within [0, 2**32 - 1]
    dist: List[float] = []
    y_pred = f.predict(X)  # type:ignore

    # do the bootstrap:
    for trial in range(num_samples):
        # Wrap around so that a large random_state plus the trial number
        # cannot leave the 32-bit seed range.
        seed = (random_state + trial) % seed_space
        sample_pred, sample_truth = resample(
            y_pred, y, random_state=seed
        )  # type:ignore
        score = mean_absolute_error(y_true=sample_truth, y_pred=sample_pred)  # type:ignore
        dist.append(score)
    return dist
def bootstrap_regressor(
    f: RegressorMixin,
    X,  # numpy array
    y,  # numpy array
    num_samples: int = 100,
    random_state: int = random.randint(0, 2 ** 32 - 1),
) -> List[float]:
    """
    Take the regressor ``f`` and compute its bootstrapped mean squared error
    over the dataset ``X``, ``y``.

    The predictions are computed once; ``num_samples`` resamples of the
    (prediction, truth) pairs are then drawn, the ``trial``-th one being
    seeded from ``random_state + trial``.

    Note that the default ``random_state`` is drawn once, when the function
    is defined, and is therefore shared by all calls that omit it.

    Returns the list of ``num_samples`` MSE values.
    """
    seed_space = 2 ** 32  # seeds are kept within [0, 2**32 - 1]
    dist: List[float] = []
    y_pred = f.predict(X)  # type:ignore

    # do the bootstrap:
    for trial in range(num_samples):
        # Wrap around so that a large random_state plus the trial number
        # cannot leave the 32-bit seed range.
        seed = (random_state + trial) % seed_space
        sample_pred, sample_truth = resample(
            y_pred, y, random_state=seed
        )  # type:ignore
        score = mean_squared_error(y_true=sample_truth, y_pred=sample_pred)  # type:ignore
        dist.append(score)
    return dist
def __init__(self):
    """Run the initialisers of both base classes, mixin first."""
    for base in (RegressorMixin, BaseEstimator):
        base.__init__(self)
def _reg():
    """Return a new, bare ``RegressorMixin`` instance."""
    instance = RegressorMixin()
    return instance
def score(self, X, y, sample_weight=None):
    """Score the model on ``(X, y)``.

    When *X* holds exactly one row, the result is the mean squared error of
    the prediction multiplied by the first dimension of the prediction
    (*sample_weight* is not used in that case). Otherwise the call is
    delegated to ``RegressorMixin.score``.
    """
    if len(X) != 1:
        return RegressorMixin.score(self, X, y, sample_weight=sample_weight)

    prediction = self.predict(X)
    n_rows = prediction.shape[0]
    return n_rows * mean_squared_error(y, prediction)
def score(
    self,
    X: np.ndarray,
    y: np.ndarray,
    sample_weight: Optional[np.ndarray] = None,
) -> float:
    """Delegate to ``RegressorMixin.score`` and return its result.

    *sample_weight* is forwarded positionally as the third argument.
    """
    result = RegressorMixin.score(self, X, y, sample_weight)
    return result
def score(self, X, y, sample_weight=None):
    """Delegate to ``RegressorMixin.score``, forwarding *sample_weight*."""
    result = RegressorMixin.score(
        self,
        X,
        y,
        sample_weight=sample_weight,
    )
    return result
def __init__(self, base_estimator):
    """Initialise both base classes and keep *base_estimator* as given."""
    for base in (RegressorMixin, BaseEstimator):
        base.__init__(self)
    self.base_estimator = base_estimator