Example #1
import importlib
import inspect

from river import metrics


def load_metrics():
    """Yield an instance of every metric in `river.metrics`."""

    for name, obj in inspect.getmembers(
            importlib.import_module('river.metrics'), inspect.isclass):

        # Skip the abstract base class
        if name == 'Metric':
            continue

        # Rolling wraps another metric and needs a window size
        if issubclass(obj, metrics.Rolling):
            yield obj(metric=metrics.MSE(), window_size=42)
            continue

        # Multi-output wrappers need an inner metric too
        if name == 'RegressionMultiOutput':
            yield obj(metric=metrics.MSE())
            continue

        # Otherwise instantiate with default arguments, substituting 5 for
        # any required parameter that lacks a default
        try:
            sig = inspect.signature(obj)
            yield obj(
                **{
                    param.name:
                    param.default if param.default != param.empty else 5
                    for param in sig.parameters.values()
                })
        except ValueError:
            # Some classes expose no retrievable signature
            yield obj()
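
This generator is handy for parametrizing a test over every metric. A minimal
sketch, assuming pytest is available (the test itself is illustrative, not part
of the original code):

import pytest


@pytest.mark.parametrize(
    'metric',
    [pytest.param(m, id=m.__class__.__name__) for m in load_metrics()]
)
def test_str_works(metric):
    # Every metric should at least expose a readable string representation
    assert isinstance(str(metric), str)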
Example #2
import pytest

from river import metrics


def test_compose():

    # Metrics of the same kind can be combined with `+`
    metrics.MAE() + metrics.MSE()
    metrics.Accuracy() + metrics.LogLoss()

    # Mixing regression and classification metrics is rejected
    with pytest.raises(ValueError):
        _ = metrics.MSE() + metrics.LogLoss()

    with pytest.raises(ValueError):
        _ = metrics.MSE() + metrics.MAE() + metrics.LogLoss()
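
The `+` operator builds a composite metric that updates all of its members at
once; combining regression with classification metrics raises a `ValueError`,
which is what the test above checks. A small usage sketch, assuming the
composite exposes the same `update`/`get` interface as the individual river
metrics:

reg = metrics.MAE() + metrics.MSE()
for y_true, y_pred in [(3.0, 2.5), (1.0, 1.5)]:
    reg.update(y_true, y_pred)
print(reg)  # reports both metrics, e.g. "MAE: 0.5, MSE: 0.25"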
Example #3
    def reset(self):
        self.mae = metrics.Rolling(metrics.MAE(), window_size=self.window_size)
        self.mse = metrics.Rolling(metrics.MSE(), window_size=self.window_size)
        self.r2 = metrics.Rolling(metrics.R2(), window_size=self.window_size)
        self.sample_count = 0
        self.last_true_label = None
        self.last_prediction = None
Example #4
    def __init__(self):
        super().__init__()
        self.mae = metrics.MAE()
        self.mse = metrics.MSE()
        self.r2 = metrics.R2()
        self.last_true_label = None
        self.last_prediction = None
Example #5
    def __init__(
        self,
        # Forest parameters
        n_models: int = 10,
        max_features="sqrt",
        aggregation_method: str = "median",
        lambda_value: int = 6,
        metric: metrics.RegressionMetric = metrics.MSE(),
        disable_weighted_vote=True,
        drift_detector: base.DriftDetector = ADWIN(0.001),
        warning_detector: base.DriftDetector = ADWIN(0.01),
        # Tree parameters
        grace_period: int = 50,
        max_depth: int = None,
        split_confidence: float = 0.01,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "model",
        leaf_model: base.Regressor = None,
        model_selector_decay: float = 0.95,
        nominal_attributes: list = None,
        splitter: Splitter = None,
        min_samples_split: int = 5,
        binary_split: bool = False,
        max_size: int = 500,
        memory_estimate_period: int = 2_000_000,
        stop_mem_management: bool = False,
Example #6
    def __init__(self, window_size=200):
        super().__init__()
        self.window_size = window_size
        self.mae = metrics.Rolling(metrics.MAE(), window_size=self.window_size)
        self.mse = metrics.Rolling(metrics.MSE(), window_size=self.window_size)
        self.r2 = metrics.Rolling(metrics.R2(), window_size=self.window_size)
        self.sample_count = 0
        self.last_true_label = None
        self.last_prediction = None
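
Examples #3, #4 and #6 come from stream evaluators that keep running metrics. A
minimal sketch of the matching update step, assuming a method on the same class
as the constructor above (the method itself is hypothetical):

    def update(self, y_true, y_pred):
        # Feed every tracked metric and remember the last observed pair
        self.mae.update(y_true, y_pred)
        self.mse.update(y_true, y_pred)
        self.r2.update(y_true, y_pred)
        self.sample_count += 1
        self.last_true_label = y_true
        self.last_prediction = y_pred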
Example #7
    def __init__(
        self,
        # Forest parameters
        n_models: int = 10,
        max_features="sqrt",
        aggregation_method: str = "median",
        lambda_value: int = 6,
        metric: metrics.RegressionMetric = metrics.MSE(),
        disable_weighted_vote=True,
        drift_detector: base.DriftDetector = ADWIN(0.001),
        warning_detector: base.DriftDetector = ADWIN(0.01),
        # Tree parameters
        grace_period: int = 50,
        max_depth: int = None,
        split_confidence: float = 0.01,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "model",
        leaf_model: base.Regressor = None,
        model_selector_decay: float = 0.95,
        nominal_attributes: list = None,
        splitter: Splitter = None,
        min_samples_split: int = 5,
        max_size: int = 100,
        memory_estimate_period: int = 2_000_000,
        seed: int = None,
        **kwargs,
    ):
        super().__init__(
            n_models=n_models,
            max_features=max_features,
            lambda_value=lambda_value,
            metric=metric,
            disable_weighted_vote=disable_weighted_vote,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            seed=seed,
        )

        self._n_samples_seen = 0
        self._base_member_class = ForestMemberRegressor

        # Tree parameters
        self.grace_period = grace_period
        self.max_depth = max_depth
        self.split_confidence = split_confidence
        self.tie_threshold = tie_threshold
        self.leaf_prediction = leaf_prediction
        self.leaf_model = leaf_model
        self.model_selector_decay = model_selector_decay
        self.nominal_attributes = nominal_attributes
        self.splitter = splitter
        self.min_samples_split = min_samples_split
        self.max_size = max_size
        self.memory_estimate_period = memory_estimate_period
        self.kwargs = kwargs

        if aggregation_method in self._VALID_AGGREGATION_METHOD:
            self.aggregation_method = aggregation_method
        else:
            raise ValueError(
                f"Invalid aggregation_method: {aggregation_method}.\n"
                f"Valid values are: {self._VALID_AGGREGATION_METHOD}"
            )
Example #8
TEST_CASES = [
    # ...
    (metrics.WeightedRecall(),
     partial(sk_metrics.recall_score, average='weighted')),
    (metrics.FBeta(beta=.5), partial(sk_metrics.fbeta_score, beta=.5)),
    (metrics.MacroFBeta(beta=.5),
     partial(sk_metrics.fbeta_score, beta=.5, average='macro')),
    (metrics.MicroFBeta(beta=.5),
     partial(sk_metrics.fbeta_score, beta=.5, average='micro')),
    (metrics.WeightedFBeta(beta=.5),
     partial(sk_metrics.fbeta_score, beta=.5, average='weighted')),
    (metrics.F1(), sk_metrics.f1_score),
    (metrics.MacroF1(), partial(sk_metrics.f1_score, average='macro')),
    (metrics.MicroF1(), partial(sk_metrics.f1_score, average='micro')),
    (metrics.WeightedF1(), partial(sk_metrics.f1_score, average='weighted')),
    (metrics.MCC(), sk_metrics.matthews_corrcoef),
    (metrics.MAE(), sk_metrics.mean_absolute_error),
    (metrics.MSE(), sk_metrics.mean_squared_error),
]


@pytest.mark.parametrize('metric, sk_metric', [
    pytest.param(metric, sk_metric, id=f'{metric.__class__.__name__}')
    for metric, sk_metric in TEST_CASES
])
@pytest.mark.filterwarnings('ignore::RuntimeWarning')
@pytest.mark.filterwarnings(
    'ignore::sklearn.metrics.classification.UndefinedMetricWarning')
def test_metric(metric, sk_metric):

    # Check str works
    str(metric)
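
The test body is truncated here. One plausible continuation for the regression
cases (e.g. `metrics.MAE()` vs `sk_metrics.mean_absolute_error`) is to feed a
stream and compare against the batch result at every step; a sketch, not the
original test:

    import math
    import random

    rng = random.Random(42)
    y_true, y_pred = [], []
    for _ in range(30):
        y_true.append(rng.random())
        y_pred.append(rng.random())
        metric.update(y_true[-1], y_pred[-1])
        # The streaming value should agree with the batch computation
        assert math.isclose(metric.get(), sk_metric(y_true, y_pred))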
Example #9
class AdaptiveRandomForestRegressor(BaseForest, base.Regressor):
    r"""Adaptive Random Forest regressor.

    The 3 most important aspects of Adaptive Random Forest [^1] are:

    1. inducing diversity through re-sampling

    2. inducing diversity through randomly selecting subsets of features for
       node splits

    3. drift detectors per base tree, which cause selective resets in response
       to drifts

    Notice that this implementation is slightly different from the original
    algorithm proposed in [^2]. The `HoeffdingTreeRegressor` is used as the base
    learner instead of `FIMT-DD`. It also adds a new strategy to monitor the
    predictions and check for concept drifts. The deviations of the predictions
    from the target are monitored and normalized into the [0, 1] range to fulfill
    ADWIN's requirements. We assume that the data subjected to the normalization
    follows a normal distribution and thus lies within the interval of the mean
    $\pm3\sigma$.
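
    Under that assumption, the scaling amounts to a min-max normalization over
    $[\bar{e} - 3\sigma_e, \bar{e} + 3\sigma_e]$, where $\bar{e}$ and $\sigma_e$
    denote the running mean and standard deviation of the monitored deviations
    (a sketch of the idea, not the literal internal bookkeeping):

    $$e' = \min\left(1, \max\left(0, \frac{e - (\bar{e} - 3\sigma_e)}{6\sigma_e}\right)\right)$$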

    Parameters
    ----------
    n_models
        Number of trees in the ensemble.
    max_features
        Max number of attributes for each node split.<br/>
        - If `int`, then consider `max_features` at each split.<br/>
        - If `float`, then `max_features` is a percentage and
          `int(max_features * n_features)` features are considered per split.<br/>
        - If "sqrt", then `max_features=sqrt(n_features)`.<br/>
        - If "log2", then `max_features=log2(n_features)`.<br/>
        - If None, then `max_features=n_features`.
    lambda_value
        The lambda value for bagging (lambda=6 corresponds to Leveraging Bagging).
    metric
        Metric used to track the trees' performance within the ensemble.
        Depending on the configuration, this metric is also used to weight
        predictions from the members of the ensemble.
    aggregation_method
        The method to use to aggregate predictions in the ensemble.<br/>
        - 'mean'<br/>
        - 'median' - If selected, the weighted vote is disabled.
    disable_weighted_vote
        If `True`, disables the weighted vote prediction, i.e. does not assign
        weights to individual tree's predictions and uses the arithmetic mean
        instead. Otherwise will use the `metric` value to weight predictions.
    drift_detector
        Drift detection method. Set to `None` to disable drift detection.
    warning_detector
        Warning detection method. Set to `None` to disable warning detection.
    grace_period
        [*Tree parameter*] Number of instances a leaf should observe between
        split attempts.
    max_depth
        [*Tree parameter*] The maximum depth a tree can reach. If `None`, the
        tree will grow indefinitely.
    split_confidence
        [*Tree parameter*] Allowed error in split decision, a value closer to 0
        takes longer to decide.
    tie_threshold
        [*Tree parameter*] Threshold below which a split will be forced to break
        ties.
    leaf_prediction
        [*Tree parameter*] Prediction mechanism used at leaves.<br/>
        - 'mean' - Target mean<br/>
        - 'model' - Uses the model defined in `leaf_model`<br/>
        - 'adaptive' - Chooses between 'mean' and 'model' dynamically<br/>
    leaf_model
        [*Tree parameter*] The regression model used to provide responses if
        `leaf_prediction='model'`. If not provided, an instance of
        `river.linear_model.LinearRegression` with the default hyperparameters
        is used.
    model_selector_decay
        [*Tree parameter*] The exponential decaying factor applied to the learning
        models' squared errors, which are monitored if `leaf_prediction='adaptive'`.
        Must be between `0` and `1`. The closer to `1`, the more importance is
        given to past observations. Conversely, the closer it is to `0`, the more
        influence the most recently observed errors have on the final decision.
    nominal_attributes
        [*Tree parameter*] List of Nominal attributes. If empty, then assume that
        all attributes are numerical.
    attr_obs
        [*Tree parameter*] The attribute observer (AO) used to monitor the target
        statistics of numeric features and perform splits. Parameters can be passed
        to the AOs (when supported) by using `attr_obs_params`. Valid options are:<br/>
        - `'e-bst'`: Extended Binary Search Tree (E-BST). This AO has no parameters.<br/>
        See notes for more information about the supported AOs.
    attr_obs_params
        [*Tree parameter*] Parameters passed to the numeric AOs. See `attr_obs`
        for more information.
    min_samples_split
        [*Tree parameter*] The minimum number of samples every branch resulting from a split
        candidate must have to be considered valid.
    max_size
        [*Tree parameter*] Maximum memory (MB) consumed by the tree.
    memory_estimate_period
        [*Tree parameter*] Number of instances between memory consumption checks.
    seed
        If `int`, `seed` is used to seed the random number generator;
        If `RandomState`, `seed` is the random number generator;
        If `None`, the random number generator is the `RandomState` instance
        used by `np.random`.
    kwargs
        Other parameters passed to `river.tree.BaseHoeffdingTree`.

    Notes
    -----
    Hoeffding trees rely on Attribute Observer (AO) algorithms to monitor input features
    and perform splits. Nominal features can be easily dealt with, since the partitions
    are well-defined. Numerical features, however, require more sophisticated solutions.
    Currently, only one AO is supported in `river` for regression trees:

    - The Extended Binary Search Tree (E-BST) uses an exhaustive algorithm to find
      split candidates, similarly to batch decision tree algorithms. It ends up
      storing all observations between split attempts. However, E-BST periodically
      removes bad split points from its structure and thus alleviates the memory
      and time costs involved in its usage.

    References
    ----------
    [^1]: Gomes, H.M., Bifet, A., Read, J., Barddal, J.P., Enembreck, F.,
          Pfahringer, B., Holmes, G. and Abdessalem, T., 2017. Adaptive random
          forests for evolving data stream classification. Machine Learning,
          106(9-10), pp.1469-1495.

    [^2]: Gomes, H.M., Barddal, J.P., Boiko, L.E., Bifet, A., 2018.
          Adaptive random forests for data stream regression. ESANN 2018.

    Examples
    --------
    >>> from river import datasets
    >>> from river import evaluate
    >>> from river import metrics
    >>> from river import ensemble
    >>> from river import preprocessing

    >>> dataset = datasets.TrumpApproval()

    >>> model = (
    ...     preprocessing.StandardScaler() |
    ...     ensemble.AdaptiveRandomForestRegressor(n_models=3, seed=42)
    ... )

    >>> metric = metrics.MAE()

    >>> evaluate.progressive_val_score(dataset, model, metric)
    MAE: 1.870913

    """

    _MEAN = "mean"
    _MEDIAN = "median"
    _VALID_AGGREGATION_METHOD = [_MEAN, _MEDIAN]

    def __init__(
        self,
        # Forest parameters
        n_models: int = 10,
        max_features="sqrt",
        aggregation_method: str = "median",
        lambda_value: int = 6,
        metric: metrics.RegressionMetric = metrics.MSE(),
        disable_weighted_vote=True,
        drift_detector: base.DriftDetector = ADWIN(0.001),
        warning_detector: base.DriftDetector = ADWIN(0.01),
        # Tree parameters
        grace_period: int = 50,
        max_depth: int = None,
        split_confidence: float = 0.01,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "model",
        leaf_model: base.Regressor = None,
        model_selector_decay: float = 0.95,
        nominal_attributes: list = None,
        attr_obs: str = "e-bst",
        attr_obs_params: dict = None,
        min_samples_split: int = 5,
        max_size: int = 100,
        memory_estimate_period: int = 2_000_000,
        seed: int = None,
        **kwargs,
    ):
        super().__init__(
            n_models=n_models,
            max_features=max_features,
            lambda_value=lambda_value,
            metric=metric,
            disable_weighted_vote=disable_weighted_vote,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            seed=seed,
        )

        self._n_samples_seen = 0
        self._base_member_class = ForestMemberRegressor

        # Tree parameters
        self.grace_period = grace_period
        self.max_depth = max_depth
        self.split_confidence = split_confidence
        self.tie_threshold = tie_threshold
        self.leaf_prediction = leaf_prediction
        self.leaf_model = leaf_model
        self.model_selector_decay = model_selector_decay
        self.nominal_attributes = nominal_attributes
        self.attr_obs = attr_obs
        self.attr_obs_params = attr_obs_params
        self.min_samples_split = min_samples_split
        self.max_size = max_size
        self.memory_estimate_period = memory_estimate_period
        self.kwargs = kwargs

        if aggregation_method in self._VALID_AGGREGATION_METHOD:
            self.aggregation_method = aggregation_method
        else:
            raise ValueError(
                f"Invalid aggregation_method: {aggregation_method}.\n"
                f"Valid values are: {self._VALID_AGGREGATION_METHOD}"
            )
Example #10
    def reset(self):
        self.mae = metrics.MAE()
        self.mse = metrics.MSE()
        self.r2 = metrics.R2()
        self.last_true_label = None
        self.last_prediction = None
Example #11
    def build_model_4snarimax(self):
        # If a model backup exists, load it and update the model from
        # start1 to start2
        if os.path.exists(self.pck_filename):
            with open(self.pck_filename, 'rb') as fh:
                src_bck = pickle.load(fh)
            model = src_bck.snarimax_model
            metric = src_bck.snarimax_metric
            self.snarimax_para = src_bck.snarimax_para
            self.snarimax_model = model
            self.snarimax_metric = metric

            start1 = src_bck.data.index[-1]
            start2 = self.data.index[-1]
            # alternative: self.data.index[-self.data.index[-1].weekday()]

        # Otherwise the backup does not exist, so rebuild the model from scratch
        else:
            p, d, q, m, sp, sd, sq = self.snarimax_para
            extract_features = compose.TransformerUnion(get_ordinal_date)
            model = (
                extract_features
                | time_series.SNARIMAX(
                    p=p,
                    d=d,
                    q=q,
                    m=m,
                    sp=sp,
                    sd=sd,
                    sq=sq,
                    regressor=(
                        # alternatives tried: preprocessing.Normalizer(),
                        # preprocessing.RobustScaler(with_scaling=True)
                        preprocessing.AdaptiveStandardScaler(alpha=0.1)
                        | preprocessing.StandardScaler()
                        | linear_model.LinearRegression(
                            intercept_init=0,
                            # important parameter; alternatives tried:
                            # optim.AdaDelta(0.8, 0.00001),
                            # optim.AMSGrad(lr=0.01, beta_1=0.8, beta_2=0.1)
                            optimizer=optim.SGD(0.0001),
                            intercept_lr=0.001,
                        )
                    ),
                )
            )

            metric = metrics.Rolling(metrics.MSE(), self.dd_historic)
            # alternative: metric = metrics.MSE()

            start1 = self.data.index[0]
            start2 = self.data.index[-1]
            # alternative: self.data.index[-self.data.index[-1].weekday()]

        if start1 < start2:
            for t in pd.date_range(start1, start2, freq='D'):
                x, y = self.snarimax_data.loc[t][['ds', 'temp']].values
                y_pred = model.forecast(horizon=1, xs=[x])
                model = model.learn_one(x, y)
                metric = metric.update(y, y_pred[0])

            self.snarimax_model = model
            self.snarimax_metric = metric
            with open(self.pck_filename, 'wb') as fh:
                pickle.dump(self, fh)

            #for t in pd.date_range(start1, start2):
            #    x = self.snarimax_data.loc[pd.date_range(t-timedelta(self.dd_historic),t)][['ds']].values
            #    y = self.snarimax_data.loc[pd.date_range(t-timedelta(self.dd_historic),t)][['temp']].values
            #    x = np.hstack(x)
            #    y = np.hstack(y)
            #    y_pred = model.forecast(horizon=self.dd_historic+1, xs=x)
            #    for i in range(0,self.dd_historic):
            #        model = model.learn_one(x[i], y[i])
            #        metric = metric.update(y[i], y_pred[i])

        return
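
The snippet references a `get_ordinal_date` feature extractor that is not shown.
A minimal sketch of what it plausibly looks like, assuming the transformer
receives the date-like 'ds' value selected above:

def get_ordinal_date(x):
    # Encode the date as a single ordinal feature for the regressor
    return {'ordinal_date': x.toordinal()}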