def _init_estimator(self, y):
    if isinstance(y, dict):
        self.is_single_target = False
        self.y_stats = VectorDict(default_factory=functools.partial(Var))
        self._update_estimator = self._update_estimator_multivariate
    else:
        self.y_stats = Var()
        self._update_estimator = self._update_estimator_univariate
def __init__(
    self,
    index_original: int,
    base_model: BaseTreeRegressor,
    created_on: int,
    base_drift_detector: base.DriftDetector,
    base_warning_detector: base.DriftDetector,
    is_background_learner,
    base_metric: RegressionMetric,
):
    super().__init__(
        index_original=index_original,
        base_model=base_model,
        created_on=created_on,
        base_drift_detector=base_drift_detector,
        base_warning_detector=base_warning_detector,
        is_background_learner=is_background_learner,
        base_metric=base_metric,
    )
    self._var = Var()  # Used to track drift
def __init__(self, split_test, stats, depth, adwin_delta, seed):
    stats = stats if stats else Var()
    super().__init__(split_test, stats, depth)
    self.adwin_delta = adwin_delta
    self._adwin = ADWIN(delta=self.adwin_delta)
    self._alternate_tree = None
    self._error_change = False
    self._rng = check_random_state(seed)

    # Normalization of info monitored by drift detectors (using Welford's algorithm)
    self._error_normalizer = Var(ddof=1)
class Slot:
    """The element stored in the quantization hash.

    Each slot keeps the mean of the incoming feature values, as well as the mean and
    variance of the target.
    """

    def __init__(self, x: float, y: typing.Union[float, VectorDict], weight: float = 1.0):
        self.x_stats = Mean()
        self.x_stats.update(x, weight)

        self.y_stats: typing.Union[Var, VectorDict]
        self._update_estimator: typing.Callable[[typing.Union[float, VectorDict], float], None]
        self.is_single_target = True

        self._init_estimator(y)
        self._update_estimator(y, weight)

    def _init_estimator(self, y):
        if isinstance(y, dict):
            self.is_single_target = False
            self.y_stats = VectorDict(default_factory=functools.partial(Var))
            self._update_estimator = self._update_estimator_multivariate
        else:
            self.y_stats = Var()
            self._update_estimator = self._update_estimator_univariate

    def _update_estimator_univariate(self, target, sample_weight):
        self.y_stats.update(target, sample_weight)

    def _update_estimator_multivariate(self, target, sample_weight):
        for t in target:
            self.y_stats[t].update(target[t], sample_weight)

    def __iadd__(self, o):
        self.x_stats += o.x_stats
        self.y_stats += o.y_stats
        return self

    def update(self, x, y, sample_weight):
        self.x_stats.update(x, sample_weight)
        self._update_estimator(y, sample_weight)
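# Illustrative sketch (not library code): the statistics a single quantization Slot maintains.
# The import path river.stats is an assumption inferred from the Mean/Var/VectorDict usage
# above; the feature/target values are made up.
from river.stats import Mean, Var

x_stats, y_stats = Mean(), Var()  # feature mean plus target mean/variance, as in Slot
for x, y, w in [(1.02, 10.0, 1.0), (0.98, 11.0, 1.0), (1.05, 9.5, 2.0)]:
    x_stats.update(x, w)
    y_stats.update(y, w)

print(x_stats.get(), y_stats.mean.get(), y_stats.get())  # slot's x mean, target mean, target variance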
def update(self, att_val, target, sample_weight=1.0):
    if att_val is None or sample_weight is None:
        return
    else:
        try:
            estimator = self._statistics[att_val]
        except KeyError:
            if isinstance(target, dict):  # Multi-target case
                self._statistics[att_val] = VectorDict(default_factory=lambda: Var())
                self._update_estimator = self._update_estimator_multivariate
            else:
                self._statistics[att_val] = Var()
            estimator = self._statistics[att_val]

        self._update_estimator(estimator, target, sample_weight)

    return self
def best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only=True):
    candidate = AttributeSplitSuggestion(None, [{}], -float('inf'))

    if self._root is None:
        return candidate

    self._criterion = criterion
    self._pre_split_dist = pre_split_dist
    self._att_idx = att_idx

    # Handles both single-target and multi-target tasks
    if isinstance(pre_split_dist, VectorDict):
        self._aux_estimator = VectorDict(default_factory=functools.partial(Var))
    else:
        self._aux_estimator = Var()

    best_split = self._find_best_split(self._root, candidate)

    # Delete auxiliary variables
    del self._criterion
    del self._pre_split_dist
    del self._att_idx
    del self._aux_estimator

    return best_split
def __iter__(self):
    aux_stats = (
        Var()
        if next(iter(self.hash.values())).is_single_target
        else VectorDict(default_factory=functools.partial(Var))
    )

    for i in sorted(self.hash.keys()):
        x = self.hash[i].x_stats.get()
        aux_stats += self.hash[i].y_stats
        yield x, aux_stats
class ForestMemberRegressor(BaseForestMember, base.Regressor):
    """Forest member class for regression"""

    def __init__(
        self,
        index_original: int,
        model: BaseTreeRegressor,
        created_on: int,
        drift_detector: base.DriftDetector,
        warning_detector: base.DriftDetector,
        is_background_learner,
        metric: RegressionMetric,
    ):
        super().__init__(
            index_original=index_original,
            model=model,
            created_on=created_on,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            is_background_learner=is_background_learner,
            metric=metric,
        )
        self._var = Var()  # Used to track drift

    def _drift_detector_input(self, y_true: float, y_pred: float):
        drift_input = y_true - y_pred
        self._var.update(drift_input)

        if self._var.mean.n == 1:
            return 0.5  # The expected error is the normalized mean error

        sd = math.sqrt(self._var.sigma)

        # We assume the error follows a normal distribution -> (empirical rule)
        # 99.73% of the values lie between [mean - 3*sd, mean + 3*sd]. We
        # assume this range for the normalized data. Hence, we can apply the
        # min-max norm to cope with ADWIN's requirements
        return (drift_input + 3 * sd) / (6 * sd) if sd > 0 else 0.5

    def reset(self, n_samples_seen):
        super().reset(n_samples_seen)
        # Reset the stats for the drift detector
        self._var = Var()

    def predict_one(self, x):
        return self.model.predict_one(x)
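# Illustrative sketch (not library code): how the error normalization in _drift_detector_input
# maps a raw prediction error into [0, 1] before feeding it to ADWIN. The helper name
# normalized_error and the sample errors are assumptions; Var.get() (the running variance in
# river.stats) stands in for the .sigma attribute accessed above.
import math

from river.stats import Var


def normalized_error(drift_input: float, var: Var) -> float:
    var.update(drift_input)
    if var.mean.n == 1:
        return 0.5  # only one observation seen so far: fall back to the midpoint
    sd = math.sqrt(var.get())
    # Empirical rule: ~99.73% of a normal variable lies in [mean - 3*sd, mean + 3*sd], so
    # rescaling by that 6*sd range keeps the monitored value roughly inside [0, 1]
    return (drift_input + 3 * sd) / (6 * sd) if sd > 0 else 0.5


errors = Var()
for err in [0.1, -0.2, 0.05, 0.3]:
    print(normalized_error(err, errors))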
def __init__(self, stats, depth, attr_obs, attr_obs_params, leaf_model, adwin_delta, seed):
    super().__init__(stats, depth, attr_obs, attr_obs_params, leaf_model)
    self.adwin_delta = adwin_delta
    self._adwin = ADWIN(delta=self.adwin_delta)
    self.error_change = False
    self._rng = check_random_state(seed)

    # Normalization of info monitored by drift detectors (using Welford's algorithm)
    self._error_normalizer = Var(ddof=1)
def __init__(self, stats, depth, splitter, adwin_delta, seed, **kwargs):
    super().__init__(stats, depth, splitter, **kwargs)
    self.adwin_delta = adwin_delta
    self._adwin = ADWIN(delta=self.adwin_delta)
    self._error_change = False
    self._rng = check_random_state(seed)

    # Normalization of info monitored by drift detectors (using Welford's algorithm)
    self._error_normalizer = Var(ddof=1)
def delta_loss_mean_var(self, delta_pred: float) -> Var:
    m = self.mean
    n = self.total_weight
    mean = delta_pred * m.gradient + 0.5 * m.hessian * delta_pred * delta_pred

    variance = self.variance
    covariance = self.covariance
    grad_term_var = delta_pred * delta_pred * variance.gradient
    hess_term_var = 0.25 * variance.hessian * (delta_pred ** 4.0)
    sigma = max(0.0, grad_term_var + hess_term_var + (delta_pred ** 3) * covariance)

    return Var._from_state(n, mean, sigma)  # noqa
def __init__(self, att_val, target_val, sample_weight):
    self.att_val = att_val

    if isinstance(target_val, dict):
        self.estimator = VectorDict(default_factory=functools.partial(Var))
        self._update_estimator = self._update_estimator_multivariate
    else:
        self.estimator = Var()
        self._update_estimator = self._update_estimator_univariate

    self._update_estimator(self, target_val, sample_weight)

    self._left = None
    self._right = None
def __init__(self, stats, depth, attr_obs, attr_obs_params):
    if stats is None:
        # Enforce the usage of Var to keep track of target statistics
        stats = Var()
    super().__init__(stats, depth, attr_obs, attr_obs_params)
def __init__(self, stats, depth, splitter, **kwargs):
    if stats is None:
        # Enforce the usage of Var to keep track of target statistics
        stats = Var()
    super().__init__(stats, depth, splitter, **kwargs)
def remove_bad_splits(self, criterion, last_check_ratio, last_check_vr, last_check_e,
                      pre_split_dist):
    """Remove bad splits.

    Based on FIMT-DD's [^1] procedure to remove bad split candidates from the E-BST. This
    mechanism is triggered every time a split attempt fails. The rationale is to remove
    points whose split merit is much worse than the best candidate overall (for which the
    growth decision already failed).

    Let $m_1$ be the merit of the best split point and $m_2$ be the merit of the second best
    split candidate. The ratio $r = m_2 / m_1$ along with the Hoeffding bound ($\\epsilon$)
    are used to decide upon creating a split. A split occurs when $r < 1 - \\epsilon$. A split
    candidate, with merit $m_i$, is considered bad if $m_i / m_1 < r - 2\\epsilon$. The
    rationale is the following: if the merit ratio for this point is smaller than the lower
    bound of $r$, then the true merit of that split relative to the best one is small. Hence,
    this candidate can be safely removed.

    To avoid excessive and costly manipulations of the E-BST to update the stored statistics,
    only the nodes whose children are all bad split points are pruned, as defined in [^1].

    Parameters
    ----------
    criterion
        The split criterion used by the regression tree.
    last_check_ratio
        The ratio between the merit of the second best split candidate and the merit of the
        best split candidate observed in the last failed split attempt.
    last_check_vr
        The merit (variance reduction) of the best split candidate observed in the last
        failed split attempt.
    last_check_e
        The Hoeffding bound value calculated in the last failed split attempt.
    pre_split_dist
        The complete statistics of the target observed in the leaf node.

    References
    ----------
    [^1]: Ikonomovska, E., Gama, J., & Džeroski, S. (2011). Learning model trees from
    evolving data streams. Data mining and knowledge discovery, 23(1), 128-168.
    """
    if self._root is None:
        return

    # Auxiliary variables
    self._criterion = criterion
    self._pre_split_dist = pre_split_dist
    self._last_check_ratio = last_check_ratio
    self._last_check_vr = last_check_vr
    self._last_check_e = last_check_e

    # Handles both single-target and multi-target tasks
    if isinstance(pre_split_dist, VectorDict):
        self._aux_estimator = VectorDict(default_factory=functools.partial(Var))
    else:
        self._aux_estimator = Var()

    self._remove_bad_split_nodes(self._root)

    # Delete auxiliary variables
    del self._criterion
    del self._pre_split_dist
    del self._last_check_ratio
    del self._last_check_vr
    del self._last_check_e
    del self._aux_estimator
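# Worked example of the pruning rule (illustrative; all numbers are made up). With best merit
# m1, runner-up merit m2 and Hoeffding bound eps taken from the last failed split attempt, a
# candidate with merit m_i is considered bad when m_i / m1 < r - 2 * eps.
m1, m2, eps = 1.0, 0.96, 0.05  # r = 0.96 >= 1 - eps = 0.95, so the split attempt failed
r = m2 / m1

for m_i in (0.95, 0.90, 0.80):
    is_bad = m_i / m1 < r - 2 * eps  # pruning threshold: r - 2*eps = 0.86
    print(m_i, "pruned" if is_bad else "kept")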
class GradHessStats:
    """Class used to monitor and update the gradient/hessian information in Stochastic
    Gradient Trees.

    Represents the aggregated gradient/hessian data in a node (global node statistics),
    category, or numerical feature's discretized bin.
    """

    def __init__(self):
        self.g_var = Var()
        self.h_var = Var()
        self.gh_cov = Cov()

    def __iadd__(self, other):
        self.g_var += other.g_var
        self.h_var += other.h_var
        self.gh_cov += other.gh_cov
        return self

    def __isub__(self, other):
        self.g_var -= other.g_var
        self.h_var -= other.h_var
        self.gh_cov -= other.gh_cov
        return self

    def __add__(self, other):
        new = copy.deepcopy(self)
        new += other
        return new

    def __sub__(self, other):
        new = copy.deepcopy(self)
        new -= other
        return new

    def update(self, gh: GradHess, w: float = 1.0):
        self.g_var.update(gh.gradient, w)
        self.h_var.update(gh.hessian, w)
        self.gh_cov.update(gh.gradient, gh.hessian, w)

    @property
    def mean(self) -> GradHess:
        return GradHess(self.g_var.mean.get(), self.h_var.mean.get())

    @property
    def variance(self) -> GradHess:
        return GradHess(self.g_var.get(), self.h_var.get())

    @property
    def covariance(self) -> float:
        return self.gh_cov.get()

    @property
    def total_weight(self) -> float:
        return self.g_var.mean.n

    # This method ignores correlations between delta_pred and the gradients/hessians! Considering
    # delta_pred is derived from the gradient and hessian sample, this assumption is definitely
    # violated. However, as empirically demonstrated in the original SGT, this fact does not seem
    # to significantly impact the obtained results.
    def delta_loss_mean_var(self, delta_pred: float) -> Var:
        m = self.mean
        n = self.total_weight
        mean = delta_pred * m.gradient + 0.5 * m.hessian * delta_pred * delta_pred

        variance = self.variance
        covariance = self.covariance
        grad_term_var = delta_pred * delta_pred * variance.gradient
        hess_term_var = 0.25 * variance.hessian * (delta_pred**4.0)
        sigma = max(0.0, grad_term_var + hess_term_var + (delta_pred**3) * covariance)

        return Var._from_state(n, mean, sigma)  # noqa
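# Sketch of the quantities behind delta_loss_mean_var (illustrative, not library code). SGT
# approximates the loss change caused by adding delta_pred to a leaf's prediction with a
# second-order expansion, delta_loss ~= g * delta_pred + 0.5 * h * delta_pred**2. Treating g
# and h as random variables described by the running statistics above gives
#   E[delta_loss]   = delta_pred * E[g] + 0.5 * delta_pred**2 * E[h]
#   Var[delta_loss] = delta_pred**2 * Var[g] + 0.25 * delta_pred**4 * Var[h]
#                       + delta_pred**3 * Cov[g, h]
# which is what the method packs into a Var object. The numbers below are made up.
g_mean, h_mean = -0.4, 1.0
g_var, h_var, gh_cov = 0.2, 0.1, 0.05
delta = 0.3  # candidate delta_pred

dl_mean = delta * g_mean + 0.5 * h_mean * delta ** 2
dl_var = max(0.0, delta ** 2 * g_var + 0.25 * h_var * delta ** 4 + delta ** 3 * gh_cov)
print(dl_mean, dl_var)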
def __init__(self):
    self.g_var = Var()
    self.h_var = Var()
    self.gh_cov = Cov()
def __init__(self, radius: float = 0.01):
    super().__init__()
    self.radius = radius if radius > 0 else 0.01
    self._x_var = Var()
    self._quantizer = FeatureQuantizer(radius=self.radius)
class NumericAttributeRegressionQuantizerObserver(AttributeObserver):
    """Quantizer observer (QO).

    Utilizes a dynamical hash-based quantization algorithm to keep track of the target
    statistics and evaluate split candidates. This class implements the algorithm described
    in [^1]. This attribute observer keeps an internal estimator of the input feature's
    variance. By doing that, QO can calculate better values for its radius parameter to be
    passed to future learning nodes.

    Parameters
    ----------
    radius
        The quantization radius.

    References
    ----------
    [^1]: Mastelini, S.M. and de Leon Ferreira, A.C.P., 2021. Using dynamical quantization
    to perform split attempts in online tree regressors. Pattern Recognition Letters.
    """

    def __init__(self, radius: float = 0.01):
        super().__init__()
        self.radius = radius if radius > 0 else 0.01
        self._x_var = Var()
        self._quantizer = FeatureQuantizer(radius=self.radius)

    def update(self, x, y, sample_weight):
        if x is None:
            return
        else:
            self._x_var.update(x, sample_weight)
            self._quantizer.update(x, y, sample_weight)

    def probability_of_attribute_value_given_class(self, x, y):
        raise NotImplementedError

    def best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only=True):
        candidate = AttributeSplitSuggestion(None, [{}], -math.inf)

        # The previously evaluated x value
        prev_x = None
        for (x, left_dist) in self._quantizer:
            # First hash element
            if prev_x is None:
                # In case the hash carries just one element return the null split
                if len(self._quantizer) == 1:
                    return candidate
                prev_x = x
                continue

            right_dist = pre_split_dist - left_dist
            post_split_dists = [left_dist, right_dist]
            merit = criterion.merit_of_split(pre_split_dist, post_split_dists)

            if merit > candidate.merit:
                split_point = (prev_x + x) / 2.0
                candidate = self._update_candidate(split_point, att_idx, post_split_dists, merit)

            prev_x = x

        return candidate

    @property
    def x_var(self):
        return self._x_var

    @staticmethod
    def _update_candidate(split_point, att_idx, post_split_dists, merit):
        num_att_binary_test = NumericAttributeBinaryTest(att_idx, split_point, True)
        candidate = AttributeSplitSuggestion(num_att_binary_test, post_split_dists, merit)
        return candidate
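# Illustrative sketch (not library code): how a split candidate is scored from the cumulative
# ("left") statistics yielded by the quantizer. The data, the variable names, and the plain
# variance-reduction merit used here are assumptions; the observer above delegates this
# computation to criterion.merit_of_split. The import path river.stats is also an assumption.
from river.stats import Var

pre, left = Var(), Var()
for x, y in [(0.1, 1.0), (0.2, 1.1), (0.9, 3.0), (1.1, 3.2)]:
    pre.update(y)
for x, y in [(0.1, 1.0), (0.2, 1.1)]:  # samples falling on the left of the candidate split
    left.update(y)

right = pre - left                     # same subtraction as pre_split_dist - left_dist above
n = pre.mean.n
merit = pre.get() - (left.mean.n / n) * left.get() - (right.mean.n / n) * right.get()
print(merit)  # variance reduction achieved by this candidate split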
def reset(self, n_samples_seen):
    super().reset(n_samples_seen)
    # Reset the stats for the drift detector
    self._var = Var()