def __init__(self, split_test, stats, depth, adwin_delta, seed):
    """Init an adaptive split node with an ADWIN-based drift detector.

    Parameters
    ----------
    split_test
        The test applied to route instances through the branch.
    stats
        Target statistics of the node; a fresh `Var` is created when `None`.
    depth
        Depth of the node in the tree.
    adwin_delta
        Significance level (`delta`) of the ADWIN drift detector.
    seed
        Seed for the node's internal random number generator.
    """
    # Explicit None check: the previous truthiness test (`stats if stats else ...`)
    # would also discard a legitimately falsy (e.g. empty) statistics object.
    # This also matches the `stats is None` convention used by the leaf nodes.
    stats = stats if stats is not None else Var()
    super().__init__(split_test, stats, depth)
    self.adwin_delta = adwin_delta
    self._adwin = ADWIN(delta=self.adwin_delta)
    self._alternate_tree = None
    self._error_change = False
    self._rng = check_random_state(seed)

    # Normalization of info monitored by drift detectors (using Welford's algorithm)
    self._error_normalizer = Var(ddof=1)
def best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only=True):
    """Return the best split suggestion found by traversing the stored E-BST.

    Parameters
    ----------
    criterion
        The split criterion used to evaluate candidate split points.
    pre_split_dist
        Target statistics observed in the leaf before splitting.
    att_idx
        Index of the attribute monitored by this E-BST.
    binary_only
        Kept for interface compatibility; numeric splits are always binary here.
    """
    best_suggestion = AttributeSplitSuggestion(None, [{}], -float('inf'))
    if self._root is None:
        # No observations yet: return the sentinel (merit = -inf) suggestion
        return best_suggestion

    # Auxiliary state shared with the recursive traversal helper
    self._criterion = criterion
    self._pre_split_dist = pre_split_dist
    self._att_idx = att_idx

    # Handles both single-target and multi-target tasks
    if isinstance(pre_split_dist, VectorDict):
        self._aux_estimator = VectorDict(
            default_factory=functools.partial(Var))
    else:
        self._aux_estimator = Var()

    best_suggestion = self._find_best_split(self._root, best_suggestion)

    # Drop the auxiliary state once the traversal is done
    for aux_attr in ('_criterion', '_pre_split_dist', '_att_idx', '_aux_estimator'):
        delattr(self, aux_attr)

    return best_suggestion
def update(self, att_val, target, sample_weight=1.0):
    """Update the per-category target statistics with one observation.

    Missing attribute values or weights are ignored. Returns `self` to allow
    call chaining.
    """
    # Guard clause: nothing to do for missing data
    if att_val is None or sample_weight is None:
        return

    if att_val not in self._statistics:
        # First time this category is observed: create its estimator
        if isinstance(target, dict):  # Multi-target case
            self._statistics[att_val] = VectorDict(default_factory=lambda: Var())
            self._update_estimator = self._update_estimator_multivariate
        else:
            self._statistics[att_val] = Var()
    estimator = self._statistics[att_val]

    self._update_estimator(estimator, target, sample_weight)

    return self
def _init_estimator(self, y):
    """Set up the target statistics tracker, dispatching on the target type."""
    is_multi_target = isinstance(y, dict)
    if not is_multi_target:
        self.y_stats = Var()
        self._update_estimator = self._update_estimator_univariate
    else:
        # Multi-target regression: one variance tracker per output dimension
        self.is_single_target = False
        self.y_stats = VectorDict(default_factory=functools.partial(Var))
        self._update_estimator = self._update_estimator_multivariate
def __iter__(self):
    """Yield `(x, stats)` pairs in ascending key order.

    The target statistics are accumulated as the iteration advances, so each
    yielded `stats` is the running aggregate up to and including that slot.
    """
    first_slot = next(iter(self.hash.values()))
    if first_slot.is_single_target:
        running_stats = Var()
    else:
        running_stats = VectorDict(default_factory=functools.partial(Var))

    for key in sorted(self.hash):
        slot = self.hash[key]
        x = slot.x_stats.get()
        running_stats += slot.y_stats
        yield x, running_stats
def __init__(self, stats, depth, splitter, adwin_delta, seed, **kwargs):
    """Init an adaptive leaf that monitors its error with an ADWIN detector.

    Parameters
    ----------
    stats
        Target statistics of the leaf.
    depth
        Depth of the leaf in the tree.
    splitter
        The numeric attribute observer used to monitor split candidates.
    adwin_delta
        Significance level (`delta`) of the ADWIN drift detector.
    seed
        Seed for the leaf's internal random number generator.
    kwargs
        Extra parameters forwarded to the parent class.
    """
    super().__init__(stats, depth, splitter, **kwargs)

    self.adwin_delta = adwin_delta
    self._rng = check_random_state(seed)
    self._adwin = ADWIN(delta=adwin_delta)
    self._error_change = False
    # Normalization of info monitored by drift detectors (Welford's algorithm)
    self._error_normalizer = Var(ddof=1)
def __init__(self, stats, depth, attr_obs, attr_obs_params, leaf_model, adwin_delta, seed):
    """Init an adaptive model leaf that monitors its error with an ADWIN detector.

    Parameters
    ----------
    stats
        Target statistics of the leaf.
    depth
        Depth of the leaf in the tree.
    attr_obs
        The numeric attribute observer algorithm identifier.
    attr_obs_params
        Parameters passed to the attribute observer.
    leaf_model
        The predictive model maintained at the leaf.
    adwin_delta
        Significance level (`delta`) of the ADWIN drift detector.
    seed
        Seed for the leaf's internal random number generator.
    """
    super().__init__(stats, depth, attr_obs, attr_obs_params, leaf_model)
    self.adwin_delta = adwin_delta
    self._adwin = ADWIN(delta=self.adwin_delta)
    # NOTE(review): sibling adaptive nodes use the private name `_error_change`;
    # the public `error_change` here looks inconsistent — confirm external readers
    # before renaming.
    self.error_change = False
    self._rng = check_random_state(seed)

    # Normalization of info monitored by drift detectors (using Welford's algorithm)
    self._error_normalizer = Var(ddof=1)
def __init__(self, index_original: int, base_model: BaseTreeRegressor, created_on: int,
             base_drift_detector: base.DriftDetector, base_warning_detector: base.DriftDetector,
             is_background_learner, base_metric: RegressionMetric):
    """Init an ensemble member wrapping a tree regressor.

    All constructor arguments are forwarded unchanged to the parent class; this
    subclass only adds a variance tracker used for drift monitoring.
    """
    super().__init__(index_original=index_original, base_model=base_model, created_on=created_on,
                     base_drift_detector=base_drift_detector,
                     base_warning_detector=base_warning_detector,
                     is_background_learner=is_background_learner, base_metric=base_metric)

    self._var = Var()  # Used to track drift
def __init__(self, att_val, target_val, sample_weight):
    """Create an E-BST node holding one attribute value and its target stats.

    The first observation is immediately folded into the estimator.
    """
    self.att_val = att_val

    if isinstance(target_val, dict):  # Multi-target case
        self.estimator = VectorDict(default_factory=functools.partial(Var))
        self._update_estimator = self._update_estimator_multivariate
    else:
        self.estimator = Var()
        self._update_estimator = self._update_estimator_univariate
    self._update_estimator(self, target_val, sample_weight)

    # Children of this binary-search-tree node
    self._left = self._right = None
def __init__(self, stats, depth, attr_obs, attr_obs_params):
    """Init the leaf, enforcing the usage of `Var` to keep track of target statistics."""
    stats = stats if stats is not None else Var()
    super().__init__(stats, depth, attr_obs, attr_obs_params)
def __init__(self, stats, depth, splitter, **kwargs):
    """Init the leaf, enforcing the usage of `Var` to keep track of target statistics."""
    super().__init__(Var() if stats is None else stats, depth, splitter, **kwargs)
def remove_bad_splits(self, criterion, last_check_ratio, last_check_vr, last_check_e,
                      pre_split_dist):
    """Remove bad splits.

    Based on FIMT-DD's [^1] procedure to remove bad split candidates from the E-BST. This
    mechanism is triggered every time a split attempt fails. The rationale is to remove
    points whose split merit is much worse than the best candidate overall (for which the
    growth decision already failed).

    Let $m_1$ be the merit of the best split point and $m_2$ be the merit of the second best
    split candidate. The ratio $r = m_2/m_1$ along with the Hoeffding bound ($\\epsilon$) are
    used to decide upon creating a split. A split occurs when $r < 1 - \\epsilon$. A split
    candidate, with merit $m_i$, is considered bad if $m_i / m_1 < r - 2\\epsilon$. The
    rationale is the following: if the merit ratio for this point is smaller than the lower
    bound of $r$, then the true merit of that split relative to the best one is small. Hence,
    this candidate can be safely removed.

    To avoid excessive and costly manipulations of the E-BST to update the stored statistics,
    only the nodes whose children are all bad split points are pruned, as defined in [^1].

    Parameters
    ----------
    criterion
        The split criterion used by the regression tree.
    last_check_ratio
        The ratio between the merit of the second best split candidate and the merit of the
        best split candidate observed in the last failed split attempt.
    last_check_vr
        The merit (variance reduction) of the best split candidate observed in the last
        failed split attempt.
    last_check_e
        The Hoeffding bound value calculated in the last failed split attempt.
    pre_split_dist
        The complete statistics of the target observed in the leaf node.

    References
    ----------
    [^1]: Ikonomovska, E., Gama, J., & Džeroski, S. (2011). Learning model trees from
    evolving data streams. Data mining and knowledge discovery, 23(1), 128-168.
    """
    # Nothing to prune in an empty tree
    if self._root is None:
        return

    # Auxiliary variables shared with the recursive pruning helper
    self._criterion = criterion
    self._pre_split_dist = pre_split_dist
    self._last_check_ratio = last_check_ratio
    self._last_check_vr = last_check_vr
    self._last_check_e = last_check_e

    # Handles both single-target and multi-target tasks
    if isinstance(pre_split_dist, VectorDict):
        self._aux_estimator = VectorDict(
            default_factory=functools.partial(Var))
    else:
        self._aux_estimator = Var()

    self._remove_bad_split_nodes(self._root)

    # Delete auxiliary variables
    del self._criterion
    del self._pre_split_dist
    del self._last_check_ratio
    del self._last_check_vr
    del self._last_check_e
    del self._aux_estimator
def __init__(self, radius: float = 0.01):
    """Init a quantization-based splitter.

    Parameters
    ----------
    radius
        The quantization radius; non-positive values fall back to the default (0.01).
    """
    super().__init__()
    # Guard against invalid radii while keeping the declared default
    if radius > 0:
        self.radius = radius
    else:
        self.radius = 0.01
    self._x_var = Var()
    self._quantizer = FeatureQuantizer(radius=self.radius)
def reset(self, n_samples_seen):
    """Reset the member and start a fresh variance tracker for drift detection."""
    super().reset(n_samples_seen)

    # Discard the previously accumulated drift-detection statistics
    self._var = Var()
def __init__(self):
    """Init running second-order statistics for two streams and their covariance."""
    # Variance tracker for the `g` stream (presumably gradients — TODO confirm)
    self.g_var = Var()
    # Variance tracker for the `h` stream (presumably hessians — TODO confirm)
    self.h_var = Var()
    # Covariance tracker between the `g` and `h` streams
    self.gh_cov = Cov()