def __init__(
    self,
    # Forest parameters
    n_models: int = 10,
    max_features="sqrt",
    aggregation_method: str = "median",
    lambda_value: int = 6,
    metric: metrics.RegressionMetric = metrics.MSE(),
    disable_weighted_vote=True,
    drift_detector: base.DriftDetector = ADWIN(0.001),
    warning_detector: base.DriftDetector = ADWIN(0.01),
    # Tree parameters
    grace_period: int = 50,
    max_depth: int = None,
    split_confidence: float = 0.01,
    tie_threshold: float = 0.05,
    leaf_prediction: str = "model",
    leaf_model: base.Regressor = None,
    model_selector_decay: float = 0.95,
    nominal_attributes: list = None,
    splitter: Splitter = None,
    min_samples_split: int = 5,
    binary_split: bool = False,
    max_size: int = 500,
    memory_estimate_period: int = 2_000_000,
    stop_mem_management: bool = False,
def __init__(self, stats, depth, attr_obs, attr_obs_params, adwin_delta, seed):
    super().__init__(stats, depth, attr_obs, attr_obs_params)
    self.adwin_delta = adwin_delta
    self._adwin = ADWIN(delta=self.adwin_delta)
    self.error_change = False
    self._rng = check_random_state(seed)
def __init__(
    self,
    n_models: int = 10,
    max_features: typing.Union[bool, str, int] = "sqrt",
    lambda_value: int = 6,
    metric: metrics.MultiClassMetric = metrics.Accuracy(),
    disable_weighted_vote=False,
    drift_detector: typing.Union[base.DriftDetector, None] = ADWIN(delta=0.001),
    warning_detector: typing.Union[base.DriftDetector, None] = ADWIN(delta=0.01),
    # Tree parameters
    grace_period: int = 50,
    max_depth: int = None,
    split_criterion: str = "info_gain",
    split_confidence: float = 0.01,
    tie_threshold: float = 0.05,
    leaf_prediction: str = "nba",
    nb_threshold: int = 0,
    nominal_attributes: list = None,
    splitter: Splitter = None,
    binary_split: bool = False,
    max_size: int = 32,
    memory_estimate_period: int = 2_000_000,
    stop_mem_management: bool = False,
def __init__(self,
             model=HoeffdingTreeClassifier(grace_period=50, split_confidence=0.01),
             n_models: int = 100,
             subspace_size: typing.Union[int, float, str] = .6,
             training_method: str = "patches",
             lam: float = 6.0,
             drift_detector: typing.Union[base.DriftDetector, None] = ADWIN(delta=1e-5),
             warning_detector: base.DriftDetector = ADWIN(delta=1e-4),
             disable_weighted_vote: bool = False,
             nominal_attributes=None,
             seed=None,
             metric: MultiClassMetric = Accuracy()):
    super().__init__([None])  # List of models is properly initialized later
    self.models = []
    self.model = model  # Not restricted to a specific base estimator.
    self.n_models = n_models
    self.subspace_size = subspace_size
    self.training_method = training_method
    self.lam = lam
    self.drift_detector = drift_detector
    self.warning_detector = warning_detector
    self.disable_weighted_vote = disable_weighted_vote
    self.metric = metric
    self.nominal_attributes = nominal_attributes if nominal_attributes else []
    self.seed = seed
    self._rng = check_random_state(self.seed)
    self._n_samples_seen = 0
    self._subspaces = None
    self._base_learner_class = StreamingRandomPatchesBaseLearner
def __init__(self, split_test, stats, depth, adwin_delta, seed):
    super().__init__(split_test, stats, depth)
    self.adwin_delta = adwin_delta
    self._adwin = ADWIN(delta=self.adwin_delta)
    self._alternate_tree = None
    self._error_change = False
    self._rng = check_random_state(seed)
def __init__(self, stats, *children, adwin_delta, seed, **attributes):
    super().__init__(stats, *children, **attributes)
    self.adwin_delta = adwin_delta
    self._adwin = ADWIN(delta=self.adwin_delta)
    self._alternate_tree = None
    self._error_change = False
    self._rng = check_random_state(seed)
def __init__(self,
             n_models: int = 10,
             max_features: typing.Union[bool, str, int] = 'sqrt',
             lambda_value: int = 6,
             metric: MultiClassMetric = Accuracy(),
             disable_weighted_vote=False,
             drift_detector: typing.Union[base.DriftDetector, None] = ADWIN(delta=0.001),
             warning_detector: typing.Union[base.DriftDetector, None] = ADWIN(delta=0.01),
             # Tree parameters
             max_size: int = 32,
             memory_estimate_period: int = 2000000,
             grace_period: int = 50,
             split_criterion: str = 'info_gain',
             split_confidence: float = 0.01,
             tie_threshold: float = 0.05,
             binary_split=False,
             stop_mem_management=False,
             remove_poor_attrs=False,
             merit_preprune=True,
             leaf_prediction: str = 'nba',
             nb_threshold: int = 0,
             nominal_attributes: list = None,
             attr_obs: str = 'gaussian',
             attr_obs_params: dict = None,
             max_depth: int = None,
             seed=None):
    super().__init__(n_models=n_models,
                     max_features=max_features,
                     lambda_value=lambda_value,
                     metric=metric,
                     disable_weighted_vote=disable_weighted_vote,
                     drift_detector=drift_detector,
                     warning_detector=warning_detector,
                     seed=seed)
    self._n_samples_seen = 0
    self._base_member_class = ForestMemberClassifier

    # Tree parameters
    self.max_size = max_size
    self.memory_estimate_period = memory_estimate_period
    self.grace_period = grace_period
    self.split_criterion = split_criterion
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.binary_split = binary_split
    self.stop_mem_management = stop_mem_management
    self.remove_poor_attrs = remove_poor_attrs
    self.merit_preprune = merit_preprune
    self.leaf_prediction = leaf_prediction
    self.nb_threshold = nb_threshold
    self.nominal_attributes = nominal_attributes
    self.attr_obs = attr_obs
    self.attr_obs_params = attr_obs_params
    self.max_depth = max_depth
def __init__(
    self,
    n_models: int = 10,
    max_features: typing.Union[bool, str, int] = "sqrt",
    lambda_value: int = 6,
    metric: metrics.MultiClassMetric = metrics.Accuracy(),
    disable_weighted_vote=False,
    drift_detector: typing.Union[base.DriftDetector, None] = ADWIN(delta=0.001),
    warning_detector: typing.Union[base.DriftDetector, None] = ADWIN(delta=0.01),
    # Tree parameters
    grace_period: int = 50,
    max_depth: int = None,
    split_criterion: str = "info_gain",
    split_confidence: float = 0.01,
    tie_threshold: float = 0.05,
    leaf_prediction: str = "nba",
    nb_threshold: int = 0,
    nominal_attributes: list = None,
    attr_obs: str = "gaussian",
    attr_obs_params: dict = None,
    max_size: int = 32,
    memory_estimate_period: int = 2000000,
    seed: int = None,
    **kwargs,
):
    super().__init__(
        n_models=n_models,
        max_features=max_features,
        lambda_value=lambda_value,
        metric=metric,
        disable_weighted_vote=disable_weighted_vote,
        drift_detector=drift_detector,
        warning_detector=warning_detector,
        seed=seed,
    )
    self._n_samples_seen = 0
    self._base_member_class = ForestMemberClassifier

    # Tree parameters
    self.grace_period = grace_period
    self.max_depth = max_depth
    self.split_criterion = split_criterion
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.leaf_prediction = leaf_prediction
    self.nb_threshold = nb_threshold
    self.nominal_attributes = nominal_attributes
    self.attr_obs = attr_obs
    self.attr_obs_params = attr_obs_params
    self.max_size = max_size
    self.memory_estimate_period = memory_estimate_period
    self.kwargs = kwargs
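# A hedged usage sketch for the adaptive random forest classifier whose __init__
# appears above. The public class name (ensemble.AdaptiveRandomForestClassifier)
# and the dataset are assumptions based on river's documented API, not taken from
# the snippet itself.
from river import datasets, ensemble, evaluate, metrics

model = ensemble.AdaptiveRandomForestClassifier(n_models=3, seed=42)
metric = metrics.Accuracy()
evaluate.progressive_val_score(datasets.Phishing(), model, metric)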
def __init__(
    self,
    model: base.Estimator = None,
    n_models: int = 10,
    subspace_size: typing.Union[int, float, str] = 0.6,
    training_method: str = "patches",
    lam: float = 6.0,
    drift_detector: base.DriftDetector = None,
    warning_detector: base.DriftDetector = None,
    disable_detector: str = "off",
    disable_weighted_vote: bool = False,
    seed=None,
    metric: Metric = None,
):
    if model is None:
        model = HoeffdingTreeClassifier(grace_period=50, split_confidence=0.01)

    if drift_detector is None:
        drift_detector = ADWIN(delta=1e-5)

    if warning_detector is None:
        warning_detector = ADWIN(delta=1e-4)

    if disable_detector == "off":
        pass
    elif disable_detector == "drift":
        drift_detector = None
        warning_detector = None
    elif disable_detector == "warning":
        warning_detector = None
    else:
        raise AttributeError(
            f"{disable_detector} is not a valid value for disable_detector.\n"
            f"Valid options are: 'off', 'drift', 'warning'")

    if metric is None:
        metric = Accuracy()

    super().__init__(
        model=model,
        n_models=n_models,
        subspace_size=subspace_size,
        training_method=training_method,
        lam=lam,
        drift_detector=drift_detector,
        warning_detector=warning_detector,
        disable_detector=disable_detector,
        disable_weighted_vote=disable_weighted_vote,
        seed=seed,
        metric=metric,
    )
    self._base_learner_class = BaseSRPClassifier
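# A minimal usage sketch for the Streaming Random Patches classifier configured
# above. The public class name (ensemble.SRPClassifier) and the dataset are
# assumptions based on river's documented API; all constructor defaults shown
# in the __init__ above fill in the rest.
from river import datasets, ensemble, evaluate, metrics

model = ensemble.SRPClassifier(n_models=3, seed=42)
metric = metrics.Accuracy()
evaluate.progressive_val_score(datasets.Phishing(), model, metric)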
def adwin(data):
    detector = ADWIN()  # renamed from `adwin` to avoid shadowing the function
    drifts = []
    for row in data:
        in_drift, in_warning = detector.update(row['count'])
        if in_drift:
            print(f"Change detected at index {row['date']}, input value: {row['count']}")
            drifts.append({'date': row['date'], 'count': row['count']})
    return drifts
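# A hypothetical invocation of the helper above. The 'date'/'count' keys match
# what the function expects; the values are made up for illustration, and a
# real stream would contain many more rows.
rows = [
    {'date': '2021-01-01', 'count': 0},
    {'date': '2021-01-02', 'count': 1},
]
drifts = adwin(rows)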
def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None,
              parent_branch=None):
    if tree.bootstrap_sampling:
        # Perform bootstrap-sampling
        k = self._rng.poisson(1.0)
        if k > 0:
            sample_weight = sample_weight * k

    aux = self.prediction(x, tree=tree)
    class_prediction = max(aux, key=aux.get) if aux else None
    is_correct = y == class_prediction

    if self._adwin is None:
        self._adwin = ADWIN(delta=self.adwin_delta)

    old_error = self.error_estimation

    # Update ADWIN
    self._error_change, _ = self._adwin.update(int(not is_correct))

    # Error is decreasing
    if self._error_change and old_error > self.error_estimation:
        self._error_change = False

    # Update statistics
    super().learn_one(x, y, sample_weight=sample_weight, tree=tree)

    weight_seen = self.total_weight

    if weight_seen - self.last_split_attempt_at >= tree.grace_period:
        if self.depth >= tree.max_depth:
            # Depth-based pre-pruning
            self.deactivate()
            tree._n_inactive_leaves += 1
            tree._n_active_leaves -= 1
        elif self.is_active():
            tree._attempt_to_split(
                self,
                parent,
                parent_branch,
                adwin_delta=tree.adwin_confidence,
                seed=tree.seed,
            )
            self.last_split_attempt_at = weight_seen
def __init__(self, model: base.Classifier, n_models: int = 10, w: float = 6,
             adwin_delta: float = 0.002, bagging_method: str = 'bag',
             seed: int = None):
    super().__init__(model=model, n_models=n_models, seed=seed)
    self.n_detected_changes = 0
    self.w = w
    self.adwin_delta = adwin_delta
    self.bagging_method = bagging_method
    self._drift_detectors = [
        copy.deepcopy(ADWIN(delta=self.adwin_delta)) for _ in range(self.n_models)
    ]

    # Set bagging function
    if bagging_method == 'bag':
        self._bagging_fct = self._leveraging_bag
    elif bagging_method == 'me':
        self._bagging_fct = self._leveraging_bag_me
    elif bagging_method == 'half':
        self._bagging_fct = self._leveraging_bag_half
    elif bagging_method == 'wt':
        self._bagging_fct = self._leveraging_bag_wt
    elif bagging_method == 'subag':
        self._bagging_fct = self._leveraging_subag
    else:
        raise ValueError(f"Invalid bagging_method: {bagging_method}\n"
                         f"Valid options: {self._BAGGING_METHODS}")
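# A minimal instantiation sketch for the leveraging-bagging ensemble above.
# The public class name (ensemble.LeveragingBaggingClassifier) and the base
# model choice are assumptions based on river's documented API; the keyword
# arguments mirror the __init__ signature shown above.
from river import ensemble, linear_model

model = ensemble.LeveragingBaggingClassifier(
    model=linear_model.LogisticRegression(),
    n_models=10,
    w=6,
    adwin_delta=0.002,
    bagging_method='bag',  # one of: 'bag', 'me', 'half', 'wt', 'subag'
    seed=42,
)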
def learn_one(self, x, y):
    change_detected = False
    for i, model in enumerate(self):
        k = self._bagging_fct(x=x, y=y, model_idx=i)

        for _ in range(k):
            model.learn_one(x, y)

        y_pred = self.models[i].predict_one(x)
        if y_pred is not None:
            incorrectly_classifies = int(y_pred != y)
            error = self._drift_detectors[i].estimation
            self._drift_detectors[i].update(incorrectly_classifies)
            if self._drift_detectors[i].change_detected:
                if self._drift_detectors[i].estimation > error:
                    change_detected = True

    if change_detected:
        self.n_detected_changes += 1
        max_error_idx = max(
            range(len(self._drift_detectors)),
            key=lambda j: self._drift_detectors[j].estimation)
        self.models[max_error_idx] = copy.deepcopy(self.model)
        self._drift_detectors[max_error_idx] = ADWIN(delta=self.adwin_delta)
    return self
def learn_one(self, x, y):
    change_detected = False
    for i, model in enumerate(self):
        for _ in range(self._rng.poisson(1)):
            model.learn_one(x, y)
        try:
            y_pred = model.predict_one(x)
            error_estimation = self._drift_detectors[i].estimation
            self._drift_detectors[i].update(int(y_pred == y))
            if self._drift_detectors[i].change_detected:
                if self._drift_detectors[i].estimation > error_estimation:
                    change_detected = True
        except ValueError:
            change_detected = False

    if change_detected:
        max_error_idx = max(
            range(len(self._drift_detectors)),
            key=lambda j: self._drift_detectors[j].estimation)
        self.models[max_error_idx] = copy.deepcopy(self.model)
        self._drift_detectors[max_error_idx] = ADWIN()
    return self
def __adjust_ensemble_size(self):
    if len(self.classes) != len(self.ensemble):
        if len(self.classes) > len(self.ensemble):
            for i in range(len(self.ensemble), len(self.classes)):
                self.ensemble.append(cp.deepcopy(self.base_estimator))
                self.actual_n_estimators += 1
                self.adwin_ensemble.append(ADWIN())
def __init__(
    self,
    model: base.Classifier,
    param_grid,
    population_size=10,
    sampling_size=1,
    metric=metrics.Accuracy,
    sampling_rate=1000,
    w: float = 6,
    adwin_delta: float = 0.002,
    bagging_method: str = "bag",
    seed: int = None,
):
    param_iter = ParameterSampler(param_grid, population_size)
    param_list = list(param_iter)
    param_list = [dict((k, v) for (k, v) in d.items()) for d in param_list]

    super().__init__(
        self._initialize_model(model=model, params=params)
        for params in param_list)
    self.param_grid = param_grid
    self.population_size = population_size
    self.sampling_size = sampling_size
    self.metric = metric
    self.sampling_rate = sampling_rate

    self.n_models = population_size
    self.model = model
    self.seed = seed
    self._rng = np.random.RandomState(seed)
    self._i = 0
    self._population_metrics = [
        copy.deepcopy(metric()) for _ in range(self.n_models)
    ]
    self._drift_detectors = [
        copy.deepcopy(ADWIN(delta=adwin_delta)) for _ in range(self.n_models)
    ]
    self.n_detected_changes = 0
    self.w = w
    self.adwin_delta = adwin_delta
    self.bagging_method = bagging_method

    # Set bagging function
    if bagging_method == "bag":
        self._bagging_fct = self._leveraging_bag
    elif bagging_method == "me":
        self._bagging_fct = self._leveraging_bag_me
    elif bagging_method == "half":
        self._bagging_fct = self._leveraging_bag_half
    elif bagging_method == "wt":
        self._bagging_fct = self._leveraging_bag_wt
    elif bagging_method == "subag":
        self._bagging_fct = self._leveraging_subag
    else:
        raise ValueError(f"Invalid bagging_method: {bagging_method}\n"
                         f"Valid options: {self._BAGGING_METHODS}")
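# A hedged instantiation sketch for the population-based variant above.
# `EvolutionaryLeveragingBaggingClassifier` is a placeholder name for the class
# this __init__ belongs to (the snippet does not show it), and the param_grid
# keys assume a Hoeffding tree base model.
from river import metrics, tree

param_grid = {
    'grace_period': [50, 100, 200],
    'split_confidence': [1e-2, 1e-3],
}
model = EvolutionaryLeveragingBaggingClassifier(
    model=tree.HoeffdingTreeClassifier(),
    param_grid=param_grid,
    population_size=10,
    metric=metrics.Accuracy,  # the class itself; instances are deep-copied internally
    seed=42,
)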
def __configure(self):
    if hasattr(self.base_estimator, "reset"):
        self.base_estimator.reset()
    self.actual_n_estimators = self.n_estimators
    self.ensemble = [cp.deepcopy(self.base_estimator)
                     for _ in range(self.actual_n_estimators)]
    self.adwin_ensemble = [ADWIN(self.delta)
                           for _ in range(self.actual_n_estimators)]
    self._random_state = check_random_state(self.random_state)
    self.n_detected_changes = 0
    self.classes = None
    self.init_matrix_codes = True
def __adjust_ensemble_size(self):
    if len(self.classes) != len(self.ensemble):
        if len(self.classes) > len(self.ensemble):
            for i in range(len(self.ensemble), len(self.classes)):
                self.ensemble.append(cp.deepcopy(self.base_estimator))
                self.actual_n_estimators += 1
                self.adwin_ensemble.append(ADWIN())
            self.lam_sc = np.zeros(self.actual_n_estimators)
            self.lam_pos = np.zeros(self.actual_n_estimators)
            self.lam_neg = np.zeros(self.actual_n_estimators)
            self.lam_sw = np.zeros(self.actual_n_estimators)
            self.epsilon = np.zeros(self.actual_n_estimators)
def demo():
    """_test_adwin

    In this demo, an ADWIN object evaluates a sequence of numbers corresponding
    to 2 distributions. The ADWIN object indicates the indices where change is
    detected.

    The first half of the data is a sequence of randomly generated 0's and 1's.
    The second half of the data is a sequence of uniformly distributed integers
    from 0 to 7.
    """
    adwin = ADWIN()
    size = 2000
    change_start = 999
    np.random.seed(1)
    data_stream = np.random.randint(2, size=size)
    data_stream[change_start:] = np.random.randint(8, size=size - change_start)

    for i in range(size):
        change_detected, _ = adwin.update(data_stream[i])
        if change_detected:
            print('Change has been detected in data: ' + str(data_stream[i])
                  + ' - of index: ' + str(i))
def __configure(self):
    if hasattr(self.base_estimator, "reset"):
        self.base_estimator.reset()
    self.actual_n_estimators = self.n_estimators
    self.adwin_ensemble = []
    for i in range(self.actual_n_estimators):
        self.adwin_ensemble.append(ADWIN())
    self.ensemble = [
        cp.deepcopy(self.base_estimator) for _ in range(self.actual_n_estimators)
    ]
    self._random_state = check_random_state(self.random_state)
def __configure(self):
    if hasattr(self.base_estimator, "reset"):
        self.base_estimator.reset()
    self.actual_n_estimators = self.n_estimators
    self.adwin_ensemble = []
    for i in range(self.actual_n_estimators):
        self.adwin_ensemble.append(ADWIN())
    self.ensemble = [
        cp.deepcopy(self.base_estimator) for _ in range(self.actual_n_estimators)
    ]
    self._random_state = check_random_state(self.random_state)
    self.lam_sc = np.zeros(self.actual_n_estimators)
    self.lam_pos = np.zeros(self.actual_n_estimators)
    self.lam_neg = np.zeros(self.actual_n_estimators)
    self.lam_sw = np.zeros(self.actual_n_estimators)
    self.epsilon = np.zeros(self.actual_n_estimators)
    self.n_pos = 0
    self.n_neg = 0
def learn_one(self, x: dict, y: base.typing.ClfTarget, **kwargs):
    # Create Dataset if not initialized
    # Check if population needs to be updated
    if self._i % self.sampling_rate == 0:
        scores = [be.get() for be in self._population_metrics]
        idx_best = scores.index(max(scores))
        idx_worst = scores.index(min(scores))
        child = self._mutate_estimator(estimator=self[idx_best])
        self.models[idx_worst] = child
        # self.population_metrics[idx_worst] = copy.deepcopy(self.metric())

    change_detected = False
    for i, model in enumerate(self):
        self._population_metrics[i].update(y_true=y, y_pred=model.predict_one(x))
        k = self._bagging_fct(x=x, y=y, model_idx=i)

        for _ in range(k):
            model.learn_one(x, y)

        y_pred = self.models[i].predict_one(x)
        if y_pred is not None:
            incorrectly_classifies = int(y_pred != y)
            error = self._drift_detectors[i].estimation
            self._drift_detectors[i].update(incorrectly_classifies)
            if self._drift_detectors[i].change_detected:
                if self._drift_detectors[i].estimation > error:
                    change_detected = True

    if change_detected:
        self.n_detected_changes += 1
        max_error_idx = max(
            range(len(self._drift_detectors)),
            key=lambda j: self._drift_detectors[j].estimation,
        )
        self.models[max_error_idx] = copy.deepcopy(self.model)
        self._drift_detectors[max_error_idx] = ADWIN(delta=self.adwin_delta)
    return self
class AdaLearningNodeClassifier(LearningNodeNBA, AdaNode):
    """Learning node for Hoeffding Adaptive Tree.

    Parameters
    ----------
    stats
        Initial class observations.
    depth
        The depth of the learning node in the tree.
    attr_obs
        The numeric attribute observer algorithm used to monitor target
        statistics and perform split attempts.
    attr_obs_params
        The parameters passed to the numeric attribute observer algorithm.
    adwin_delta
        The delta parameter of ADWIN.
    seed
        Seed to control the generation of random numbers and support
        reproducibility.
    """

    def __init__(self, stats, depth, attr_obs, attr_obs_params, adwin_delta, seed):
        super().__init__(stats, depth, attr_obs, attr_obs_params)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self.error_change = False
        self._rng = check_random_state(seed)

    @property
    def n_leaves(self):
        return 1

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        return self._adwin.width

    def error_is_null(self):
        return self._adwin is None

    def kill_tree_children(self, hat):
        pass

    def learn_one(self, x, y, *, sample_weight=1., tree=None, parent=None,
                  parent_branch=-1):
        if tree.bootstrap_sampling:
            # Perform bootstrap-sampling
            k = self._rng.poisson(1.0)
            if k > 0:
                sample_weight = sample_weight * k

        aux = self.leaf_prediction(x, tree=tree)
        class_prediction = max(aux, key=aux.get) if aux else None

        is_correct = (y == class_prediction)

        if self._adwin is None:
            self._adwin = ADWIN(delta=self.adwin_delta)

        old_error = self.error_estimation

        # Update ADWIN
        self.error_change, _ = self._adwin.update(int(not is_correct))

        # Error is decreasing
        if self.error_change and old_error > self.error_estimation:
            self.error_change = False

        # Update statistics
        super().learn_one(x, y, sample_weight=sample_weight, tree=tree)

        weight_seen = self.total_weight

        if weight_seen - self.last_split_attempt_at >= tree.grace_period:
            if self.depth >= tree.max_depth:
                # Depth-based pre-pruning
                self.deactivate()
                tree._n_inactive_leaves += 1
                tree._n_active_leaves -= 1
            else:
                tree._attempt_to_split(self, parent, parent_branch)
                self.last_split_attempt_at = weight_seen

    # Override LearningNodeNBA
    def leaf_prediction(self, x, *, tree=None):
        if not self.stats:
            return

        prediction_option = tree.leaf_prediction
        if not self.is_active() or prediction_option == tree._MAJORITY_CLASS:
            dist = normalize_values_in_dict(self.stats, inplace=False)
        elif prediction_option == tree._NAIVE_BAYES:
            if self.total_weight >= tree.nb_threshold:
                dist = do_naive_bayes_prediction(x, self.stats,
                                                 self.attribute_observers)
            else:  # Use majority class
                dist = normalize_values_in_dict(self.stats, inplace=False)
        else:  # Naive Bayes Adaptive
            dist = super().leaf_prediction(x, tree=tree)

        dist_sum = sum(dist.values())
        normalization_factor = dist_sum * self.error_estimation * self.error_estimation

        # Weight the node's responses according to the estimated error monitored
        # by ADWIN. Useful if both the predictions of the alternate tree and the
        # ones from the main tree are combined -> give preference to the most
        # accurate one
        dist = normalize_values_in_dict(dist, normalization_factor, inplace=False)

        return dist

    # Override AdaNode: enable option vote (query potentially more than one leaf
    # for responses)
    def filter_instance_to_leaves(self, x, parent, parent_branch, found_nodes):
        found_nodes.append(FoundNode(self, parent, parent_branch))
def partial_fit(self, X, y, classes=None, sample_weight=None):
    r"""Partially fits the model on the X and y matrices.

    Since this is an ensemble learner, if X and y contain more than one
    sample, the algorithm will partial fit the model one sample at a time.

    Each sample is trained by each classifier a total of K times, where K is
    drawn from a :math:`Poisson(\lambda)` distribution. :math:`\lambda` is
    updated after every example using :math:`\lambda_{sc}` if the estimator
    correctly classifies the example, or :math:`\lambda_{sw}` otherwise.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        The features to train the model.
    y : numpy.ndarray of shape (n_samples)
        An array-like with the class labels of all samples in X.
    classes : numpy.ndarray, optional (default=None)
        Array with all possible/known class labels. This is an optional
        parameter, except for the first partial_fit call where it is
        compulsory.
    sample_weight : array-like
        Instance weight. If not provided, uniform weights are assumed.
        Usage varies depending on the base estimator.

    Raises
    ------
    ValueError
        Raised if the 'classes' parameter is not passed in the first
        partial_fit call, or if it is passed in further calls but differs
        from the initial classes list.

    Returns
    -------
    self
    """
    if self.ensemble is None:
        self.__configure()

    if self.classes is None:
        if classes is None:
            raise ValueError(
                "The first partial_fit call should pass all the classes.")
        else:
            self.classes = classes

    if self.classes is not None and classes is not None:
        if set(self.classes) == set(classes):
            pass
        else:
            raise ValueError(
                "The classes passed to the partial_fit function differ "
                "from those passed earlier.")

    self.__adjust_ensemble_size()

    r, _ = get_dimensions(X)
    for j in range(r):
        change_detected = False
        lam = 1
        for i in range(self.actual_n_estimators):
            a = (i + 1) / self.actual_n_estimators
            if y[j] == 1:
                self.pos_samples.append(X[j])
                lam = a * self.sampling_rate
                lam_smote = (1 - a) * self.sampling_rate
                k = self._random_state.poisson(lam)
                if k > 0:
                    for b in range(k):
                        self.ensemble[i].partial_fit([X[j]], [y[j]], classes,
                                                     sample_weight)
                k_smote = self._random_state.poisson(lam_smote)
                if k_smote > 0:
                    for b in range(k_smote):
                        x_smote = self.online_smote()
                        self.ensemble[i].partial_fit([x_smote], [y[j]], classes,
                                                     sample_weight)
            else:
                k = self._random_state.poisson(lam)
                if k > 0:
                    for b in range(k):
                        self.ensemble[i].partial_fit([X[j]], [y[j]], classes,
                                                     sample_weight)

            if self.drift_detection:
                try:
                    pred = self.ensemble[i].predict(X)
                    error_estimation = self.adwin_ensemble[i].estimation
                    for k in range(r):
                        if pred[k] is not None:
                            self.adwin_ensemble[i].update(int(pred[k] == y[k]))
                    if self.adwin_ensemble[i].change_detected:
                        if self.adwin_ensemble[i].estimation > error_estimation:
                            change_detected = True
                except ValueError:
                    change_detected = False

        if change_detected and self.drift_detection:
            max_threshold = 0.0
            i_max = -1
            for i in range(self.actual_n_estimators):
                if max_threshold < self.adwin_ensemble[i].estimation:
                    max_threshold = self.adwin_ensemble[i].estimation
                    i_max = i
            if i_max != -1:
                self.ensemble[i_max].reset()
                self.adwin_ensemble[i_max] = ADWIN()

    return self
def __partial_fit(self, X, y):
    if self.init_matrix_codes and self.enable_code_matrix:
        self.__init_output_codes()

    change_detected = False
    for i in range(self.actual_n_estimators):
        # leveraging_bag - Leveraging Bagging
        if self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[0]:
            k = self._random_state.poisson(self.w)

        # leveraging_bag_me - Misclassification Error
        elif self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[1]:
            error = self.adwin_ensemble[i].estimation
            pred = self.ensemble[i].predict(np.asarray([X]))
            if pred is None:
                k = 1.0
            elif pred[0] != y:
                k = 1.0
            elif (error != 1.0
                  and self._random_state.rand() < (error / (1.0 - error))):
                k = 1.0
            else:
                k = 0.0

        # leveraging_bag_half - Resampling without replacement for
        # half of the instances
        elif self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[2]:
            w = 1.0
            k = 0.0 if (self._random_state.randint(2) == 1) else w

        # leveraging_bag_wt - Without taking out all instances
        elif self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[3]:
            w = 1.0
            k = 1.0 + self._random_state.poisson(w)

        # leveraging_subag - Resampling without replacement
        elif self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[4]:
            w = 1.0
            k = self._random_state.poisson(1)
            k = w if k > 0 else 0

        else:
            raise RuntimeError("Invalid option for leverage_algorithm: '{}'\n"
                               "Valid options are: {}".format(
                                   self.leverage_algorithm,
                                   self._LEVERAGE_ALGORITHMS))

        y_coded = cp.deepcopy(y)
        if k > 0:
            classes = self.classes
            if self.enable_code_matrix:
                y_coded = self.matrix_codes[i][int(y)]
                classes = [0, 1]
            for _ in range(int(k)):
                self.ensemble[i].partial_fit(X=np.asarray([X]),
                                             y=np.asarray([y_coded]),
                                             classes=classes)

        pred = self.ensemble[i].predict(np.asarray([X]))
        if pred is not None:
            add = 0 if (pred[0] == y_coded) else 1
            error = self.adwin_ensemble[i].estimation
            self.adwin_ensemble[i].update(add)
            if self.adwin_ensemble[i].change_detected:
                if self.adwin_ensemble[i].estimation > error:
                    change_detected = True

    if change_detected:
        self.n_detected_changes += 1
        max_threshold = 0.0
        i_max = -1
        for i in range(self.actual_n_estimators):
            if max_threshold < self.adwin_ensemble[i].estimation:
                max_threshold = self.adwin_ensemble[i].estimation
                i_max = i
        if i_max != -1:
            self.ensemble[i_max].reset()
            self.adwin_ensemble[i_max] = ADWIN(self.delta)

    return self
def test_adwin():
    expected_indices = [1055, 1087, 1151]
    detected_indices = perform_test(ADWIN(), data_stream_1)
    assert detected_indices == expected_indices
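# `perform_test` and `data_stream_1` are defined elsewhere in the test module.
# A plausible sketch of the helper, reconstructed from how the test uses it;
# the loop assumes ADWIN.update returns an (in_drift, in_warning) tuple, as in
# the other snippets above.
def perform_test(detector, data_stream):
    detected_indices = []
    for i, val in enumerate(data_stream):
        in_drift, _ = detector.update(val)
        if in_drift:
            detected_indices.append(i)
    return detected_indices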
def __init__(
    self,
    # Forest parameters
    n_models: int = 10,
    max_features="sqrt",
    aggregation_method: str = "median",
    lambda_value: int = 6,
    metric: metrics.RegressionMetric = metrics.MSE(),
    disable_weighted_vote=True,
    drift_detector: base.DriftDetector = ADWIN(0.001),
    warning_detector: base.DriftDetector = ADWIN(0.01),
    # Tree parameters
    grace_period: int = 50,
    max_depth: int = None,
    split_confidence: float = 0.01,
    tie_threshold: float = 0.05,
    leaf_prediction: str = "model",
    leaf_model: base.Regressor = None,
    model_selector_decay: float = 0.95,
    nominal_attributes: list = None,
    splitter: Splitter = None,
    min_samples_split: int = 5,
    max_size: int = 100,
    memory_estimate_period: int = 2000000,
    seed: int = None,
    **kwargs,
):
    super().__init__(
        n_models=n_models,
        max_features=max_features,
        lambda_value=lambda_value,
        metric=metric,
        disable_weighted_vote=disable_weighted_vote,
        drift_detector=drift_detector,
        warning_detector=warning_detector,
        seed=seed,
    )
    self._n_samples_seen = 0
    self._base_member_class = ForestMemberRegressor

    # Tree parameters
    self.grace_period = grace_period
    self.max_depth = max_depth
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.leaf_prediction = leaf_prediction
    self.leaf_model = leaf_model
    self.model_selector_decay = model_selector_decay
    self.nominal_attributes = nominal_attributes
    self.splitter = splitter
    self.min_samples_split = min_samples_split
    self.max_size = max_size
    self.memory_estimate_period = memory_estimate_period
    self.kwargs = kwargs

    if aggregation_method in self._VALID_AGGREGATION_METHOD:
        self.aggregation_method = aggregation_method
    else:
        raise ValueError(
            f"Invalid aggregation_method: {aggregation_method}.\n"
            f"Valid values are: {self._VALID_AGGREGATION_METHOD}"
        )
def __init__(self, model: base.Classifier, n_models=10, seed: int = None):
    super().__init__(model=model, n_models=n_models, seed=seed)
    self._drift_detectors = [
        copy.deepcopy(ADWIN()) for _ in range(self.n_models)
    ]
# ADWIN
import numpy as np
from river.drift import ADWIN

np.random.seed(12345)

adwin = ADWIN()

# Simulate a data stream composed by two data distributions
data_stream = np.concatenate(
    (np.random.randint(2, size=1000), np.random.randint(4, high=8, size=1000)))

# Update drift detector and verify if change is detected
for i, val in enumerate(data_stream):
    in_drift, in_warning = adwin.update(val)
    if in_drift:
        print(f"Change detected at index {i}, input value: {val}")
class AdaptiveRandomForestRegressor(BaseForest, base.Regressor):
    r"""Adaptive Random Forest regressor.

    The 3 most important aspects of Adaptive Random Forest [^1] are:

    1. inducing diversity through re-sampling

    2. inducing diversity through randomly selecting subsets of features for
       node splits

    3. drift detectors per base tree, which cause selective resets in response
       to drifts

    Notice that this implementation is slightly different from the original
    algorithm proposed in [^2]. The `HoeffdingTreeRegressor` is used as base
    learner, instead of `FIMT-DD`. It also adds a new strategy to monitor the
    predictions and check for concept drifts. The deviations of the predictions
    to the target are monitored and normalized in the [0, 1] range to fulfill
    ADWIN's requirements. We assume that the data subjected to the normalization
    follows a normal distribution, and thus, lies within the interval of the
    mean $\pm3\sigma$.

    Parameters
    ----------
    n_models
        Number of trees in the ensemble.
    max_features
        Max number of attributes for each node split.<br/>
        - If `int`, then consider `max_features` at each split.<br/>
        - If `float`, then `max_features` is a percentage and
          `int(max_features * n_features)` features are considered per split.<br/>
        - If "sqrt", then `max_features=sqrt(n_features)`.<br/>
        - If "log2", then `max_features=log2(n_features)`.<br/>
        - If None, then `max_features=n_features`.
    lambda_value
        The lambda value for bagging (lambda=6 corresponds to Leveraging
        Bagging).
    metric
        Metric used to track trees performance within the ensemble. Depending
        on the configuration, this metric is also used to weight predictions
        from the members of the ensemble.
    aggregation_method
        The method to use to aggregate predictions in the ensemble.<br/>
        - 'mean'<br/>
        - 'median' - If selected will disable the weighted vote.
    disable_weighted_vote
        If `True`, disables the weighted vote prediction, i.e. does not assign
        weights to individual tree's predictions and uses the arithmetic mean
        instead. Otherwise will use the `metric` value to weight predictions.
    drift_detector
        Drift Detection method. Set to None to disable Drift detection.
    warning_detector
        Warning Detection method. Set to None to disable warning detection.
    grace_period
        [*Tree parameter*] Number of instances a leaf should observe between
        split attempts.
    max_depth
        [*Tree parameter*] The maximum depth a tree can reach. If `None`, the
        tree will grow indefinitely.
    split_confidence
        [*Tree parameter*] Allowed error in split decision, a value closer to
        0 takes longer to decide.
    tie_threshold
        [*Tree parameter*] Threshold below which a split will be forced to
        break ties.
    leaf_prediction
        [*Tree parameter*] Prediction mechanism used at leaves.<br/>
        - 'mean' - Target mean<br/>
        - 'model' - Uses the model defined in `leaf_model`<br/>
        - 'adaptive' - Chooses between 'mean' and 'model' dynamically<br/>
    leaf_model
        [*Tree parameter*] The regression model used to provide responses if
        `leaf_prediction='model'`. If not provided, an instance of
        `river.linear_model.LinearRegression` with the default hyperparameters
        is used.
    model_selector_decay
        The exponential decaying factor applied to the learning models' squared
        errors, that are monitored if `leaf_prediction='adaptive'`. Must be
        between `0` and `1`. The closer to `1`, the more importance is going to
        be given to past observations. On the other hand, if its value
        approaches `0`, the recent observed errors are going to have more
        influence on the final decision.
    nominal_attributes
        [*Tree parameter*] List of Nominal attributes. If empty, then assume
        that all attributes are numerical.
    attr_obs
        [*Tree parameter*] The attribute observer (AO) used to monitor the
        target statistics of numeric features and perform splits. Parameters
        can be passed to the AOs (when supported) by using `attr_obs_params`.
        Valid options are:<br/>
        - `'e-bst'`: Extended Binary Search Tree (E-BST). This AO has no
          parameters.<br/>
        See notes for more information about the supported AOs.
    attr_obs_params
        [*Tree parameter*] Parameters passed to the numeric AOs. See
        `attr_obs` for more information.
    min_samples_split
        [*Tree parameter*] The minimum number of samples every branch
        resulting from a split candidate must have to be considered valid.
    max_size
        [*Tree parameter*] Maximum memory (MB) consumed by the tree.
    memory_estimate_period
        [*Tree parameter*] Number of instances between memory consumption
        checks.
    seed
        If `int`, `seed` is used to seed the random number generator;
        If `RandomState`, `seed` is the random number generator;
        If `None`, the random number generator is the `RandomState` instance
        used by `np.random`.
    kwargs
        Other parameters passed to `river.tree.BaseHoeffdingTree`.

    Notes
    -----
    Hoeffding trees rely on Attribute Observer (AO) algorithms to monitor input
    features and perform splits. Nominal features can be easily dealt with,
    since the partitions are well-defined. Numerical features, however, require
    more sophisticated solutions. Currently, only one AO is supported in
    `river` for regression trees:

    - The Extended Binary Search Tree (E-BST) uses an exhaustive algorithm to
      find split candidates, similarly to batch decision tree algorithms. It
      ends up storing all observations between split attempts. However, E-BST
      automatically removes bad split points periodically from its structure
      and, thus, alleviates the memory and time costs involved in its usage.

    References
    ----------
    [^1]: Gomes, H.M., Bifet, A., Read, J., Barddal, J.P., Enembreck, F.,
          Pfahringer, B., Holmes, G. and Abdessalem, T., 2017. Adaptive random
          forests for evolving data stream classification. Machine Learning,
          106(9-10), pp.1469-1495.
    [^2]: Gomes, H.M., Barddal, J.P., Boiko, L.E., Bifet, A., 2018. Adaptive
          random forests for data stream regression. ESANN 2018.

    Examples
    --------
    >>> from river import datasets
    >>> from river import evaluate
    >>> from river import metrics
    >>> from river import ensemble
    >>> from river import preprocessing

    >>> dataset = datasets.TrumpApproval()

    >>> model = (
    ...     preprocessing.StandardScaler() |
    ...     ensemble.AdaptiveRandomForestRegressor(n_models=3, seed=42)
    ... )

    >>> metric = metrics.MAE()

    >>> evaluate.progressive_val_score(dataset, model, metric)
    MAE: 1.870913
    """

    _MEAN = "mean"
    _MEDIAN = "median"
    _VALID_AGGREGATION_METHOD = [_MEAN, _MEDIAN]

    def __init__(
        self,
        # Forest parameters
        n_models: int = 10,
        max_features="sqrt",
        aggregation_method: str = "median",
        lambda_value: int = 6,
        metric: metrics.RegressionMetric = metrics.MSE(),
        disable_weighted_vote=True,
        drift_detector: base.DriftDetector = ADWIN(0.001),
        warning_detector: base.DriftDetector = ADWIN(0.01),
        # Tree parameters
        grace_period: int = 50,
        max_depth: int = None,
        split_confidence: float = 0.01,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "model",
        leaf_model: base.Regressor = None,
        model_selector_decay: float = 0.95,
        nominal_attributes: list = None,
        attr_obs: str = "e-bst",
        attr_obs_params: dict = None,
        min_samples_split: int = 5,
        max_size: int = 100,
        memory_estimate_period: int = 2000000,
        seed: int = None,
        **kwargs,
    ):
        super().__init__(
            n_models=n_models,
            max_features=max_features,
            lambda_value=lambda_value,
            metric=metric,
            disable_weighted_vote=disable_weighted_vote,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            seed=seed,
        )
        self._n_samples_seen = 0
        self._base_member_class = ForestMemberRegressor

        # Tree parameters
        self.grace_period = grace_period
        self.max_depth = max_depth
        self.split_confidence = split_confidence
        self.tie_threshold = tie_threshold
        self.leaf_prediction = leaf_prediction
        self.leaf_model = leaf_model
        self.model_selector_decay = model_selector_decay
        self.nominal_attributes = nominal_attributes
        self.attr_obs = attr_obs
        self.attr_obs_params = attr_obs_params
        self.min_samples_split = min_samples_split
        self.max_size = max_size
        self.memory_estimate_period = memory_estimate_period
        self.kwargs = kwargs

        if aggregation_method in self._VALID_AGGREGATION_METHOD:
            self.aggregation_method = aggregation_method
        else:
            raise ValueError(
                f"Invalid aggregation_method: {aggregation_method}.\n"
                f"Valid values are: {self._VALID_AGGREGATION_METHOD}"
            )
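# The docstring above mentions that prediction deviations are normalized to
# [0, 1] under a normality assumption (mean +/- 3 sigma) before being fed to
# ADWIN. A standalone sketch of that idea; the function name and variables are
# illustrative, not the library's internals.
def normalize_drift_input(error, mean, sd):
    """Map a raw deviation onto [0, 1], assuming it lies within mean +/- 3*sd."""
    if sd == 0:
        return 0.5  # no spread observed yet; assumed neutral value
    norm = (error - (mean - 3.0 * sd)) / (6.0 * sd)
    return min(1.0, max(0.0, norm))  # clip values outside the assumed interval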