from river.drift import ADWIN


def adwin(data):
    """Run ADWIN over a stream of {'date', 'count'} records and return the detected drifts."""
    detector = ADWIN()
    drifts = []
    for row in data:
        in_drift, in_warning = detector.update(row['count'])
        if in_drift:
            print(f"Change detected at index {row['date']}, input value: {row['count']}")
            drifts.append({'date': row['date'], 'count': row['count']})
    return drifts
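# A minimal usage sketch for the helper above. The record layout ('date'/'count' keys)
# matches what adwin() expects; the sample values below are made up for illustration.
sample = (
    [{'date': f'2021-01-{d:02d}', 'count': 10 + d % 3} for d in range(1, 32)]
    + [{'date': f'2021-02-{d:02d}', 'count': 100 + d % 3} for d in range(1, 29)]
)
detected = adwin(sample)
print(detected)  # The records at which ADWIN flagged a change in distribution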
def demo(): """ _test_adwin In this demo, an ADWIN object evaluates a sequence of numbers corresponding to 2 distributions. The ADWIN object indicates the indices where change is detected. The first half of the data is a sequence of randomly generated 0's and 1's. The second half of the data is a normal distribution of integers from 0 to 7. """ adwin = ADWIN() size = 2000 change_start = 999 np.random.seed(1) data_stream = np.random.randint(2, size=size) data_stream[change_start:] = np.random.randint(8, size=size - change_start) for i in range(size): change_detected, _ = adwin.update(data_stream[i]) if change_detected: print('Change has been detected in data: ' + str(data_stream[i]) + ' - of index: ' + str(i))
# ADWIN
import numpy as np
from river.drift import ADWIN

np.random.seed(12345)

adwin = ADWIN()

# Simulate a data stream composed of two data distributions
data_stream = np.concatenate(
    (np.random.randint(2, size=1000), np.random.randint(4, high=8, size=1000))
)

# Update the drift detector and check whether a change is detected
for i, val in enumerate(data_stream):
    in_drift, in_warning = adwin.update(val)
    if in_drift:
        print(f"Change detected at index {i}, input value: {val}")
class AdaLearningNodeClassifier(LearningNodeNBA, AdaNode):
    """Learning node for Hoeffding Adaptive Tree.

    Parameters
    ----------
    stats
        Initial class observations.
    depth
        The depth of the learning node in the tree.
    attr_obs
        The numeric attribute observer algorithm used to monitor target statistics
        and perform split attempts.
    attr_obs_params
        The parameters passed to the numeric attribute observer algorithm.
    adwin_delta
        The delta parameter of ADWIN.
    seed
        Seed to control the generation of random numbers and support reproducibility.
    """
    def __init__(self, stats, depth, attr_obs, attr_obs_params, adwin_delta, seed):
        super().__init__(stats, depth, attr_obs, attr_obs_params)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self.error_change = False
        self._rng = check_random_state(seed)

    @property
    def n_leaves(self):
        return 1

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        return self._adwin.width

    def error_is_null(self):
        return self._adwin is None

    def kill_tree_children(self, hat):
        pass

    def learn_one(self, x, y, *, sample_weight=1., tree=None, parent=None, parent_branch=-1):
        if tree.bootstrap_sampling:
            # Perform bootstrap-sampling
            k = self._rng.poisson(1.0)
            if k > 0:
                sample_weight = sample_weight * k

        aux = self.leaf_prediction(x, tree=tree)
        class_prediction = max(aux, key=aux.get) if aux else None

        is_correct = (y == class_prediction)

        if self._adwin is None:
            self._adwin = ADWIN(delta=self.adwin_delta)

        old_error = self.error_estimation

        # Update ADWIN
        self.error_change, _ = self._adwin.update(int(not is_correct))

        # Error is decreasing
        if self.error_change and old_error > self.error_estimation:
            self.error_change = False

        # Update statistics
        super().learn_one(x, y, sample_weight=sample_weight, tree=tree)

        weight_seen = self.total_weight

        if weight_seen - self.last_split_attempt_at >= tree.grace_period:
            if self.depth >= tree.max_depth:
                # Depth-based pre-pruning
                self.deactivate()
                tree._n_inactive_leaves += 1
                tree._n_active_leaves -= 1
            else:
                tree._attempt_to_split(self, parent, parent_branch)
                self.last_split_attempt_at = weight_seen

    # Override LearningNodeNBA
    def leaf_prediction(self, x, *, tree=None):
        if not self.stats:
            return

        prediction_option = tree.leaf_prediction
        if not self.is_active() or prediction_option == tree._MAJORITY_CLASS:
            dist = normalize_values_in_dict(self.stats, inplace=False)
        elif prediction_option == tree._NAIVE_BAYES:
            if self.total_weight >= tree.nb_threshold:
                dist = do_naive_bayes_prediction(x, self.stats, self.attribute_observers)
            else:  # Use majority class
                dist = normalize_values_in_dict(self.stats, inplace=False)
        else:  # Naive Bayes Adaptive
            dist = super().leaf_prediction(x, tree=tree)

        dist_sum = sum(dist.values())
        normalization_factor = dist_sum * self.error_estimation * self.error_estimation

        # Weight the node's responses according to the estimated error monitored by ADWIN.
        # Useful if both the predictions of the alternate tree and the ones from the main
        # tree are combined -> give preference to the most accurate one
        dist = normalize_values_in_dict(dist, normalization_factor, inplace=False)

        return dist

    # Override AdaNode: enable option vote (query potentially more than one leaf for responses)
    def filter_instance_to_leaves(self, x, parent, parent_branch, found_nodes):
        found_nodes.append(FoundNode(self, parent, parent_branch))
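# The `tree.bootstrap_sampling` branch in `learn_one` above implements online bagging in
# the style of Oza & Russell: each incoming sample is re-weighted by a Poisson(1) draw,
# which approximates bootstrap resampling on a stream. Below is a standalone sketch of
# the idea; the function and variable names are illustrative, not part of the class.
import numpy as np

rng = np.random.default_rng(42)

def bootstrap_weight(base_weight=1.0):
    """Mirror the node's behavior: multiply the weight by a Poisson(1) draw,
    keeping the original weight when the draw is zero."""
    k = rng.poisson(1.0)
    return base_weight * k if k > 0 else base_weight

print([bootstrap_weight() for _ in range(10)])  # e.g. a mix of 1.0, 2.0, 3.0, ...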
class AdaSplitNodeClassifier(SplitNode, AdaNode):
    """Node that splits the data in a Hoeffding Adaptive Tree.

    Parameters
    ----------
    split_test
        Split test.
    stats
        Class observations.
    depth
        The depth of the node.
    adwin_delta
        The delta parameter of ADWIN.
    seed
        Internal random state used to sample from Poisson distributions.
    """
    def __init__(self, split_test, stats, depth, adwin_delta, seed):
        super().__init__(split_test, stats, depth)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._alternate_tree = None
        self._error_change = False
        self._rng = check_random_state(seed)

    @property
    def n_leaves(self):
        num_of_leaves = 0
        for child in self._children.values():
            if child is not None:
                num_of_leaves += child.n_leaves
        return num_of_leaves

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        w = 0.0
        if not self.error_is_null():
            w = self._adwin.width
        return w

    def error_is_null(self):
        return self._adwin is None

    def learn_one(self, x, y, *, sample_weight=1., tree=None, parent=None, parent_branch=-1):
        class_prediction = None

        leaf = self.filter_instance_to_leaf(x, parent, parent_branch)
        if leaf.node is not None:
            aux = leaf.node.leaf_prediction(x, tree=tree)
            class_prediction = max(aux, key=aux.get) if aux else None

        is_correct = (y == class_prediction)

        # Update stats as we traverse the tree to improve predictions (in case split nodes
        # are used to provide responses)
        try:
            self.stats[y] += sample_weight
        except KeyError:
            self.stats[y] = sample_weight

        if self._adwin is None:
            self._adwin = ADWIN(self.adwin_delta)

        old_error = self.error_estimation

        # Update ADWIN
        self._error_change, _ = self._adwin.update(int(not is_correct))

        # Classification error is decreasing: skip drift adaptation
        if self._error_change and old_error > self.error_estimation:
            self._error_change = False

        # Condition to build a new alternate tree
        if self._error_change:
            self._alternate_tree = tree._new_learning_node(parent=self)
            self._alternate_tree.depth -= 1  # To ensure we do not skip a tree level
            tree._n_alternate_trees += 1
        # Condition to replace alternate tree
        elif self._alternate_tree is not None and not self._alternate_tree.error_is_null():
            if self.error_width > tree.drift_window_threshold \
                    and self._alternate_tree.error_width > tree.drift_window_threshold:
                old_error_rate = self.error_estimation
                alt_error_rate = self._alternate_tree.error_estimation
                f_delta = .05
                f_n = 1.0 / self._alternate_tree.error_width + 1.0 / self.error_width

                bound = math.sqrt(2.0 * old_error_rate * (1.0 - old_error_rate)
                                  * math.log(2.0 / f_delta) * f_n)
                if bound < (old_error_rate - alt_error_rate):
                    tree._n_active_leaves -= self.n_leaves
                    tree._n_active_leaves += self._alternate_tree.n_leaves
                    self.kill_tree_children(tree)

                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                        self._alternate_tree = None
                    else:
                        # Switch tree root
                        tree._tree_root = tree._tree_root._alternate_tree
                    tree._n_switch_alternate_trees += 1
                elif bound < alt_error_rate - old_error_rate:
                    if not self._alternate_tree.is_leaf():
                        self._alternate_tree.kill_tree_children(tree)
                    self._alternate_tree = None
                    tree._n_pruned_alternate_trees += 1

        # Learn one sample in the alternate tree and the child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_one(x, y, sample_weight=sample_weight, tree=tree,
                                           parent=parent, parent_branch=parent_branch)
        child_branch = self.instance_child_index(x)
        child = self.get_child(child_branch)
        if child is not None:
            child.learn_one(x, y, sample_weight=sample_weight, tree=tree,
                            parent=self, parent_branch=child_branch)
        elif self.split_test.branch_for_instance(x) == -1:
            split_feat = self.split_test.attrs_test_depends_on()[0]
            # Instance contains a categorical value previously unseen by the split node
            if self.split_test.max_branches() == -1 and split_feat in x:
                # Create a new learning node to encompass the newly observed feature value
                leaf_node = tree._new_learning_node(parent=self)
                branch_id = self.split_test.add_new_branch(x[split_feat])
                self.set_child(branch_id, leaf_node)
                tree._n_active_leaves += 1
                leaf_node.learn_one(x, y, sample_weight=sample_weight, tree=tree,
                                    parent=self, parent_branch=branch_id)
            # The split feature is missing in the instance. Hence, we pass the new example
            # to the most traversed path in the current subtree
            else:
                path = max(
                    self._children,
                    key=lambda c: self._children[c].total_weight if self._children[c] else 0.
                )
                leaf_node = self.get_child(path)
                # Pass the instance down the most traversed path
                if leaf_node is None:
                    leaf_node = tree._new_learning_node(parent=self)
                    self.set_child(path, leaf_node)
                    tree._n_active_leaves += 1
                leaf_node.learn_one(x, y, sample_weight=sample_weight, tree=tree,
                                    parent=self, parent_branch=path)

    def leaf_prediction(self, x, *, tree=None):
        # In case split nodes end up being used (e.g., if an emerging categorical feature
        # value appears), use the MC (majority class) prediction strategy
        return normalize_values_in_dict(self.stats, inplace=False)

    # Override AdaNode
    def kill_tree_children(self, tree):
        for child_id, child in self._children.items():
            if child is not None:
                # Delete the alternate tree if it exists
                if not child.is_leaf():
                    if child._alternate_tree is not None:
                        child._alternate_tree.kill_tree_children(tree)
                        tree._n_pruned_alternate_trees += 1
                        child._alternate_tree = None

                    # Recursively delete SplitNodes
                    child.kill_tree_children(tree)
                    tree._n_decision_nodes -= 1
                else:
                    if child.is_active():
                        tree._n_active_leaves -= 1
                    else:
                        tree._n_inactive_leaves -= 1
                self._children[child_id] = None

    # Override AdaNode
    def filter_instance_to_leaves(self, x, parent, parent_branch, found_nodes):
        child_index = self.instance_child_index(x)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                child.filter_instance_to_leaves(x, parent, parent_branch, found_nodes)
            else:
                found_nodes.append(FoundNode(None, self, child_index))
        else:
            # An emerging value in a categorical feature appears or the split feature is
            # missing from the instance: use the parent node in both cases
            found_nodes.append(FoundNode(None, self, child_index))
        if self._alternate_tree is not None:
            self._alternate_tree.filter_instance_to_leaves(x, self, -999, found_nodes)
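# The replacement test in `learn_one` above compares the main and alternate error
# estimates against a Hoeffding-style bound: the alternate tree only takes over when its
# error rate beats the main subtree's by more than `bound`. Below is a self-contained
# sketch of that decision rule; the error rates and window widths are made up.
import math

def should_switch(main_error, alt_error, main_width, alt_width, f_delta=0.05):
    """Replicate the switch test used by the split node's drift adaptation."""
    f_n = 1.0 / alt_width + 1.0 / main_width
    bound = math.sqrt(2.0 * main_error * (1.0 - main_error)
                      * math.log(2.0 / f_delta) * f_n)
    return bound < (main_error - alt_error)

# With wide error windows, a modest improvement is already conclusive...
print(should_switch(0.30, 0.22, main_width=2000, alt_width=2000))  # True
# ...while over narrow windows the same gap stays within the bound.
print(should_switch(0.30, 0.22, main_width=60, alt_width=60))      # False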
class KNNADWINClassifier(KNNClassifier):
    """K-Nearest Neighbors classifier with ADWIN change detector.

    This classifier is an improvement over the regular kNN method, as it is resistant to
    concept drift. It uses the `ADWIN` change detector to decide which samples to keep and
    which ones to forget, and by doing so it regulates the sample window size.

    Parameters
    ----------
    n_neighbors
        The number of nearest neighbors to search for.
    window_size
        The maximum size of the window storing the last viewed samples.
    leaf_size
        The maximum number of samples that can be stored in one leaf node, which determines
        from which point the algorithm will switch to a brute-force approach. The bigger
        this number the faster the tree construction time, but the slower the query time
        will be.
    p
        p-norm value for the Minkowski metric. When `p=1`, this corresponds to the
        Manhattan distance, while `p=2` corresponds to the Euclidean distance. Valid values
        are in the interval $[1, +\\infty)$

    Notes
    -----
    - This estimator is not optimal for a mixture of categorical and numerical features.
      This implementation treats all features from a given stream as numerical.
    - This implementation is extended from the KNNClassifier, with the main difference that
      it keeps a dynamic window whose size changes in agreement with the amount of change
      detected by the ADWIN drift detector.

    Examples
    --------
    >>> from river import synth
    >>> from river import evaluate
    >>> from river import metrics
    >>> from river import neighbors

    >>> dataset = synth.ConceptDriftStream(position=500, width=20, seed=1).take(1000)

    >>> model = neighbors.KNNADWINClassifier(window_size=100)

    >>> metric = metrics.Accuracy()

    >>> evaluate.progressive_val_score(dataset, model, metric)
    Accuracy: 57.36%
    """

    def __init__(self, n_neighbors=5, window_size=1000, leaf_size=30, p=2):
        super().__init__(n_neighbors=n_neighbors, window_size=window_size,
                         leaf_size=leaf_size, p=p)
        self.adwin = ADWIN()

    def _unit_test_skips(self):
        return {"check_emerging_features", "check_disappearing_features"}

    def learn_one(self, x, y):
        """Update the model with a set of features `x` and a label `y`.

        Parameters
        ----------
        x
            A dictionary of features.
        y
            The class label.

        Returns
        -------
        self

        Notes
        -----
        For the K-Nearest Neighbors Classifier, fitting the model is the equivalent of
        inserting the newer samples in the observed window, and if the size_limit is
        reached, removing older results.
        """
        self.classes_.add(y)

        self.data_window.append(dict2numpy(x), y)
        if self.data_window.size >= self.n_neighbors:
            correctly_classifies = int(self.predict_one(x) == y)
            self.adwin.update(correctly_classifies)
        else:
            self.adwin.update(0)

        if self.data_window.size >= self.n_neighbors:
            if self.adwin.change_detected:
                # Drift detected: shrink the window to the width estimated by ADWIN
                if self.adwin.width < self.data_window.size:
                    for i in range(self.data_window.size, self.adwin.width, -1):
                        self.data_window.popleft()
        return self
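# When ADWIN flags a change, `learn_one` above shrinks the sample window down to ADWIN's
# estimated post-change width. Below is a deque-based sketch of that shrinking step;
# `window` and `adwin_width` are illustrative stand-ins for `self.data_window` and
# `self.adwin.width`.
from collections import deque

window = deque(range(100))  # Pretend these are the 100 most recent samples
adwin_width = 40            # Window width ADWIN estimates after detecting a change

# Drop the oldest samples until the window matches ADWIN's width, mirroring the
# popleft() loop in KNNADWINClassifier.learn_one
if adwin_width < len(window):
    for _ in range(len(window) - adwin_width):
        window.popleft()

print(len(window))  # 40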
class AdaBranchClassifier(HTBranch, AdaNode):
    """Node that splits the data in a Hoeffding Adaptive Tree.

    Parameters
    ----------
    stats
        Class observations.
    adwin_delta
        The delta parameter of ADWIN.
    seed
        Internal random state used to sample from Poisson distributions.
    children
        Sequence of children nodes of this branch.
    attributes
        Other parameters passed to the split node.
    """
    def __init__(self, stats, *children, adwin_delta, seed, **attributes):
        super().__init__(stats, *children, **attributes)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._alternate_tree = None
        self._error_change = False
        self._rng = check_random_state(seed)

    def traverse(self, x, until_leaf=True) -> typing.List[HTLeaf]:
        """Return the leaves corresponding to the given input.

        Alternate subtree leaves are also included.

        Parameters
        ----------
        x
            The input instance.
        until_leaf
            Whether or not branch nodes can be returned in case of missing features or
            emerging feature categories.
        """
        found_nodes = []
        for node in self.walk(x, until_leaf=until_leaf):
            if isinstance(node, AdaBranchClassifier) and node._alternate_tree is not None:
                if isinstance(node._alternate_tree, AdaBranchClassifier):
                    found_nodes.append(
                        node._alternate_tree.traverse(x, until_leaf=until_leaf))
                else:
                    found_nodes.append(node._alternate_tree)
        # The last node visited by the walk is the reached leaf
        found_nodes.append(node)
        return found_nodes

    def iter_leaves(self):
        """Iterate over leaves from the left-most one to the right-most one.

        Overrides the base implementation by also including alternate subtrees.
        """
        for child in self.children:
            yield from child.iter_leaves()

            if isinstance(child, AdaBranchClassifier) and child._alternate_tree is not None:
                yield from child._alternate_tree.iter_leaves()

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        w = 0.0
        if not self.error_is_null():
            w = self._adwin.width
        return w

    def error_is_null(self):
        return self._adwin is None

    def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None,
                  parent_branch=None):
        leaf = super().traverse(x, until_leaf=True)
        aux = leaf.prediction(x, tree=tree)
        class_prediction = max(aux, key=aux.get) if aux else None

        is_correct = y == class_prediction

        # Update stats as we traverse the tree to improve predictions (in case split nodes
        # are used to provide responses)
        try:
            self.stats[y] += sample_weight
        except KeyError:
            self.stats[y] = sample_weight

        if self._adwin is None:
            self._adwin = ADWIN(self.adwin_delta)

        old_error = self.error_estimation

        # Update ADWIN
        self._error_change, _ = self._adwin.update(int(not is_correct))

        # Classification error is decreasing: skip drift adaptation
        if self._error_change and old_error > self.error_estimation:
            self._error_change = False

        # Condition to build a new alternate tree
        if self._error_change:
            self._alternate_tree = tree._new_leaf(parent=self)
            self._alternate_tree.depth -= 1  # To ensure we do not skip a tree level
            tree._n_alternate_trees += 1
        # Condition to replace alternate tree
        elif self._alternate_tree is not None and not self._alternate_tree.error_is_null():
            if (self.error_width > tree.drift_window_threshold
                    and self._alternate_tree.error_width > tree.drift_window_threshold):
                old_error_rate = self.error_estimation
                alt_error_rate = self._alternate_tree.error_estimation
                f_delta = 0.05
                f_n = 1.0 / self._alternate_tree.error_width + 1.0 / self.error_width

                bound = math.sqrt(
                    2.0 * old_error_rate * (1.0 - old_error_rate)
                    * math.log(2.0 / f_delta) * f_n
                )
                if bound < (old_error_rate - alt_error_rate):
                    tree._n_active_leaves -= self.n_leaves
                    tree._n_active_leaves += self._alternate_tree.n_leaves
                    self.kill_tree_children(tree)

                    if parent is not None:
                        parent.children[parent_branch] = self._alternate_tree
                        self._alternate_tree = None
                    else:
                        # Switch tree root
                        tree._root = tree._root._alternate_tree
                    tree._n_switch_alternate_trees += 1
                elif bound < alt_error_rate - old_error_rate:
                    if isinstance(self._alternate_tree, HTBranch):
                        self._alternate_tree.kill_tree_children(tree)  # noqa
                    self._alternate_tree = None
                    tree._n_pruned_alternate_trees += 1

        # Learn one sample in the alternate tree and the child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_one(
                x,
                y,
                sample_weight=sample_weight,
                tree=tree,
                parent=parent,
                parent_branch=parent_branch,
            )
        try:
            child = self.next(x)
        except KeyError:
            child = None

        if child is not None:
            child.learn_one(
                x,
                y,
                sample_weight=sample_weight,
                tree=tree,
                parent=self,
                parent_branch=self.branch_no(x),
            )
        else:
            # Instance contains a categorical value previously unseen by the split node
            if self.max_branches() == -1 and self.feature in x:  # noqa
                # Create a new learning node to encompass the newly observed feature value
                leaf = tree._new_leaf(parent=self)
                self.add_child(x[self.feature], leaf)  # noqa
                tree._n_active_leaves += 1
                leaf.learn_one(
                    x,
                    y,
                    sample_weight=sample_weight,
                    tree=tree,
                    parent=self,
                    parent_branch=self.branch_no(x),
                )
            # The split feature is missing in the instance. Hence, we pass the new example
            # to the most traversed path in the current subtree
            else:
                child_id, child = self.most_common_path()
                child.learn_one(
                    x,
                    y,
                    sample_weight=sample_weight,
                    tree=tree,
                    parent=self,
                    parent_branch=child_id,
                )

    # Override AdaNode
    def kill_tree_children(self, tree):
        for child in self.children:
            # Delete the alternate tree if it exists
            if isinstance(child, HTBranch):
                if child._alternate_tree is not None:
                    child._alternate_tree.kill_tree_children(tree)
                    tree._n_pruned_alternate_trees += 1
                    child._alternate_tree = None

                # Recursively delete SplitNodes
                child.kill_tree_children(tree)  # noqa
            else:
                if child.is_active():  # noqa
                    tree._n_active_leaves -= 1
                else:
                    tree._n_inactive_leaves -= 1
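# These node classes are internals of river's tree.HoeffdingAdaptiveTreeClassifier. A
# typical way to exercise them end to end is sketched below; the parameter names follow
# river 0.7 and may differ in other versions, and the values are illustrative.
from river import evaluate, metrics, synth, tree

dataset = synth.ConceptDriftStream(position=500, width=40, seed=1).take(1000)

model = tree.HoeffdingAdaptiveTreeClassifier(
    grace_period=100,        # Samples a leaf observes between split attempts
    adwin_confidence=0.002,  # The delta handed to the per-node ADWIN detectors
    seed=1,
)

metric = metrics.Accuracy()
evaluate.progressive_val_score(dataset, model, metric)
print(metric)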