Example #1
from river.drift import ADWIN

def adwin(data):
    """Collect the records at which ADWIN flags a change in the 'count' stream."""
    detector = ADWIN()
    drifts = []
    for row in data:
        in_drift, in_warning = detector.update(row['count'])
        if in_drift:
            print(f"Change detected at date {row['date']}, input value: {row['count']}")
            drifts.append({'date': row['date'], 'count': row['count']})
    return drifts
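
A minimal way to exercise this helper, assuming each record is a dict with 'date' and 'count' keys (the toy stream below is invented for illustration):

rows = ([{'date': i, 'count': 1} for i in range(500)]
        + [{'date': i, 'count': 10} for i in range(500, 1000)])
detected = adwin(rows)
print(len(detected))  # typically at least one drift point shortly after index 500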
    
Example #2
import numpy as np
from river.drift import ADWIN


def demo():
    """ _test_adwin

    In this demo, an ADWIN object evaluates a sequence of numbers corresponding to 2 distributions.
    The ADWIN object indicates the indices where change is detected.

    The first half of the data is a sequence of randomly generated 0's and 1's.
    The second half of the data is a normal distribution of integers from 0 to 7.

    """
    adwin = ADWIN()
    size = 2000
    change_start = 999
    np.random.seed(1)
    data_stream = np.random.randint(2, size=size)
    data_stream[change_start:] = np.random.randint(8, size=size - change_start)

    for i in range(size):
        change_detected, _ = adwin.update(data_stream[i])
        if change_detected:
            print(f"Change has been detected in data: {data_stream[i]} - at index: {i}")
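
Since the distribution switch happens at index 999, the detections printed by demo() land somewhat later: ADWIN only cuts its window once the means of its two sub-windows differ by more than a bound derived from its delta parameter, which takes a number of post-change samples. A standard way to run the demo:

if __name__ == '__main__':
    demo()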
Example #3
# ADWIN

import numpy as np
from river.drift import ADWIN
np.random.seed(12345)

adwin = ADWIN()

# Simulate a data stream composed by two data distributions
data_stream = np.concatenate(
    (np.random.randint(2, size=1000), np.random.randint(4, high=8, size=1000)))

# Update drift detector and verify if change is detected
for i, val in enumerate(data_stream):
    in_drift, in_warning = adwin.update(val)
    if in_drift:
        print(f"Change detected at index {i}, input value: {val}")
Example #4
class AdaLearningNodeClassifier(LearningNodeNBA, AdaNode):
    """Learning node for Hoeffding Adaptive Tree.

    Parameters
    ----------
    stats
        Initial class observations.
    depth
        The depth of the learning node in the tree.
    attr_obs
        The numeric attribute observer algorithm used to monitor target statistics
        and perform split attempts.
    attr_obs_params
        The parameters passed to the numeric attribute observer algorithm.
    adwin_delta
        The delta parameter of ADWIN.
    seed
        Seed to control the generation of random numbers and support reproducibility.
    """
    def __init__(self, stats, depth, attr_obs, attr_obs_params, adwin_delta, seed):
        super().__init__(stats, depth, attr_obs, attr_obs_params)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self.error_change = False
        self._rng = check_random_state(seed)

    @property
    def n_leaves(self):
        return 1

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        return self._adwin.width

    def error_is_null(self):
        return self._adwin is None

    def kill_tree_children(self, hat):
        pass

    def learn_one(self, x, y, *, sample_weight=1., tree=None, parent=None, parent_branch=-1):
        if tree.bootstrap_sampling:
            # Perform bootstrap-sampling
            k = self._rng.poisson(1.0)
            if k > 0:
                sample_weight = sample_weight * k

        aux = self.leaf_prediction(x, tree=tree)
        class_prediction = max(aux, key=aux.get) if aux else None

        is_correct = (y == class_prediction)

        if self._adwin is None:
            self._adwin = ADWIN(delta=self.adwin_delta)

        old_error = self.error_estimation

        # Update ADWIN
        self.error_change, _ = self._adwin.update(int(not is_correct))

        # Error is decreasing: ignore the detected change
        if self.error_change and old_error > self.error_estimation:
            self.error_change = False

        # Update statistics
        super().learn_one(x, y, sample_weight=sample_weight, tree=tree)

        weight_seen = self.total_weight

        if weight_seen - self.last_split_attempt_at >= tree.grace_period:
            if self.depth >= tree.max_depth:
                # Depth-based pre-pruning
                self.deactivate()
                tree._n_inactive_leaves += 1
                tree._n_active_leaves -= 1
            else:
                tree._attempt_to_split(self, parent, parent_branch)
                self.last_split_attempt_at = weight_seen

    # Override LearningNodeNBA
    def leaf_prediction(self, x, *, tree=None):
        if not self.stats:
            return

        prediction_option = tree.leaf_prediction
        if not self.is_active() or prediction_option == tree._MAJORITY_CLASS:
            dist = normalize_values_in_dict(self.stats, inplace=False)
        elif prediction_option == tree._NAIVE_BAYES:
            if self.total_weight >= tree.nb_threshold:
                dist = do_naive_bayes_prediction(x, self.stats, self.attribute_observers)
            else:  # Use majority class
                dist = normalize_values_in_dict(self.stats, inplace=False)
        else:  # Naive Bayes Adaptive
            dist = super().leaf_prediction(x, tree=tree)

        dist_sum = sum(dist.values())
        normalization_factor = dist_sum * self.error_estimation * self.error_estimation

        # Weight the node's responses according to the error estimated by ADWIN.
        # Useful when the predictions of the alternate tree and those of the main tree
        # are combined -> give preference to the most accurate one
        dist = normalize_values_in_dict(dist, normalization_factor, inplace=False)

        return dist

    # Override AdaNode: enable option vote (query potentially more than one leaf for responses)
    def filter_instance_to_leaves(self, x, parent, parent_branch, found_nodes):
        found_nodes.append(FoundNode(self, parent, parent_branch))
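
In learn_one above, each incoming sample is re-weighted by a Poisson(1) draw, the usual way to simulate bootstrap resampling on a stream (as in online bagging): about 37% of draws are 0, so those samples are effectively skipped, while the expected weight stays 1. A standalone sketch of the effect, independent of the tree classes:

import numpy as np

rng = np.random.RandomState(42)      # plays the role of check_random_state(seed)
k = rng.poisson(1.0, size=100_000)   # one draw per incoming sample
print((k == 0).mean())               # ~0.37: fraction of samples given zero weight
print(k.mean())                      # ~1.0: total weight is preserved on average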
Example #5
class AdaSplitNodeClassifier(SplitNode, AdaNode):
    """Node that splits the data in a Hoeffding Adaptive Tree.

    Parameters
    ----------
    split_test
        Split test.
    stats
        Class observations.
    depth
        The depth of the node.
    adwin_delta
        The delta parameter of ADWIN.
    seed
        Internal random state used to sample from Poisson distributions.
    """
    def __init__(self, split_test, stats, depth, adwin_delta, seed):
        super().__init__(split_test, stats, depth)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._alternate_tree = None
        self._error_change = False

        self._rng = check_random_state(seed)

    @property
    def n_leaves(self):
        num_of_leaves = 0
        for child in self._children.values():
            if child is not None:
                num_of_leaves += child.n_leaves

        return num_of_leaves

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        w = 0.0
        if not self.error_is_null():
            w = self._adwin.width

        return w

    def error_is_null(self):
        return self._adwin is None

    def learn_one(self, x, y, *, sample_weight=1., tree=None, parent=None, parent_branch=-1):
        class_prediction = None

        leaf = self.filter_instance_to_leaf(x, parent, parent_branch)
        if leaf.node is not None:
            aux = leaf.node.leaf_prediction(x, tree=tree)
            class_prediction = max(aux, key=aux.get) if aux else None

        is_correct = (y == class_prediction)

        # Update stats as the instance traverses the tree, to improve predictions
        # (in case split nodes are used to provide responses)
        try:
            self.stats[y] += sample_weight
        except KeyError:
            self.stats[y] = sample_weight

        if self._adwin is None:
            self._adwin = ADWIN(delta=self.adwin_delta)

        old_error = self.error_estimation

        # Update ADWIN
        self._error_change, _ = self._adwin.update(int(not is_correct))

        # Classification error is decreasing: skip drift adaptation
        if self._error_change and old_error > self.error_estimation:
            self._error_change = False

        # Condition to build a new alternate tree
        if self._error_change:
            self._alternate_tree = tree._new_learning_node(parent=self)
            self._alternate_tree.depth -= 1  # To ensure we do not skip a tree level
            tree._n_alternate_trees += 1
        # Condition to replace alternate tree
        elif self._alternate_tree is not None and not self._alternate_tree.error_is_null():
            if self.error_width > tree.drift_window_threshold \
                    and self._alternate_tree.error_width > tree.drift_window_threshold:
                old_error_rate = self.error_estimation
                alt_error_rate = self._alternate_tree.error_estimation
                f_delta = .05
                f_n = 1.0 / self._alternate_tree.error_width + 1.0 / self.error_width

                bound = math.sqrt(2.0 * old_error_rate * (1.0 - old_error_rate) *
                                  math.log(2.0 / f_delta) * f_n)
                if bound < (old_error_rate - alt_error_rate):
                    tree._n_active_leaves -= self.n_leaves
                    tree._n_active_leaves += self._alternate_tree.n_leaves
                    self.kill_tree_children(tree)

                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                        self._alternate_tree = None
                    else:
                        # Switch tree root
                        tree._tree_root = tree._tree_root._alternate_tree
                    tree._n_switch_alternate_trees += 1
                elif bound < alt_error_rate - old_error_rate:
                    if not self._alternate_tree.is_leaf():
                        self._alternate_tree.kill_tree_children(tree)
                    self._alternate_tree = None
                    tree._n_pruned_alternate_trees += 1

        # Learn one sample in alternate tree and child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_one(x, y, sample_weight=sample_weight, tree=tree,
                                           parent=parent, parent_branch=parent_branch)
        child_branch = self.instance_child_index(x)
        child = self.get_child(child_branch)
        if child is not None:
            child.learn_one(x, y, sample_weight=sample_weight, tree=tree, parent=self,
                            parent_branch=child_branch)
        elif self.split_test.branch_for_instance(x) == -1:
            split_feat = self.split_test.attrs_test_depends_on()[0]
            # Instance contains a categorical value previously unseen by the split node
            if self.split_test.max_branches() == -1 and split_feat in x:
                # Create a new learning node to encompass the newly observed feature value
                leaf_node = tree._new_learning_node(parent=self)
                branch_id = self.split_test.add_new_branch(x[split_feat])
                self.set_child(branch_id, leaf_node)
                tree._n_active_leaves += 1
                leaf_node.learn_one(x, y, sample_weight=sample_weight, tree=tree, parent=self,
                                    parent_branch=branch_id)
            # The split feature is missing in the instance. Hence, we pass the new example
            # to the most traversed path in the current subtree
            else:
                path = max(
                    self._children,
                    key=lambda c: self._children[c].total_weight if self._children[c] else 0.
                )
                leaf_node = self.get_child(path)
                # Pass instance to the most traversed path
                if leaf_node is None:
                    leaf_node = tree._new_learning_node(parent=self)
                    self.set_child(path, leaf_node)
                    tree._n_active_leaves += 1

                leaf_node.learn_one(x, y, sample_weight=sample_weight, tree=tree, parent=self,
                                    parent_branch=path)

    def leaf_prediction(self, x, *, tree=None):
        # In case split nodes end up being used to provide responses (e.g., when an
        # emerging categorical feature appears), fall back to the majority class strategy
        return normalize_values_in_dict(self.stats, inplace=False)

    # Override AdaNode
    def kill_tree_children(self, tree):
        for child_id, child in self._children.items():
            if child is not None:
                # Delete alternate tree if it exists
                if not child.is_leaf():
                    if child._alternate_tree is not None:
                        child._alternate_tree.kill_tree_children(tree)
                        tree._n_pruned_alternate_trees += 1
                        child._alternate_tree = None

                    # Recursive delete of SplitNodes
                    child.kill_tree_children(tree)
                    tree._n_decision_nodes -= 1
                else:
                    if child.is_active():
                        tree._n_active_leaves -= 1
                    else:
                        tree._n_inactive_leaves -= 1

                self._children[child_id] = None

    # Override AdaNode
    def filter_instance_to_leaves(self, x, parent, parent_branch, found_nodes):
        child_index = self.instance_child_index(x)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                child.filter_instance_to_leaves(x, parent, parent_branch, found_nodes)
            else:
                found_nodes.append(FoundNode(None, self, child_index))
        else:
            # An emerging categorical value appeared, or the split feature is missing
            # from the instance: in both cases, fall back to the parent node
            found_nodes.append(FoundNode(None, self, child_index))

        if self._alternate_tree is not None:
            self._alternate_tree.filter_instance_to_leaves(x, self, -999, found_nodes)
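
The swap-or-prune decision in learn_one compares the subtree's error estimate against its alternate's using a Hoeffding-style bound. The same test in isolation (f_delta = 0.05 as hard-coded above; the function name and sample numbers are illustrative):

import math

def alternate_tree_decision(old_err, alt_err, old_width, alt_width, f_delta=0.05):
    # The bound shrinks as both error windows grow, so decisions need enough evidence
    f_n = 1.0 / alt_width + 1.0 / old_width
    bound = math.sqrt(2.0 * old_err * (1.0 - old_err) * math.log(2.0 / f_delta) * f_n)
    if old_err - alt_err > bound:
        return 'swap'   # alternate is significantly more accurate: replace the subtree
    if alt_err - old_err > bound:
        return 'prune'  # alternate is significantly worse: discard it
    return 'keep'       # not enough evidence either way

print(alternate_tree_decision(0.30, 0.10, old_width=500, alt_width=400))  # swap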
Example #6
class KNNADWINClassifier(KNNClassifier):
    """K-Nearest Neighbors classifier with ADWIN change detector.

    This classifier is an improvement over the regular kNN method, as it is
    resistant to concept drift. It uses the `ADWIN` change detector to decide
    which samples to keep and which to forget, thereby regulating the size of
    the sample window.

    Parameters
    ----------
    n_neighbors
        The number of nearest neighbors to search for.
    window_size
        The maximum size of the window storing the last viewed samples.
    leaf_size
        The maximum number of samples that can be stored in one leaf node,
        which determines the point at which the algorithm switches to a
        brute-force approach. The bigger this number, the faster the tree
        construction, but the slower the queries.
    p
        p-norm value for the Minkowski metric. When `p=1`, this corresponds to the
        Manhattan distance, while `p=2` corresponds to the Euclidean distance. Valid
        values are in the interval $[1, +\\infty)$

    Notes
    -----
    - This estimator is not optimal for a mixture of categorical and numerical
      features. This implementation treats all features from a given stream as
      numerical.
    - This implementation is extended from the KNNClassifier, with the main
      difference that it keeps a dynamic window whose size changes in agreement
      with the amount of change detected by the ADWIN drift detector.

    Examples
    --------
    >>> from river import synth
    >>> from river import evaluate
    >>> from river import metrics
    >>> from river import neighbors

    >>> dataset = synth.ConceptDriftStream(position=500, width=20, seed=1).take(1000)

    >>> model = neighbors.KNNADWINClassifier(window_size=100)

    >>> metric = metrics.Accuracy()

    >>> evaluate.progressive_val_score(dataset, model, metric)
    Accuracy: 57.36%

    """
    def __init__(self, n_neighbors=5, window_size=1000, leaf_size=30, p=2):
        super().__init__(n_neighbors=n_neighbors,
                         window_size=window_size,
                         leaf_size=leaf_size,
                         p=p)
        self.adwin = ADWIN()

    def _unit_test_skips(self):
        return {"check_emerging_features", "check_disappearing_features"}

    def learn_one(self, x, y):
        """Update the model with a set of features `x` and a label `y`.

        Parameters
        ----------
        x
            A dictionary of features.
        y
            The class label.

        Returns
        -------
            self

        Notes
        -----
        For the K-Nearest Neighbors Classifier, fitting the model is equivalent
        to inserting the newer samples into the observed window and, once the
        size limit is reached, removing the oldest ones.

        """
        self.classes_.add(y)

        self.data_window.append(dict2numpy(x), y)
        if self.data_window.size >= self.n_neighbors:
            correctly_classifies = int(self.predict_one(x) == y)
            self.adwin.update(correctly_classifies)
            if self.adwin.change_detected and self.adwin.width < self.data_window.size:
                # Drift detected: shrink the window to ADWIN's width by dropping
                # the oldest samples
                for _ in range(self.data_window.size - self.adwin.width):
                    self.data_window.popleft()
        else:
            self.adwin.update(0)
        return self
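
When ADWIN flags a change, learn_one shrinks the sample window down to ADWIN's current width, discarding the oldest samples on the assumption that they belong to the outdated concept. The trimming pattern in isolation, with a plain deque standing in for the data window:

from collections import deque

window = deque(range(100))   # stand-in for self.data_window, oldest samples on the left
adwin_width = 60             # stand-in for self.adwin.width after a detected change
while len(window) > adwin_width:
    window.popleft()         # drop the oldest sample, as the loop above does
print(len(window))           # 60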
Example #7
class AdaBranchClassifier(HTBranch, AdaNode):
    """Node that splits the data in a Hoeffding Adaptive Tree.

    Parameters
    ----------
    stats
        Class observations.
    adwin_delta
        The delta parameter of ADWIN.
    seed
        Internal random state used to sample from Poisson distributions.
    children
        Sequence of children nodes of this branch.
    attributes
        Other parameters passed to the split node.
    """
    def __init__(self, stats, *children, adwin_delta, seed, **attributes):
        super().__init__(stats, *children, **attributes)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._alternate_tree = None
        self._error_change = False

        self._rng = check_random_state(seed)

    def traverse(self, x, until_leaf=True) -> typing.List[HTLeaf]:
        """Return the leaves corresponding to the given input.

        Alternate subtree leaves are also included.

        Parameters
        ----------
        x
            The input instance.
        until_leaf
            Whether or not branch nodes can be returned in case of missing features or emerging
            feature categories.
        """
        found_nodes = []
        for node in self.walk(x, until_leaf=until_leaf):
            if (isinstance(node, AdaBranchClassifier)
                    and node._alternate_tree is not None):
                if isinstance(node._alternate_tree, AdaBranchClassifier):
                    # Flatten: traverse returns a list of leaves
                    found_nodes.extend(
                        node._alternate_tree.traverse(x, until_leaf=until_leaf))
                else:
                    found_nodes.append(node._alternate_tree)

        found_nodes.append(node)
        return found_nodes

    def iter_leaves(self):
        """Iterate over leaves from the left-most one to the right-most one.

        Overrides the base implementation by also including alternate subtrees.
        """
        for child in self.children:
            yield from child.iter_leaves()

            if (isinstance(child, AdaBranchClassifier)
                    and child._alternate_tree is not None):
                yield from child._alternate_tree.iter_leaves()

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        w = 0.0
        if not self.error_is_null():
            w = self._adwin.width

        return w

    def error_is_null(self):
        return self._adwin is None

    def learn_one(self,
                  x,
                  y,
                  *,
                  sample_weight=1.0,
                  tree=None,
                  parent=None,
                  parent_branch=None):
        leaf = super().traverse(x, until_leaf=True)
        aux = leaf.prediction(x, tree=tree)
        class_prediction = max(aux, key=aux.get) if aux else None
        is_correct = y == class_prediction

        # Update stats as the instance traverses the tree, to improve predictions
        # (in case split nodes are used to provide responses)
        try:
            self.stats[y] += sample_weight
        except KeyError:
            self.stats[y] = sample_weight

        if self._adwin is None:
            self._adwin = ADWIN(delta=self.adwin_delta)

        old_error = self.error_estimation

        # Update ADWIN
        self._error_change, _ = self._adwin.update(int(not is_correct))

        # Classification error is decreasing: skip drift adaptation
        if self._error_change and old_error > self.error_estimation:
            self._error_change = False

        # Condition to build a new alternate tree
        if self._error_change:
            self._alternate_tree = tree._new_leaf(parent=self)
            self._alternate_tree.depth -= 1  # To ensure we do not skip a tree level
            tree._n_alternate_trees += 1
        # Condition to replace alternate tree
        elif (self._alternate_tree is not None
              and not self._alternate_tree.error_is_null()):
            if (self.error_width > tree.drift_window_threshold
                    and self._alternate_tree.error_width >
                    tree.drift_window_threshold):
                old_error_rate = self.error_estimation
                alt_error_rate = self._alternate_tree.error_estimation
                f_delta = 0.05
                f_n = 1.0 / self._alternate_tree.error_width + 1.0 / self.error_width

                bound = math.sqrt(2.0 * old_error_rate *
                                  (1.0 - old_error_rate) *
                                  math.log(2.0 / f_delta) * f_n)
                if bound < (old_error_rate - alt_error_rate):
                    tree._n_active_leaves -= self.n_leaves
                    tree._n_active_leaves += self._alternate_tree.n_leaves
                    self.kill_tree_children(tree)

                    if parent is not None:
                        parent.children[parent_branch] = self._alternate_tree
                        self._alternate_tree = None
                    else:
                        # Switch tree root
                        tree._root = tree._root._alternate_tree
                    tree._n_switch_alternate_trees += 1
                elif bound < alt_error_rate - old_error_rate:
                    if isinstance(self._alternate_tree, HTBranch):
                        self._alternate_tree.kill_tree_children(tree)  # noqa
                    self._alternate_tree = None
                    tree._n_pruned_alternate_trees += 1

        # Learn one sample in alternate tree and child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_one(
                x,
                y,
                sample_weight=sample_weight,
                tree=tree,
                parent=parent,
                parent_branch=parent_branch,
            )

        try:
            child = self.next(x)
        except KeyError:
            child = None

        if child is not None:
            child.learn_one(
                x,
                y,
                sample_weight=sample_weight,
                tree=tree,
                parent=self,
                parent_branch=self.branch_no(x),
            )
        else:
            # Instance contains a categorical value previously unseen by the split node
            if self.max_branches() == -1 and self.feature in x:  # noqa
                # Create a new learning node to encompass the newly observed feature value
                leaf = tree._new_leaf(parent=self)
                self.add_child(x[self.feature], leaf)  # noqa
                tree._n_active_leaves += 1
                leaf.learn_one(
                    x,
                    y,
                    sample_weight=sample_weight,
                    tree=tree,
                    parent=self,
                    parent_branch=self.branch_no(x),
                )
            # The split feature is missing in the instance. Hence, we pass the new example
            # to the most traversed path in the current subtree
            else:
                child_id, child = self.most_common_path()
                child.learn_one(
                    x,
                    y,
                    sample_weight=sample_weight,
                    tree=tree,
                    parent=self,
                    parent_branch=child_id,
                )

    # Override AdaNode
    def kill_tree_children(self, tree):
        for child in self.children:
            # Delete alternate tree if it exists
            if isinstance(child, HTBranch):
                if child._alternate_tree is not None:
                    child._alternate_tree.kill_tree_children(tree)
                    tree._n_pruned_alternate_trees += 1
                    child._alternate_tree = None

                # Recursive delete of SplitNodes
                child.kill_tree_children(tree)  # noqa
            else:
                if child.is_active():  # noqa
                    tree._n_active_leaves -= 1
                else:
                    tree._n_inactive_leaves -= 1
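
At prediction time, the tree can query every node returned by traverse (the main path plus alternate subtrees) and sum their class distributions, each normalized by its squared error estimate as in leaf_prediction of Example #4, so that more accurate subtrees dominate the vote. A sketch of that combination step, detached from the tree machinery (the input numbers are made up):

from collections import defaultdict

def combine_votes(dists_and_errors):
    # Each entry pairs a class distribution with the node's ADWIN error estimate
    combined = defaultdict(float)
    for dist, err in dists_and_errors:
        norm = sum(dist.values()) * err * err or 1.0  # guard against a zero factor
        for label, votes in dist.items():
            combined[label] += votes / norm
    return dict(combined)

# The low-error node's distribution dominates the combined vote
print(combine_votes([({'a': 6, 'b': 4}, 0.1), ({'a': 2, 'b': 8}, 0.4)]))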