Example 1
def test_adwin(test_path):
    """
    ADWIN drift detection test.
    The first half of the stream contains a sequence corresponding to a normal distribution of integers from 0 to 1.
    From index 999 to 1999 the sequence is a normal distribution of integers from 0 to 7.

    """
    adwin = ADWIN()
    test_file = os.path.join(test_path, 'drift_stream.npy')
    data_stream = np.load(test_file)
    expected_indices = [1023, 1055, 1087, 1151]
    detected_indices = []

    for i in range(data_stream.size):
        adwin.add_element(data_stream[i])
        if adwin.detected_change():
            detected_indices.append(i)

    assert detected_indices == expected_indices
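
The drift_stream.npy fixture itself is not shown. A stream of the shape the docstring describes could be generated along these lines (a sketch, not the actual fixture; the assertion above depends on the exact saved values):

import numpy as np

# Hypothetical reconstruction of the fixture described in the docstring:
# random 0/1 values up to index 998, then random integers in [0, 8).
stream = np.random.randint(2, size=2000)
stream[999:] = np.random.randint(8, size=2000 - 999)
np.save('drift_stream.npy', stream)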
Example 2
def demo():
    """ _test_adwin
    
    This demo will insert data into an ADWIN object when will display in which 
    indexes change was detected.
    
    The data stream is simulated as a sequence of randomly generated 0's and 1's. 
    Then the data from indexes 999 to 1999 is changed to a normal distribution of 
    integers from 0 to 7.
    
    """
    adwin = ADWIN()
    size = 2000
    data_stream = np.random.randint(2, size=size)
    for i in range(999, size):
        data_stream[i] = np.random.randint(8)

    for i in range(size):
        adwin.add_element(data_stream[i])
        if adwin.detected_change():
            print('Change has been detected in data: ' + str(data_stream[i]) +
                  ' - of index: ' + str(i))
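
The indices printed by demo() vary from run to run because the stream is random. Seeding NumPy first (not part of the original demo) makes the output reproducible:

import numpy as np

np.random.seed(1)  # fixed seed, so demo() prints the same indices on every run
demo()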
Example 3
    class AdaLearningNode(LearningNodeNBAdaptive, NewNode):
        def __init__(self, initial_class_observations):
            LearningNodeNBAdaptive.__init__(self, initial_class_observations)
            self.estimationErrorWeight = ADWIN()
            self.ErrorChange = False
            self.randomSeed = 1
            self.classifierRandom = np.random.RandomState(self.randomSeed)  # random.seed() returns None; use a RandomState instead

        def calc_byte_size(self):
            byte_size = self.__sizeof__()
            if self.estimationErrorWeight is not None:
                byte_size += self.estimationErrorWeight.get_length_estimation()
            return byte_size

        # Override NewNode
        def number_leaves(self):
            return 1

        # Override NewNode
        def get_error_estimation(self):
            return self.estimationErrorWeight._estimation

        # Override NewNode
        def get_error_width(self):
            return self.estimationErrorWeight._width

        # Override NewNode
        def is_null_error(self):
            return (self.estimationErrorWeight is None)

        def kill_tree_childs(self, hat):
            pass

        # Override NewNode
        def learn_from_instance(self, X, y, weight, hat, parent,
                                parent_branch):
            true_class = y

            k = self.classifierRandom.poisson(1.0)  # Poisson(1) instance weighting
            if k > 0:
                weight = weight * k

            tmp = self.get_class_votes(X, hat)

            class_prediction = get_max_value_index(tmp)

            bl_correct = (true_class == class_prediction)

            if self.estimationErrorWeight is None:
                self.estimationErrorWeight = ADWIN()

            old_error = self.get_error_estimation()

            # Add element to Adwin
            add = 0.0 if bl_correct else 1.0

            self.estimationErrorWeight.add_element(add)
            # Detect change with Adwin
            self.ErrorChange = self.estimationErrorWeight.detected_change()

            # Ignore the change if the error estimate actually improved;
            # only an increase in error is treated as drift.
            if self.ErrorChange and old_error > self.get_error_estimation():
                self.ErrorChange = False

            # Update statistics; calls LearningNodeNBAdaptive
            super().learn_from_instance(X, y, weight, hat)  # CHECK changed self to super

            # call ActiveLearningNode
            weight_seen = self.get_weight_seen()

            if (weight_seen - self.get_weight_seen_at_last_split_evaluation()
                    >= hat.grace_period):
                hat._attempt_to_split(self, parent, parent_branch)
                self.set_weight_seen_at_last_split_evaluation(weight_seen)

        # Override LearningNodeNBAdaptive
        def get_class_votes(self, X, ht):

            dist = {}
            prediction_option = ht.leaf_prediction

            if prediction_option == MAJORITY_CLASS:  # MC
                dist = self.get_observed_class_distribution()
            elif prediction_option == NAIVE_BAYES:  # NB
                dist = do_naive_bayes_prediction(
                    X, self._observed_class_distribution,
                    self._attribute_observers)
            else:  # NB Adaptive: use whichever of MC / NB has been more accurate
                if self._mc_correct_weight > self._nb_correct_weight:
                    dist = self.get_observed_class_distribution()
                else:
                    dist = do_naive_bayes_prediction(
                        X, self._observed_class_distribution,
                        self._attribute_observers)

            dist_sum = sum(dist.values())  # sum all values in dictionary

            # Scale the votes by the squared error estimate when non-zero
            error_sq = self.get_error_estimation() * self.get_error_estimation()
            if dist_sum * error_sq > 0.0:
                normalize_values_in_dict(dist_sum * error_sq, dist)

            return dist

        # Override NewNode; new for option votes
        def filter_instance_to_leaves(self, X, split_parent, parent_branch,
                                      update_splitter_counts, found_nodes=None):
            if found_nodes is None:
                found_nodes = []
            found_nodes.append(
                HoeffdingTree.FoundNode(self, split_parent, parent_branch))
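
The heart of learn_from_instance above is a reusable pattern: feed ADWIN 0.0 for a correct prediction and 1.0 for a mistake, and treat a detected change as drift only when the error estimate did not improve. A minimal standalone sketch of that pattern (the function name is illustrative; it relies only on the ADWIN API and the _estimation attribute the node code already uses):

from skmultiflow.classification.core.driftdetection.adwin import ADWIN

error_window = ADWIN()

def update_error_monitor(correct):
    # Returns True only for a harmful drift (the error estimate went up),
    # mirroring the ErrorChange handling in AdaLearningNode above.
    old_error = error_window._estimation
    error_window.add_element(0.0 if correct else 1.0)
    drift = error_window.detected_change()
    return drift and not (old_error > error_window._estimation)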
Example 4
    class AdaSplitNode(SplitNode, NewNode):
        def __init__(self, split_test, class_observations, size):
            SplitNode.__init__(self, split_test, class_observations, size)
            self._estimation_error_weight = ADWIN()
            self._alternate_tree = None  # CHECK: not HoeffdingTree.Node(); forced to None so it is initialized via _new_learning_node (line 154)
            self.error_change = False
            self._random_seed = 1
            self._classifier_random = np.random.RandomState(self._random_seed)  # random.seed() returns None; use a RandomState instead

        # Override SplitNode
        def calc_byte_size_including_subtree(self):
            byte_size = self.__sizeof__()
            if self._alternate_tree is not None:
                byte_size += self._alternate_tree.calc_byte_size_including_subtree()
            if self._estimation_error_weight is not None:
                byte_size += self._estimation_error_weight.get_length_estimation()

            for child in self._children:
                if child is not None:
                    byte_size += child.calc_byte_size_including_subtree()

            return byte_size

        # Override NewNode
        def number_leaves(self):
            num_of_leaves = 0
            for child in self._children:
                if child is not None:
                    num_of_leaves += child.number_leaves()

            return num_of_leaves

        # Override NewNode
        def get_error_estimation(self):
            return self._estimation_error_weight._estimation

        # Override NewNode
        def get_error_width(self):
            w = 0.0
            if not self.is_null_error():
                w = self._estimation_error_weight._width

            return w

        # Override NewNode
        def is_null_error(self):
            return (self._estimation_error_weight is None)

        # Override NewNode
        def learn_from_instance(self, X, y, weight, hat, parent,
                                parent_branch):

            true_class = y
            class_prediction = 0

            # Route the instance to a leaf once and reuse the result
            leaf = self.filter_instance_to_leaf(X, parent, parent_branch).node
            if leaf is not None:
                class_prediction = get_max_value_index(
                    leaf.get_class_votes(X, hat))

            bl_correct = (true_class == class_prediction)

            if self._estimation_error_weight is None:
                self._estimation_error_weight = ADWIN()

            old_error = self.get_error_estimation()

            # Add element to Adwin
            add = 0.0 if bl_correct else 1.0

            self._estimation_error_weight.add_element(add)
            # Detect change with Adwin
            self.error_change = self._estimation_error_weight.detected_change()

            # Ignore the change if the error estimate actually improved
            if self.error_change and old_error > self.get_error_estimation():
                self.error_change = False

            # Check condition to build a new alternate tree
            if self.error_change:
                self._alternate_tree = hat._new_learning_node()  # check call to new learning node
                hat._alternateTrees += 1

            # Condition to replace the alternate tree
            elif (self._alternate_tree is not None
                  and not self._alternate_tree.is_null_error()):
                if (self.get_error_width() > error_width_threshold
                        and self._alternate_tree.get_error_width()
                        > error_width_threshold):
                    old_error_rate = self.get_error_estimation()
                    alt_error_rate = self._alternate_tree.get_error_estimation()
                    fDelta = .05
                    fN = (1.0 / self._alternate_tree.get_error_width()
                          + 1.0 / self.get_error_width())

                    bound = math.sqrt(2.0 * old_error_rate *
                                      (1.0 - old_error_rate) *
                                      math.log(2.0 / fDelta) * fN)
                    # To check, bound never less than (old_error_rate - alt_error_rate)
                    if bound < (old_error_rate - alt_error_rate):
                        hat._active_leaf_node_cnt -= self.number_leaves()
                        hat._active_leaf_node_cnt += self._alternate_tree.number_leaves()
                        self.kill_tree_childs(hat)

                        if parent is not None:
                            parent.set_child(parent_branch,
                                             self._alternate_tree)
                        else:
                            hat._tree_root = hat._tree_root._alternate_tree  # replace the root with its alternate tree
                        hat._switchAlternateTrees += 1
                    elif bound < alt_error_rate - old_error_rate:
                        # Alternate tree is significantly worse: prune it
                        if isinstance(self._alternate_tree,
                                      HAT.ActiveLearningNode):
                            self._alternate_tree = None
                        elif isinstance(self._alternate_tree,
                                        HAT.InactiveLearningNode):
                            self._alternate_tree = None
                        else:
                            self._alternate_tree.kill_tree_childs(hat)
                        hat._pruned_alternate_trees += 1

            # Learn_From_Instance alternate Tree and Child nodes
            if self._alternate_tree is not None:
                self._alternate_tree.learn_from_instance(
                    X, y, weight, hat, parent, parent_branch)

            child_branch = self.instance_child_index(X)
            child = self.get_child(child_branch)

            if child is not None:
                child.learn_from_instance(X, y, weight, hat, parent,
                                          parent_branch)

        # Override NewNode
        def kill_tree_childs(self, hat):
            for i, child in enumerate(self._children):
                if child is not None:
                    # Count the alternate tree about to be discarded, if any
                    if (isinstance(child, HAT.AdaSplitNode)
                            and child._alternate_tree is not None):
                        hat._pruned_alternate_trees += 1
                    # Recursive delete of SplitNodes
                    if isinstance(child, HAT.AdaSplitNode):
                        child.kill_tree_childs(hat)

                    if isinstance(child, HAT.ActiveLearningNode):
                        # Rebinding the loop variable would be a no-op;
                        # detach the child from the node instead
                        self._children[i] = None
                        hat._active_leaf_node_cnt -= 1
                    elif isinstance(child, HAT.InactiveLearningNode):
                        self._children[i] = None
                        hat._inactive_leaf_node_cnt -= 1

        # Override NewNode
        def filter_instance_to_leaves(self, X, parent, parent_branch,
                                      update_splitter_counts, found_nodes=None):
            if found_nodes is None:
                found_nodes = []

            child_index = self.instance_child_index(X)

            if child_index >= 0:
                child = self.get_child(child_index)

                if child is not None:
                    child.filter_instance_to_leaves(X, parent, parent_branch,
                                                    update_splitter_counts,
                                                    found_nodes)
                else:
                    found_nodes.append(
                        HoeffdingTree.FoundNode(None, self, child_index))
            if self._alternate_tree is not None:
                self._alternate_tree.filter_instance_to_leaves(
                    X, self, -999, update_splitter_counts, found_nodes)
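
The alternate-tree logic above hinges on a Hoeffding-style bound over the two error estimates. Isolated as a function for readability (same formula as in learn_from_instance; the name switch_bound is illustrative):

import math

def switch_bound(old_error_rate, old_width, alt_width, f_delta=0.05):
    # bound = sqrt(2 * e * (1 - e) * ln(2 / delta) * (1/w_alt + 1/w))
    f_n = 1.0 / alt_width + 1.0 / old_width
    return math.sqrt(2.0 * old_error_rate * (1.0 - old_error_rate)
                     * math.log(2.0 / f_delta) * f_n)

# The subtree is replaced by its alternate when
#   old_error_rate - alt_error_rate > bound
# and the alternate is pruned when the difference exceeds the bound
# in the opposite direction.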
Example 5
class KNNAdwin(KNN):
    """ K-Nearest Neighbors Classifier with ADWIN Change detector 
    
    This Classifier is an improvement from the regular KNN classifier, 
    as it is resistant to concept drift. It utilises the ADWIN change 
    detector to decide which samples to keep and which ones to forget, 
    and by doing so it regulates the sample window size.
     
    To know more about the ADWIN change detector, please visit 
    skmultiflow.classification.core.driftdetection.adwin

    It uses the regular KNN Classifier as a base class, with the 
    major difference that this class keeps a variable size window, 
    instead of a fixed size one and also it updates the adwin algorithm 
    at each partial_fit call.
    
    Parameters
    ----------
    k: int
        The number of nearest neighbors to search for.
        
    max_window_size: int
        The maximum size of the window storing the last viewed samples.
        
    leaf_size: int
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch to a
        brute-force approach. The bigger this number, the faster the tree
        construction, but the slower the query time.

    categorical_list: array-like
        Each entry is the index of a categorical feature. May be used for
        further filtering.
        
    Raises
    ------
    NotImplementedError: A few of the functions described here are not 
    implemented since they have no application in this context.
    
    ValueError: A ValueError is raised if the predict function is called 
    before at least k samples have been analyzed by the algorithm.
    
    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.classification.lazy.knn_adwin import KNNAdwin
    >>> from skmultiflow.classification.lazy.knn import KNN
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.options.file_option import FileOption
    >>> # Setting up the stream
    >>> opt = FileOption('FILE', 'OPT_NAME', 'skmultiflow/datasets/covtype.csv', 'csv', False)
    >>> stream = FileStream(opt, -1, 1)
    >>> stream.prepare_for_use()
    >>> # Setting up the KNNAdwin classifier
    >>> knn_adwin = KNNAdwin(k=8, leaf_size=40, max_window_size=2000)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_instance(200)
    >>> knn_adwin = knn_adwin.partial_fit(X, y)
    >>> # Keeping track of sample count and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_instance()
    ...     pred = knn_adwin.predict(X)
    ...     if y[0] == pred[0]:
    ...         corrects += 1
    ...     knn_adwin = knn_adwin.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying the results
    >>> print('KNN usage example')
    KNN usage example
    >>> print(str(n_samples) + ' samples analyzed.')
    5000 samples analyzed.
    >>> print("KNNAdwin's performance: " + str(corrects/n_samples))
    KNNAdwin's performance: 0.7798

    """

    def __init__(self, k=5, max_window_size=sys.maxsize, leaf_size=30, categorical_list=None):
        # Avoid a mutable default argument; fall back to a fresh empty list
        if categorical_list is None:
            categorical_list = []
        super().__init__(k=k, max_window_size=max_window_size, leaf_size=leaf_size,
                         categorical_list=categorical_list)
        self.adwin = ADWIN()
        self.window = None

    def reset(self):
        """ reset
        
        Resets the adwin algorithm as well as the base model 
        kept by the KNN base class.
        
        Returns
        -------
        KNNAdwin
            self
        
        """
        self.adwin = ADWIN()
        return super().reset()

    def partial_fit(self, X, y, classes=None, weight=None):
        """ partial_fit
        
        Partially fits the model. This is done by updating the window
        with new samples while also updating the ADWIN algorithm. We
        then verify whether a change was detected, and if so, the
        window is shrunk back to the drift point.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.
            
        y: Array-like
            An array-like containing the classification targets for all 
            samples in X.
            
        classes: Not used.

        weight: Not used.
        
        Returns
        -------
        KNNAdwin
            self
        
        """
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.max_window_size)

        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            if self.window._num_samples >= self.k:
                # Feed ADWIN the 0/1 correctness of the prediction for this sample
                add = 1 if self.predict(np.asarray([X[i]]))[0] == y[i] else 0
                self.adwin.add_element(add)
            else:
                self.adwin.add_element(0)

        if self.window._num_samples >= self.k:
            changed = self.adwin.detected_change()

            if changed:
                # If ADWIN shrank its window, drop the oldest samples so the
                # KNN window matches ADWIN's post-drift width
                if self.adwin._width < self.window._num_samples:
                    for i in range(self.window._num_samples, self.adwin._width, -1):
                        self.window.delete_element()
        return self
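
The window-shrinking step at the end of partial_fit can be read as a small helper: once ADWIN detects drift and shrinks its internal window, the oldest KNN samples are discarded until the two windows agree. A sketch of that step in isolation (prune_to_adwin_width is a name chosen here; it uses the same private attributes the class itself uses):

def prune_to_adwin_width(window, adwin):
    # Drop the oldest samples until the KNN window is no larger than
    # ADWIN's post-drift window, mirroring partial_fit above.
    while window._num_samples > adwin._width:
        window.delete_element()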
Example 6
# Imports
import numpy as np
from skmultiflow.classification.core.driftdetection.adwin import ADWIN
adwin = ADWIN()
# Simulating a data stream as a sequence of random 0's and 1's
data_stream = np.random.randint(2, size=2000)
# Changing the data concept from index 999 to 1999
for i in range(999, 2000):
    data_stream[i] = np.random.randint(4, high=8)
# Adding stream elements to ADWIN and verifying if drift occurred
for i in range(2000):
    adwin.add_element(data_stream[i])
    if adwin.detected_change():
        print('Change has been detected in data: ' + str(data_stream[i]) +
              ' - of index: ' + str(i))