def __init__(self, split_test, class_observations):
    SplitNode.__init__(self, split_test, class_observations)
    self._estimation_error_weight = ADWIN()
    self._alternate_tree = None
    self.error_change = False
    self._random_seed = 1
    # random.seed() returns None, so keep a seeded RandomState instead
    self._classifier_random = check_random_state(self._random_seed)
def __init__(self, split_test, class_observations, size):
    SplitNode.__init__(self, split_test, class_observations, size)
    self._estimation_error_weight = ADWIN()
    # Deliberately None (not HoeffdingTree.Node()) so the alternate tree is
    # initialized via _new_learning_node when a drift is detected
    self._alternate_tree = None
    self.error_change = False
    self._random_seed = 1
    # random.seed() returns None, so keep a seeded RandomState instead
    self._classifier_random = check_random_state(self._random_seed)
def __init__(self,
             nb_ensemble=10,
             max_features='auto',
             disable_weighted_vote=False,
             lambda_value=6,
             performance_metric='acc',
             drift_detection_method: BaseDriftDetector = ADWIN(0.001),
             warning_detection_method: BaseDriftDetector = ADWIN(0.01),
             max_byte_size=33554432,
             memory_estimate_period=2000000,
             grace_period=50,
             split_criterion='info_gain',
             split_confidence=0.01,
             tie_threshold=0.05,
             binary_split=False,
             stop_mem_management=False,
             remove_poor_atts=False,
             no_preprune=False,
             leaf_prediction='nba',
             nb_threshold=0,
             nominal_attributes=None,
             random_state=None):
    """AdaptiveRandomForest class constructor."""
    super().__init__()
    self.nb_ensemble = nb_ensemble
    self.max_features = max_features
    self.disable_weighted_vote = disable_weighted_vote
    self.lambda_value = lambda_value
    if isinstance(drift_detection_method, BaseDriftDetector):
        self.drift_detection_method = drift_detection_method
    else:
        self.drift_detection_method = None
    if isinstance(warning_detection_method, BaseDriftDetector):
        self.warning_detection_method = warning_detection_method
    else:
        self.warning_detection_method = None
    self.instances_seen = 0
    self._train_weight_seen_by_model = 0.0
    self.ensemble = None
    self.random_state = check_random_state(random_state)
    if performance_metric in ['acc', 'kappa']:
        self.performance_metric = performance_metric
    else:
        raise ValueError('Invalid performance metric: {}'.format(performance_metric))
    # ARF Hoeffding Tree configuration
    self.max_byte_size = max_byte_size
    self.memory_estimate_period = memory_estimate_period
    self.grace_period = grace_period
    self.split_criterion = split_criterion
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.binary_split = binary_split
    self.stop_mem_management = stop_mem_management
    self.remove_poor_atts = remove_poor_atts
    self.no_preprune = no_preprune
    self.leaf_prediction = leaf_prediction
    self.nb_threshold = nb_threshold
    self.nominal_attributes = nominal_attributes
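# A minimal usage sketch for the constructor above. Data, shapes, and class
# labels are illustrative, and the partial_fit/predict API is assumed to
# follow the same conventions as the other ensembles in this codebase.
import numpy as np

arf = AdaptiveRandomForest(nb_ensemble=10, grace_period=50, leaf_prediction='nba')
X = np.random.rand(200, 5)
y = np.random.randint(2, size=200)
arf.partial_fit(X, y, classes=[0, 1])   # pre-train on a small batch
print(arf.predict(np.random.rand(1, 5)))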
def __init__(self, k=5, max_window_size=sys.maxsize, leaf_size=30, categorical_list=None):
    # Avoid a mutable default argument for categorical_list
    if categorical_list is None:
        categorical_list = []
    super().__init__(k=k,
                     max_window_size=max_window_size,
                     leaf_size=leaf_size,
                     categorical_list=categorical_list)
    self.adwin = ADWIN()
    self.window = None
def reset(self):
    """ reset

    Resets the ADWIN algorithm as well as the base model
    kept by the KNN base class.

    Returns
    -------
    KNNAdwin
        self

    """
    self.adwin = ADWIN()
    return super().reset()
def __adjust_ensemble_size(self):
    # Grow the ensemble (and its ADWIN detectors) if new classes appeared
    if len(self.classes) != len(self.ensemble):
        if len(self.classes) > len(self.ensemble):
            for i in range(len(self.ensemble), len(self.classes)):
                self.ensemble.append(cp.deepcopy(self.h))
                self.adwin_ensemble.append(ADWIN(self.delta))
                self.ensemble_length += 1
def __init__(self, h=KNN(), ensemble_length=2, w=6, delta=0.002,
             enable_code_matrix=False, leverage_algorithm='leveraging_bag'):
    super().__init__()
    # Default values
    self.h = h.reset()
    self.ensemble_length = None
    self.ensemble = None
    self.adwin_ensemble = None
    self.n_detected_changes = None
    self.matrix_codes = None
    self.enable_matrix_codes = None
    self.w = None
    self.delta = None
    self.classes = None
    self.leveraging_algorithm = None
    self.__configure(h, ensemble_length, w, delta, enable_code_matrix, leverage_algorithm)
    self.init_matrix_codes = True
    self.adwin_ensemble = []
    for i in range(ensemble_length):
        self.adwin_ensemble.append(ADWIN(self.delta))
def test_adwin(test_path):
    """ ADWIN drift detection test.

    The first half of the stream contains a sequence of random integers
    in [0, 1]. From index 999 onward the sequence switches to random
    integers in [0, 7], so drifts should be flagged shortly after the
    change point.
    """
    adwin = ADWIN()
    test_file = os.path.join(test_path, 'drift_stream.npy')
    data_stream = np.load(test_file)
    expected_indices = [1023, 1055, 1087, 1151]
    detected_indices = []
    for i in range(data_stream.size):
        adwin.add_element(data_stream[i])
        if adwin.detected_change():
            detected_indices.append(i)
    assert detected_indices == expected_indices
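# A sketch of how a comparable 'drift_stream.npy' fixture could be produced.
# The seed and exact generation of the file used by the test above are
# assumptions, so the detected indices would differ from expected_indices.
import numpy as np

np.random.seed(1)
stream = np.random.randint(2, size=2000)        # first concept: integers in [0, 1]
stream[999:] = np.random.randint(8, size=1001)  # second concept: integers in [0, 7]
np.save('drift_stream.npy', stream)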
def learn_from_instance(self, X, y, weight, hat, parent, parent_branch):
    true_class = y
    # Online bagging: weight the instance by k ~ Poisson(1), drawn from the
    # node's seeded RandomState (the random state must not be passed as
    # numpy's `size` argument, which was a bug)
    k = self.classifierRandom.poisson(1.0)
    if k > 0:
        weight = weight * k
    tmp = self.get_class_votes(X, hat)
    class_prediction = get_max_value_index(tmp)
    bl_correct = (true_class == class_prediction)
    if self.estimationErrorWeight is None:
        self.estimationErrorWeight = ADWIN()
    old_error = self.get_error_estimation()
    # Add element to ADWIN: 0.0 for a correct prediction, 1.0 for an error
    add = 0.0 if bl_correct else 1.0
    self.estimationErrorWeight.add_element(add)
    # Detect change with ADWIN; ignore changes where the error decreased
    self.ErrorChange = self.estimationErrorWeight.detected_change()
    if self.ErrorChange and old_error > self.get_error_estimation():
        self.ErrorChange = False
    # Update statistics: delegate to LearningNodeNBAdaptive
    super().learn_from_instance(X, y, weight, hat)
    # Call ActiveLearningNode
    weight_seen = self.get_weight_seen()
    if weight_seen - self.get_weight_seen_at_last_split_evaluation() >= hat.grace_period:
        hat._attempt_to_split(self, parent, parent_branch)
        self.set_weight_seen_at_last_split_evaluation(weight_seen)
def __init__(self, h=KNNAdwin(), ensemble_length=2):
    super().__init__()
    # Default values
    self.ensemble = None
    self.ensemble_length = None
    self.classes = None
    self.h = h.reset()
    self.__configure(h, ensemble_length)
    self.adwin_ensemble = []
    for i in range(ensemble_length):
        self.adwin_ensemble.append(ADWIN())
def demo():
    """ _test_adwin

    This demo inserts data into an ADWIN object and displays the indices
    at which change was detected.

    The data stream is simulated as a sequence of randomly generated 0's
    and 1's. Then the data from index 999 onward is changed to random
    integers in [0, 7].
    """
    adwin = ADWIN()
    size = 2000
    data_stream = np.random.randint(2, size=size)
    for i in range(999, size):
        data_stream[i] = np.random.randint(8)
    for i in range(size):
        adwin.add_element(data_stream[i])
        if adwin.detected_change():
            print('Change has been detected in data: ' + str(data_stream[i])
                  + ' - of index: ' + str(i))
def reset(self):
    """ reset

    Resets all the classifiers, as well as all the ADWIN change detectors.

    Returns
    -------
    LeverageBagging
        self

    """
    self.__configure(self.h, self.ensemble_length, self.w, self.delta,
                     self.enable_matrix_codes)
    self.adwin_ensemble = []
    for i in range(self.ensemble_length):
        self.adwin_ensemble.append(ADWIN(self.delta))
    self.n_detected_changes = 0
    self.classes = None
    self.init_matrix_codes = True
    return self
def __partial_fit(self, X, y):
    n_classes = len(self.classes)
    change = False

    # Lazily initialize the random output-code matrix (one row per member)
    if self.init_matrix_codes:
        self.matrix_codes = np.zeros((self.ensemble_length, len(self.classes)), dtype=int)
        for i in range(self.ensemble_length):
            n_zeros = 0
            n_ones = 0
            while (n_ones - n_zeros) * (n_ones - n_zeros) > self.ensemble_length % 2:
                n_zeros = 0
                n_ones = 0
                for j in range(len(self.classes)):
                    result = 0
                    if (j == 1) and (len(self.classes) == 2):
                        result = 1 - self.matrix_codes[i][0]
                    else:
                        result = np.random.randint(2)
                    self.matrix_codes[i][j] = result
                    if result == 1:
                        n_ones += 1
                    else:
                        n_zeros += 1
        self.init_matrix_codes = False

    detected_change = False
    X_cp, y_cp = cp.deepcopy(X), cp.deepcopy(y)
    for i in range(self.ensemble_length):
        # Compute the training weight k according to the chosen leveraging scheme
        k = 0.0
        if self.leveraging_algorithm == self.LEVERAGE_ALGORITHMS[0]:
            # Leveraging bag: k ~ Poisson(w)
            k = np.random.poisson(self.w)
        elif self.leveraging_algorithm == self.LEVERAGE_ALGORITHMS[1]:
            # Train on misclassified samples, or with probability error/(1-error)
            error = self.adwin_ensemble[i]._estimation
            pred = self.ensemble[i].predict(np.asarray([X]))
            if pred is None:
                k = 1.0
            elif pred[0] != y:
                k = 1.0
            elif np.random.rand() < (error / (1.0 - error)):
                k = 1.0
            else:
                k = 0.0
        elif self.leveraging_algorithm == self.LEVERAGE_ALGORITHMS[2]:
            # Use each instance with probability 1/2
            w = 1.0
            k = 0.0 if (np.random.randint(2) == 1) else w
        elif self.leveraging_algorithm == self.LEVERAGE_ALGORITHMS[3]:
            # Without taking out instances: k = 1 + Poisson(w)
            w = 1.0
            k = 1.0 + np.random.poisson(w)
        elif self.leveraging_algorithm == self.LEVERAGE_ALGORITHMS[4]:
            # Subagging: binary weight derived from Poisson(1)
            w = 1.0
            k = np.random.poisson(1)
            k = w if k > 0 else 0

        if k > 0:
            if self.enable_matrix_codes:
                y_cp = self.matrix_codes[i][int(y_cp)]
            for _ in range(int(k)):
                self.ensemble[i].partial_fit(np.asarray([X_cp]), np.asarray([y_cp]),
                                             self.classes)

        # Feed ADWIN with this member's correctness and look for a drift
        try:
            pred = self.ensemble[i].predict(np.asarray([X]))
            if pred is not None:
                add = 1 if (pred[0] == y_cp) else 0
                error = self.adwin_ensemble[i]._estimation
                self.adwin_ensemble[i].add_element(add)
                if self.adwin_ensemble[i].detected_change():
                    if self.adwin_ensemble[i]._estimation > error:
                        change = True
        except ValueError:
            change = False

    if change:
        # On drift: reset the member with the highest estimated error
        self.n_detected_changes += 1
        max_estimation = 0.0
        i_max = -1
        for i in range(self.ensemble_length):
            if max_estimation < self.adwin_ensemble[i]._estimation:
                max_estimation = self.adwin_ensemble[i]._estimation
                i_max = i
        if i_max != -1:
            self.ensemble[i_max].reset()
            self.adwin_ensemble[i_max] = ADWIN(self.delta)
    return self
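# A minimal usage sketch for the LeverageBagging ensemble whose private
# __partial_fit is shown above, assuming the constructor shown earlier and a
# public partial_fit(X, y, classes) wrapper; data and shapes are illustrative.
import numpy as np

lb = LeverageBagging(h=KNN(), ensemble_length=2)
X = np.random.rand(20, 5)
y = np.random.randint(2, size=20)
lb.partial_fit(X, y, classes=[0, 1])   # first call must pass all classes
print(lb.predict(np.random.rand(1, 5)))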
def __init__(self, initial_class_observations):
    LearningNodeNBAdaptive.__init__(self, initial_class_observations)
    self.estimationErrorWeight = ADWIN()
    self.ErrorChange = False
    self.randomSeed = 1
    # random.seed() returns None, so keep a seeded RandomState instead
    self.classifierRandom = check_random_state(self.randomSeed)
class AdaLearningNode(LearningNodeNBAdaptive, NewNode):

    def __init__(self, initial_class_observations):
        LearningNodeNBAdaptive.__init__(self, initial_class_observations)
        self.estimationErrorWeight = ADWIN()
        self.ErrorChange = False
        self.randomSeed = 1
        # random.seed() returns None, so keep a seeded RandomState instead
        self.classifierRandom = check_random_state(self.randomSeed)

    def calc_byte_size(self):
        byte_size = self.__sizeof__()
        if self.estimationErrorWeight is not None:
            byte_size += self.estimationErrorWeight.get_length_estimation()
        return byte_size

    # Override NewNode
    def number_leaves(self):
        return 1

    # Override NewNode
    def get_error_estimation(self):
        return self.estimationErrorWeight._estimation

    # Override NewNode
    def get_error_width(self):
        return self.estimationErrorWeight._width

    # Override NewNode
    def is_null_error(self):
        return self.estimationErrorWeight is None

    def kill_tree_childs(self, hat):
        pass

    # Override NewNode
    def learn_from_instance(self, X, y, weight, hat, parent, parent_branch):
        true_class = y
        # Online bagging: weight the instance by k ~ Poisson(1), drawn from the
        # node's seeded RandomState (not passed as numpy's `size` argument)
        k = self.classifierRandom.poisson(1.0)
        if k > 0:
            weight = weight * k
        tmp = self.get_class_votes(X, hat)
        class_prediction = get_max_value_index(tmp)
        bl_correct = (true_class == class_prediction)
        if self.estimationErrorWeight is None:
            self.estimationErrorWeight = ADWIN()
        old_error = self.get_error_estimation()
        # Add element to ADWIN: 0.0 for a correct prediction, 1.0 for an error
        add = 0.0 if bl_correct else 1.0
        self.estimationErrorWeight.add_element(add)
        # Detect change with ADWIN; ignore changes where the error decreased
        self.ErrorChange = self.estimationErrorWeight.detected_change()
        if self.ErrorChange and old_error > self.get_error_estimation():
            self.ErrorChange = False
        # Update statistics: delegate to LearningNodeNBAdaptive
        super().learn_from_instance(X, y, weight, hat)
        # Call ActiveLearningNode
        weight_seen = self.get_weight_seen()
        if weight_seen - self.get_weight_seen_at_last_split_evaluation() >= hat.grace_period:
            hat._attempt_to_split(self, parent, parent_branch)
            self.set_weight_seen_at_last_split_evaluation(weight_seen)

    # Override LearningNodeNBAdaptive
    def get_class_votes(self, X, ht):
        dist = {}
        prediction_option = ht.leaf_prediction
        if prediction_option == MAJORITY_CLASS:  # MC
            dist = self.get_observed_class_distribution()
        elif prediction_option == NAIVE_BAYES:  # NB
            dist = do_naive_bayes_prediction(X, self._observed_class_distribution,
                                             self._attribute_observers)
        else:  # NB Adaptive: pick whichever of MC/NB has been more accurate
            if self._mc_correct_weight > self._nb_correct_weight:
                dist = self.get_observed_class_distribution()
            else:
                dist = do_naive_bayes_prediction(X, self._observed_class_distribution,
                                                 self._attribute_observers)
        dist_sum = sum(dist.values())  # sum all values in dictionary
        if dist_sum * self.get_error_estimation() * self.get_error_estimation() > 0.0:
            normalize_values_in_dict(dist_sum * self.get_error_estimation()
                                     * self.get_error_estimation(), dist)
        return dist

    # Override NewNode, new for option votes
    def filter_instance_to_leaves(self, X, split_parent, parent_branch,
                                  update_splitter_counts, found_nodes=None):
        if found_nodes is None:
            found_nodes = []
        found_nodes.append(HoeffdingTree.FoundNode(self, split_parent, parent_branch))
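# The per-node pattern above (feed ADWIN a 0/1 error signal, flag a drift only
# when the error estimate got worse) can be exercised in isolation. This is a
# sketch with an illustrative synthetic error stream, not part of the tree code.
import numpy as np
from skmultiflow.classification.core.driftdetection.adwin import ADWIN

monitor = ADWIN()
errors = np.concatenate([np.random.binomial(1, 0.1, 500),    # ~10% error rate
                         np.random.binomial(1, 0.4, 500)])   # degrades to ~40%
for i, e in enumerate(errors):
    old = monitor._estimation
    monitor.add_element(float(e))
    if monitor.detected_change() and monitor._estimation > old:
        print('error increase detected at index', i)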
def learn_from_instance(self, X, y, weight, hat, parent, parent_branch):
    true_class = y
    class_prediction = 0
    leaf = self.filter_instance_to_leaf(X, parent, parent_branch).node
    if leaf is not None:
        class_prediction = get_max_value_index(leaf.get_class_votes(X, hat))
    bl_correct = (true_class == class_prediction)
    if self._estimation_error_weight is None:
        self._estimation_error_weight = ADWIN()
    old_error = self.get_error_estimation()
    # Add element to ADWIN: 0.0 for a correct prediction, 1.0 for an error
    add = 0.0 if bl_correct else 1.0
    self._estimation_error_weight.add_element(add)
    # Detect change with ADWIN; ignore changes where the error decreased
    self.error_change = self._estimation_error_weight.detected_change()
    if self.error_change and old_error > self.get_error_estimation():
        self.error_change = False
    # Check condition to build a new alternate tree
    if self.error_change:
        self._alternate_tree = hat._new_learning_node()
        hat._alternateTrees += 1
    # Condition to replace the alternate tree
    elif self._alternate_tree is not None and not self._alternate_tree.is_null_error():
        if (self.get_error_width() > error_width_threshold
                and self._alternate_tree.get_error_width() > error_width_threshold):
            old_error_rate = self.get_error_estimation()
            alt_error_rate = self._alternate_tree.get_error_estimation()
            f_delta = .05
            f_n = 1.0 / self._alternate_tree.get_error_width() + 1.0 / self.get_error_width()
            # Hoeffding-style bound on the difference of the two error rates
            bound = math.sqrt(2.0 * old_error_rate * (1.0 - old_error_rate)
                              * math.log(2.0 / f_delta) * f_n)
            # To check: bound seemingly never less than (old_error_rate - alt_error_rate)
            if bound < (old_error_rate - alt_error_rate):
                # The alternate tree is significantly better: swap it in
                hat._active_leaf_node_cnt -= self.number_leaves()
                hat._active_leaf_node_cnt += self._alternate_tree.number_leaves()
                self.kill_tree_childs(hat)
                if parent is not None:
                    parent.set_child(parent_branch, self._alternate_tree)
                else:
                    hat._tree_root = hat._tree_root.alternateTree
                hat._switchAlternateTrees += 1
            elif bound < alt_error_rate - old_error_rate:
                # The alternate tree is significantly worse: prune it
                if isinstance(self._alternate_tree, HAT.ActiveLearningNode):
                    self._alternate_tree = None
                elif isinstance(self._alternate_tree, HAT.InactiveLearningNode):
                    self._alternate_tree = None
                else:
                    self._alternate_tree.kill_tree_childs(hat)
                hat._prunedalternateTree += 1  # CHECK hat._pruned_alternate_trees
    # Learn from instance: alternate tree and child nodes
    if self._alternate_tree is not None:
        self._alternate_tree.learn_from_instance(X, y, weight, hat, parent, parent_branch)
    child_branch = self.instance_child_index(X)
    child = self.get_child(child_branch)
    if child is not None:
        child.learn_from_instance(X, y, weight, hat, parent, parent_branch)
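# The swap test above follows a Hoeffding-style bound:
#   bound = sqrt(2 * e_old * (1 - e_old) * ln(2 / delta) * (1/w_alt + 1/w_cur))
# and replaces the subtree only when e_old - e_alt exceeds it. A standalone
# sketch of that test, with illustrative error rates and window widths:
import math

def should_swap(e_old, e_alt, w_cur, w_alt, delta=0.05):
    f_n = 1.0 / w_alt + 1.0 / w_cur
    bound = math.sqrt(2.0 * e_old * (1.0 - e_old) * math.log(2.0 / delta) * f_n)
    return (e_old - e_alt) > bound

print(should_swap(e_old=0.30, e_alt=0.18, w_cur=400, w_alt=350))  # True: gap beats bound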
# Imports
import numpy as np
from skmultiflow.classification.core.driftdetection.adwin import ADWIN

adwin = ADWIN()

# Simulating a data stream as a sequence of random 0's and 1's
data_stream = np.random.randint(2, size=2000)
# Changing the data concept from index 999 onward: random integers in [4, 8)
for i in range(999, 2000):
    data_stream[i] = np.random.randint(4, high=8)

# Adding stream elements to ADWIN and verifying if drift occurred
for i in range(2000):
    adwin.add_element(data_stream[i])
    if adwin.detected_change():
        print('Change has been detected in data: ' + str(data_stream[i])
              + ' - of index: ' + str(i))
class AdaSplitNode(SplitNode, NewNode):

    def __init__(self, split_test, class_observations, size):
        SplitNode.__init__(self, split_test, class_observations, size)
        self._estimation_error_weight = ADWIN()
        # Deliberately None (not HoeffdingTree.Node()) so the alternate tree is
        # initialized via _new_learning_node when a drift is detected
        self._alternate_tree = None
        self.error_change = False
        self._random_seed = 1
        # random.seed() returns None, so keep a seeded RandomState instead
        self._classifier_random = check_random_state(self._random_seed)

    # Override SplitNode
    def calc_byte_size_including_subtree(self):
        byte_size = self.__sizeof__()
        if self._alternate_tree is not None:
            byte_size += self._alternate_tree.calc_byte_size_including_subtree()
        if self._estimation_error_weight is not None:
            byte_size += self._estimation_error_weight.get_length_estimation()
        for child in self._children:
            if child is not None:
                byte_size += child.calc_byte_size_including_subtree()
        return byte_size

    # Override NewNode
    def number_leaves(self):
        num_of_leaves = 0
        for child in self._children:
            if child is not None:
                num_of_leaves += child.number_leaves()
        return num_of_leaves

    # Override NewNode
    def get_error_estimation(self):
        return self._estimation_error_weight._estimation

    # Override NewNode
    def get_error_width(self):
        w = 0.0
        if not self.is_null_error():
            w = self._estimation_error_weight._width
        return w

    # Override NewNode
    def is_null_error(self):
        return self._estimation_error_weight is None

    # Override NewNode
    def learn_from_instance(self, X, y, weight, hat, parent, parent_branch):
        true_class = y
        class_prediction = 0
        leaf = self.filter_instance_to_leaf(X, parent, parent_branch).node
        if leaf is not None:
            class_prediction = get_max_value_index(leaf.get_class_votes(X, hat))
        bl_correct = (true_class == class_prediction)
        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()
        old_error = self.get_error_estimation()
        # Add element to ADWIN: 0.0 for a correct prediction, 1.0 for an error
        add = 0.0 if bl_correct else 1.0
        self._estimation_error_weight.add_element(add)
        # Detect change with ADWIN; ignore changes where the error decreased
        self.error_change = self._estimation_error_weight.detected_change()
        if self.error_change and old_error > self.get_error_estimation():
            self.error_change = False
        # Check condition to build a new alternate tree
        if self.error_change:
            self._alternate_tree = hat._new_learning_node()
            hat._alternateTrees += 1
        # Condition to replace the alternate tree
        elif self._alternate_tree is not None and not self._alternate_tree.is_null_error():
            if (self.get_error_width() > error_width_threshold
                    and self._alternate_tree.get_error_width() > error_width_threshold):
                old_error_rate = self.get_error_estimation()
                alt_error_rate = self._alternate_tree.get_error_estimation()
                f_delta = .05
                f_n = 1.0 / self._alternate_tree.get_error_width() + 1.0 / self.get_error_width()
                # Hoeffding-style bound on the difference of the two error rates
                bound = math.sqrt(2.0 * old_error_rate * (1.0 - old_error_rate)
                                  * math.log(2.0 / f_delta) * f_n)
                if bound < (old_error_rate - alt_error_rate):
                    # The alternate tree is significantly better: swap it in
                    hat._active_leaf_node_cnt -= self.number_leaves()
                    hat._active_leaf_node_cnt += self._alternate_tree.number_leaves()
                    self.kill_tree_childs(hat)
                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                    else:
                        hat._tree_root = hat._tree_root.alternateTree
                    hat._switchAlternateTrees += 1
                elif bound < alt_error_rate - old_error_rate:
                    # The alternate tree is significantly worse: prune it
                    if isinstance(self._alternate_tree, HAT.ActiveLearningNode):
                        self._alternate_tree = None
                    elif isinstance(self._alternate_tree, HAT.InactiveLearningNode):
                        self._alternate_tree = None
                    else:
                        self._alternate_tree.kill_tree_childs(hat)
                    hat._prunedalternateTree += 1  # CHECK hat._pruned_alternate_trees
        # Learn from instance: alternate tree and child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_from_instance(X, y, weight, hat, parent, parent_branch)
        child_branch = self.instance_child_index(X)
        child = self.get_child(child_branch)
        if child is not None:
            child.learn_from_instance(X, y, weight, hat, parent, parent_branch)

    # Override NewNode
    def kill_tree_childs(self, hat):
        for child in self._children:
            if child is not None:
                # Delete alternate tree if it exists
                if isinstance(child, HAT.AdaSplitNode) and child._alternate_tree is not None:
                    self._pruned_alternate_trees += 1
                # Recursive delete of SplitNodes
                if isinstance(child, HAT.AdaSplitNode):
                    child.kill_tree_childs(hat)
                if isinstance(child, HAT.ActiveLearningNode):
                    child = None
                    hat._active_leaf_node_cnt -= 1
                elif isinstance(child, HAT.InactiveLearningNode):
                    child = None
                    hat._inactive_leaf_node_cnt -= 1

    # Override NewNode
    def filter_instance_to_leaves(self, X, parent, parent_branch,
                                  update_splitter_counts, found_nodes=None):
        if found_nodes is None:
            found_nodes = []
        child_index = self.instance_child_index(X)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                child.filter_instance_to_leaves(X, parent, parent_branch,
                                                update_splitter_counts, found_nodes)
            else:
                found_nodes.append(HoeffdingTree.FoundNode(None, self, child_index))
        if self._alternate_tree is not None:
            self._alternate_tree.filter_instance_to_leaves(X, self, -999,
                                                           update_splitter_counts,
                                                           found_nodes)
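# A minimal usage sketch for the HAT tree these nodes belong to, assuming the
# HAT class referenced above is importable and exposes the usual
# partial_fit/predict API; data and shapes are illustrative.
import numpy as np

hat_tree = HAT()
X = np.random.rand(100, 5)
y = np.random.randint(2, size=100)
hat_tree.partial_fit(X, y, classes=[0, 1])
print(hat_tree.predict(np.random.rand(1, 5)))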
def __init__(self, initial_class_observations):
    LearningNodeNBAdaptive.__init__(self, initial_class_observations)
    self.estimationErrorWeight = ADWIN()
    self.ErrorChange = False
    self._randomSeed = 1
    self._classifier_random = check_random_state(self._randomSeed)
def reset(self):
    self.__configure(self.h, self.ensemble_length)
    self.adwin_ensemble = []
    for i in range(self.ensemble_length):
        self.adwin_ensemble.append(ADWIN())
def partial_fit(self, X, y, classes=None, weight=None):
    """ partial_fit

    Partially fits the model, based on the X and y matrix.

    Since it's an ensemble learner, if an X and y matrix with more than one
    sample is passed, the algorithm will partial fit the model one sample
    at a time.

    Each sample is trained by each classifier a total of K times, where K
    is drawn from a Poisson(1) distribution.

    Alongside updating the model, the learner will also update ADWIN's
    statistics over the new samples, so that the change detector can
    evaluate if a concept drift was detected. In the case drift is
    detected, the bagging algorithm will find the worst performing
    classifier and reset its statistics and window.

    Parameters
    ----------
    X: Numpy.ndarray of shape (n_samples, n_features)
        Features matrix used for partially updating the model.

    y: Array-like
        An array-like of all the class labels for the samples in X.

    classes: list
        List of all existing classes. This is an optional parameter, except
        for the first partial_fit call, when it becomes obligatory.

    weight: Array-like
        Instance weight. If not provided, uniform weights are assumed.

    Raises
    ------
    ValueError: A ValueError is raised if the 'classes' parameter is not
    passed in the first partial_fit call, or if it is passed in further
    calls but differs from the initial classes list.

    Returns
    -------
    OzaBaggingAdwin
        self

    """
    r, c = get_dimensions(X)
    if self.classes is None:
        if classes is None:
            raise ValueError("The first partial_fit call should pass all the classes.")
        else:
            self.classes = classes

    if self.classes is not None and classes is not None:
        if set(self.classes) != set(classes):
            raise ValueError("The classes passed to the partial_fit function "
                             "differ from those passed in an earlier moment.")

    self.__adjust_ensemble_size()

    change_detected = False
    for i in range(self.ensemble_length):
        # Online bagging: train each member k ~ Poisson(1) times on the batch
        k = np.random.poisson()
        if k > 0:
            for b in range(k):
                self.ensemble[i].partial_fit(X, y, classes, weight)

        # Feed ADWIN with this member's correctness and look for a drift
        try:
            pred = self.ensemble[i].predict(X)
            error_estimation = self.adwin_ensemble[i]._estimation
            for j in range(r):
                if pred[j] is not None:
                    if pred[j] == y[j]:
                        self.adwin_ensemble[i].add_element(1)
                    else:
                        self.adwin_ensemble[i].add_element(0)
            if self.adwin_ensemble[i].detected_change():
                if self.adwin_ensemble[i]._estimation > error_estimation:
                    change_detected = True
        except ValueError:
            change_detected = False

    if change_detected:
        # On drift: reset the member with the highest estimated error
        max_estimation = 0.0
        i_max = -1
        for i in range(self.ensemble_length):
            if max_estimation < self.adwin_ensemble[i]._estimation:
                max_estimation = self.adwin_ensemble[i]._estimation
                i_max = i
        if i_max != -1:
            self.ensemble[i_max].reset()
            self.adwin_ensemble[i_max] = ADWIN()
    return self
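# A minimal usage sketch for the partial_fit above, assuming the
# OzaBaggingAdwin constructor shown earlier; data and shapes are illustrative.
import numpy as np

oza = OzaBaggingAdwin(h=KNNAdwin(), ensemble_length=2)
X = np.random.rand(50, 4)
y = np.random.randint(2, size=50)
oza.partial_fit(X, y, classes=[0, 1])   # first call must pass all classes
print(oza.predict(np.random.rand(1, 4)))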
class KNNAdwin(KNN):
    """ K-Nearest Neighbors classifier with ADWIN change detection

    This classifier is an improvement over the regular KNN classifier, as
    it is resistant to concept drift. It utilises the ADWIN change detector
    to decide which samples to keep and which ones to forget, and by doing
    so it regulates the sample window size.

    To know more about the ADWIN change detector, please visit
    skmultiflow.classification.core.driftdetection.adwin

    It uses the regular KNN classifier as a base class, with the major
    difference that this class keeps a variable size window, instead of a
    fixed size one, and that it updates the ADWIN algorithm at each
    partial_fit call.

    Parameters
    ----------
    k: int
        The number of nearest neighbors to search for.

    max_window_size: int
        The maximum size of the window storing the last viewed samples.

    leaf_size: int
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch to a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    categorical_list: array-like
        Each entry is the index of a categorical feature. May be used for
        further filtering.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
    implemented since they have no application in this context.

    ValueError: A ValueError is raised if the predict function is called
    before at least k samples have been analyzed by the algorithm.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.classification.lazy.knn_adwin import KNNAdwin
    >>> from skmultiflow.classification.lazy.knn import KNN
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.options.file_option import FileOption
    >>> # Setting up the stream
    >>> opt = FileOption('FILE', 'OPT_NAME', 'skmultiflow/datasets/covtype.csv', 'csv', False)
    >>> stream = FileStream(opt, -1, 1)
    >>> stream.prepare_for_use()
    >>> # Setting up the KNNAdwin classifier
    >>> knn_adwin = KNNAdwin(k=8, leaf_size=40, max_window_size=2000)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_instance(200)
    >>> knn_adwin = knn_adwin.partial_fit(X, y)
    >>> # Keeping track of sample count and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_instance()
    ...     pred = knn_adwin.predict(X)
    ...     if y[0] == pred[0]:
    ...         corrects += 1
    ...     knn_adwin = knn_adwin.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying the results
    >>> print('KNNAdwin usage example')
    KNNAdwin usage example
    >>> print(str(n_samples) + ' samples analyzed.')
    5000 samples analyzed.
    >>> print("KNNAdwin's performance: " + str(corrects/n_samples))
    KNNAdwin's performance: 0.7798

    """

    def __init__(self, k=5, max_window_size=sys.maxsize, leaf_size=30, categorical_list=None):
        # Avoid a mutable default argument for categorical_list
        if categorical_list is None:
            categorical_list = []
        super().__init__(k=k,
                         max_window_size=max_window_size,
                         leaf_size=leaf_size,
                         categorical_list=categorical_list)
        self.adwin = ADWIN()
        self.window = None

    def reset(self):
        """ reset

        Resets the ADWIN algorithm as well as the base model
        kept by the KNN base class.

        Returns
        -------
        KNNAdwin
            self

        """
        self.adwin = ADWIN()
        return super().reset()

    def partial_fit(self, X, y, classes=None, weight=None):
        """ partial_fit

        Partially fits the model. This is done by updating the window
        with new samples while also updating the ADWIN algorithm. Then
        we verify if a change was detected, and if so, the window is
        correctly split at the drift moment.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: Not used.

        weight: Not used.

        Returns
        -------
        KNNAdwin
            self

        """
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.max_window_size)

        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            # Feed ADWIN with the correctness of the prediction for this sample
            if self.window._num_samples >= self.k:
                add = 1 if self.predict(np.asarray([X[i]])) == y[i] else 0
                self.adwin.add_element(add)
            else:
                self.adwin.add_element(0)

        if self.window._num_samples >= self.k:
            if self.adwin.detected_change():
                # Shrink the sample window down to ADWIN's width at the drift point
                if self.adwin._width < self.window._num_samples:
                    for i in range(self.window._num_samples, self.adwin._width, -1):
                        self.window.delete_element()
        return self
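# A standalone sketch of the drift-driven window shrink used above: when ADWIN
# cuts its internal window, the sample window is truncated to the same width.
# A plain list stands in for InstanceWindow here, purely for illustration.
from skmultiflow.classification.core.driftdetection.adwin import ADWIN

adwin = ADWIN()
buffer = []
for t in range(2000):
    value = 0 if t < 1000 else 1   # abrupt concept change at t = 1000
    buffer.append(value)
    adwin.add_element(value)
    if adwin.detected_change() and adwin._width < len(buffer):
        buffer = buffer[-adwin._width:]  # keep only the post-drift samples
print(len(buffer), adwin._width)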