def test_sim_crop(input_stream, file_name, crop_size=0):
    """Run ADWIN over a stream and plot a zoom around the first change point.

    The zoom window starts 100 samples before the first detected change and
    ends ``crop_size`` samples after it. Every change point that falls at or
    before the end of the window is drawn as a dashed red vertical line. The
    figure is saved to ``image/<file_name>_result_zoom.png`` and then shown.

    Parameters
    ----------
    input_stream : indexable sequence of numbers
        Values fed to ADWIN one by one, in order.
    file_name : str
        Stem used to build the name of the saved image.
    crop_size : int, optional (default=0)
        Number of samples kept after the first change point.

    Returns
    -------
    list of int
        Indices at which ADWIN reported a change. Empty when no change was
        detected, in which case nothing is plotted or saved.
    """
    adwin = ADWIN()
    change_point = []
    for i in range(len(input_stream)):
        adwin.add_element(input_stream[i])
        if adwin.detected_change():
            change_point.append(i)

    # Without a change point there is nothing to zoom on; the original code
    # crashed here with an IndexError on change_point[0].
    if not change_point:
        return change_point

    first_change = change_point[0]
    # Clamp both ends to the stream so that the slice and the x axis always
    # have the same length (a negative start would wrap around in the slice,
    # and an end past the stream would be silently truncated by the slice).
    start_point_crop = max(first_change - 100, 0)
    end_point_crop = min(first_change + crop_size, len(input_stream))

    for i in change_point:
        if i <= end_point_crop:
            plt.axvline(i, color='r', linestyle='dashed')

    crop_stream = input_stream[start_point_crop:end_point_crop]
    zoom_xi = list(range(start_point_crop, end_point_crop))
    plt.plot(zoom_xi, crop_stream)
    plt.ylabel('value')
    plt.xlabel('Time')

    fig = plt.gcf()
    fig.set_size_inches(10, 5.5)
    # 'aspect' is not a savefig option (recent matplotlib rejects unknown
    # keyword arguments), so it is no longer passed.
    plt.savefig(os.path.join('image', file_name + "_result_zoom.png"),
                bbox_inches='tight', dpi=200)
    plt.show()
    return change_point
def perform_drift_detection(predict_dataframe, dataframe, feature_names, detector, drift_notification, token="") -> str:
    """Run a per-feature drift detector over baseline + prediction data.

    The baseline rows (``dataframe``) are fed to the detector first, followed
    by the rows of ``predict_dataframe``. Only changes reported while the
    detector is consuming prediction rows are kept, and they are reported as
    row offsets relative to the start of the prediction data.

    Parameters
    ----------
    predict_dataframe : pandas.DataFrame
        Rows to check for drift.
    dataframe : pandas.DataFrame
        Baseline rows used to warm up the detector.
    feature_names : list of str
        Column names assigned to the concatenated data; one detector is run
        per name.
    detector : str
        One of ``"HDDM"``, ``"Page Hinkley"`` or ``"ADWIN"``. Any other value
        yields no drifts.
    drift_notification : bool
        When truthy, a web notification is submitted for a drifting feature.
    token : str, optional
        Passed to the logging / notification helpers.

    Returns
    -------
    str
        JSON object mapping each drifting feature name to the list of
        drifting row offsets.
    """
    log("[INFO] Calling perform_drift_detection", token)
    log("[INFO] Selected data drift detection method: " + detector)

    # One table instead of three copy-pasted loops.
    detector_factories = {
        "HDDM": HDDM_W,
        "Page Hinkley": PageHinkley,
        "ADWIN": ADWIN,
    }
    detector_factory = detector_factories.get(detector)
    if detector_factory is None:
        # Previously an unknown name silently produced an empty result.
        log("[WARNING] Unknown data drift detection method: " + detector)

    baseline_data = dataframe.values.tolist()
    predict_data = predict_dataframe.values.tolist()
    overall_dataframe = pd.DataFrame(baseline_data + predict_data,
                                     columns=feature_names)

    drifts = dict()
    # Number of warm-up rows; detections before this index are ignored.
    window = len(baseline_data)

    for feature in feature_names:
        detected_drifts_indices = list()

        if detector_factory is not None:
            drift_detector = detector_factory()
            # Materialise the column once instead of doing a label lookup on
            # the Series for every single row.
            for i, value in enumerate(overall_dataframe[feature].tolist()):
                drift_detector.add_element(float(value))
                if drift_detector.detected_change() and i >= window:
                    detected_drifts_indices.append(i - window)

        # Check for detected drifts
        if len(detected_drifts_indices) != 0:
            log("[INFO] Data drift detected in feature: " + feature)
            log("[INFO] The drifted rows are: " + str(detected_drifts_indices))
            drifts[feature] = detected_drifts_indices
            # NOTE(review): the nesting of this notification was reconstructed
            # from a flattened source; it is assumed to fire once per drifting
            # feature - confirm against the original layout.
            if drift_notification:
                log("[INFO] Sending a web notification", token)
                message = "MaaS data drift detected from " + get_token_user(
                    token) + " (" + token + ")"
                if submit_web_notification(message, token):
                    log("[INFO] Web notification sent!")
                else:
                    log("[ERROR] Error occurred while sending a web notification")

    return json.dumps(drifts, cls=NpEncoder)
def sim_adwin(input_stream, start_point=0):
    """Feed a stream to ADWIN (delta=0.3) and collect the change positions.

    Parameters
    ----------
    input_stream : indexable sequence of numbers
        Values handed to the detector in order.
    start_point : int, optional (default=0)
        Offset added to every reported index.

    Returns
    -------
    list
        ``index + start_point`` for every index at which ADWIN signalled a
        change.
    """
    detector = ADWIN(delta=.3)
    detections = []
    position = 0
    total = len(input_stream)
    while position < total:
        detector.add_element(input_stream[position])
        if detector.detected_change():
            detections.append(position + start_point)
        position += 1
    return detections
def cp_detection_ADWIN(points):
    """Detect change points in ``points`` with ADWIN, print and plot them.

    Each detection is printed together with the value that triggered it; the
    series and the detected indices are then displayed with
    ``rpt.show.display`` and the figure is shown.
    """
    from skmultiflow.drift_detection.adwin import ADWIN

    detector = ADWIN()
    found = []
    # Feed the stream one element at a time and record where drift is flagged.
    for idx in range(len(points)):
        detector.add_element(points[idx])
        if not detector.detected_change():
            continue
        found.append(idx)
        print('Change detected in data: ' + str(points[idx]) + ' - at index: ' + str(idx))

    rpt.show.display(points, found, figsize=(10, 6))
    plt.title('Change Point Detection: ADWIN')
    plt.show()
def test_adwin(test_path):
    """
    ADWIN drift detection test.

    Loads ``drift_stream.npy`` from ``test_path``, feeds it element by element
    to a default ADWIN detector and checks that changes are reported exactly
    at the expected indices.

    The first half of the stream contains a sequence corresponding to a normal
    distribution of integers from 0 to 1. From index 999 to 1999 the sequence
    is a normal distribution of integers from 0 to 7.
    """
    adwin = ADWIN()
    data_stream = np.load(os.path.join(test_path, 'drift_stream.npy'))

    detected_indices = []
    position = 0
    while position < data_stream.size:
        adwin.add_element(data_stream[position])
        if adwin.detected_change():
            detected_indices.append(position)
        position += 1

    expected_indices = [1023, 1055, 1087, 1151]
    assert detected_indices == expected_indices
def test_sim(input_stream, file_name):
    """Run ADWIN over a stream, log and plot every detected change.

    The whole stream is plotted, each detected change is marked with a dashed
    red vertical line and written to ``results/<file_name>.txt``. The figure
    is saved to ``image/<file_name>_result.png`` and then shown.

    Parameters
    ----------
    input_stream : indexable sequence of numbers
        Values fed to ADWIN one by one, in order.
    file_name : str
        Stem used for both the text report and the saved image.

    Returns
    -------
    list of int
        Indices at which ADWIN reported a change (the original version
        returned ``None``; callers that ignore the result are unaffected).
    """
    adwin = ADWIN()
    change_point = []
    plt.plot(input_stream)

    # The context manager closes the report even if the detector or the
    # plotting call raises midway through the stream.
    with open(os.path.join('results', file_name + ".txt"), "w+") as f:
        for i in range(len(input_stream)):
            adwin.add_element(input_stream[i])
            if adwin.detected_change():
                plt.axvline(i, color='r', linestyle='dashed')
                change_point.append(i)
                f.write('Change detected in data: ' + str(input_stream[i]) + ' - at index: ' + str(i) + '\n\n')

    plt.ylabel('value')
    plt.xlabel('Time')
    # 'aspect' is neither a savefig nor a show option; recent matplotlib
    # raises on unknown keyword arguments, so it is no longer passed.
    plt.savefig(os.path.join('image', file_name + "_result.png"),
                bbox_inches='tight', dpi=200)
    plt.show()
    return change_point
class AdaActiveLearningNodeRegressor(ActiveLearningNodePerceptron, AdaNode):
    """ Learning Node of the Hoeffding Adaptive Tree regressor.

    Always uses a linear perceptron model to provide predictions. The node
    monitors its own normalized prediction error with an ADWIN detector.

    Parameters
    ----------
    initial_stats: dict
        In regression tasks this dictionary carries the sufficient to perform
        online variance calculation. They refer to the number of observations
        (key '0'), the sum of the target values (key '1'), and the sum of the
        squared target values (key '2').
    parent_node: AdaLearningNodeForRegression (default=None)
        A node containing statistics about observed data.
    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    """

    def __init__(self, initial_stats=None, parent_node=None, random_state=None):
        super().__init__(initial_stats, parent_node, random_state)
        self._adwin = ADWIN()
        self._error_change = False

        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._n = 0

    @property
    def n_leaves(self):
        # A leaf always counts as exactly one leaf.
        return 1

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        return self._adwin.width

    def error_is_null(self):
        """Return True when the node has no ADWIN detector."""
        return self._adwin is None

    def kill_tree_children(self, hat):
        """Leaves have no children; nothing to do."""
        pass

    def learn_one(self, X, y, weight, tree, parent, parent_branch):
        """Update the error monitor and the leaf model with one sample."""
        prediction = self.predict_one(X, tree=tree)
        error = get_normalized_error(y, prediction, self)

        if tree.bootstrap_sampling:
            # Perform bootstrap-sampling
            n_copies = self._random_state.poisson(1.0)
            if n_copies > 0:
                weight = weight * n_copies

        if self._adwin is None:
            self._adwin = ADWIN()

        previous_error = self.error_estimation

        # Feed the detector, then ask whether the error distribution changed
        self._adwin.add_element(error)
        self._error_change = self._adwin.detected_change()

        # A change towards a smaller error is not treated as a change
        if self._error_change and previous_error > self.error_estimation:
            self._error_change = False

        # Update statistics
        super().learn_one(X, y, weight=weight, tree=tree)

        seen = self.total_weight
        if seen - self.last_split_attempt_at >= tree.grace_period:
            tree._attempt_to_split(self, parent, parent_branch)
            self.last_split_attempt_at = seen

    def predict_one(self, X, *, tree=None):
        """Predict with the target mean or defer to the parent class."""
        if tree.leaf_prediction == tree._TARGET_MEAN:
            if len(self._stats) > 0 and self._stats[0] > 0:
                return self._stats[1] / self._stats[0]
            return 0.0
        return super().predict_one(X, tree=tree)

    # New for option votes
    def filter_instance_to_leaves(self, X, y, weight, parent, parent_branch,
                                  update_splitter_counts, found_nodes=None):
        """Append this leaf (wrapped in a FoundNode) to ``found_nodes``."""
        if found_nodes is None:
            found_nodes = []
        reached = FoundNode(self, parent, parent_branch)
        found_nodes.append(reached)
class AdaSplitNodeForRegression(SplitNode, AdaNode):
    """ Node that splits the data in a Regression Hoeffding Adaptive Tree.

    The node monitors the normalized error of the subtree below it with an
    ADWIN detector, grows an alternate tree when the error changes and swaps
    or prunes that alternate tree depending on how the two errors compare.

    Parameters
    ----------
    split_test: skmultiflow.split_test.InstanceConditionalTest
        Split test.
    class_observations: dict
        In regression tasks this dictionary carries the sufficient to perform
        online variance calculation. They refer to the number of observations
        (key '0'), the sum of the target values (key '1'), and the sum of the
        squared target values (key '2').
    """

    def __init__(self, split_test, class_observations):
        super().__init__(split_test, class_observations)
        self._estimation_error_weight = ADWIN()
        self._alternate_tree = None
        self.error_change = False
        self._random_seed = 1
        self._classifier_random = check_random_state(self._random_seed)

    # Override AdaNode
    def number_leaves(self):
        """Return the total number of leaves reported by the children."""
        num_of_leaves = 0
        for child in self._children:
            if child is not None:
                num_of_leaves += child.number_leaves()
        return num_of_leaves

    # Override AdaNode
    def get_error_estimation(self):
        return self._estimation_error_weight.estimation

    # Override AdaNode
    def get_error_width(self):
        """Return the ADWIN window width, or 0.0 when there is no detector."""
        w = 0.0
        if self.is_null_error() is False:
            w = self._estimation_error_weight.width
        return w

    # Override AdaNode
    def is_null_error(self):
        return self._estimation_error_weight is None

    # Override AdaNode
    def learn_from_instance(self, X, y, weight, rhat, parent, parent_branch):
        """Update the error monitor, manage the alternate tree and recurse.

        Parameters
        ----------
        X, y, weight
            The sample, its target and its weight.
        rhat
            The tree object; used for predictions, counters and node creation.
        parent, parent_branch
            The parent of this node and the branch index leading to it
            (``parent`` is None for the root).
        """
        true_target = y
        normalized_error = 0.0

        if self.filter_instance_to_leaf(X, parent, parent_branch).node is not None:
            target_prediction = rhat.predict([X])[0]
            normalized_error = rhat.get_normalized_error(
                target_prediction, true_target)
        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        old_error = self.get_error_estimation()

        # Add element to Change detector
        self._estimation_error_weight.add_element(normalized_error)

        # Detect change
        self.error_change = self._estimation_error_weight.detected_change()

        # A change towards a smaller error is not treated as a change
        if self.error_change is True and old_error > self.get_error_estimation():
            self.error_change = False

        # Check condition to build a new alternate tree
        if self.error_change is True:
            self._alternate_tree = rhat._new_learning_node()
            rhat.alternate_trees_cnt += 1

        # Condition to replace alternate tree
        elif self._alternate_tree is not None and self._alternate_tree.is_null_error() is False:
            if self.get_error_width() > ERROR_WIDTH_THRESHOLD \
                    and self._alternate_tree.get_error_width() > ERROR_WIDTH_THRESHOLD:
                old_error_rate = self.get_error_estimation()
                alt_error_rate = self._alternate_tree.get_error_estimation()
                fDelta = .05
                fN = 1.0 / self._alternate_tree.get_error_width() + 1.0 / (
                    self.get_error_width())

                # The radicand is negative whenever the error estimate falls
                # outside [0, 1]; math.sqrt would raise ValueError, so the
                # bound falls back to 0.0 in that case.
                sq_term = 2.0 * old_error_rate * (1.0 - old_error_rate) * \
                    math.log(2.0 / fDelta) * fN
                bound = math.sqrt(sq_term) if sq_term > 0 else 0.0

                # To check, bound never less than (old_error_rate - alt_error_rate)
                if bound < (old_error_rate - alt_error_rate):
                    # The alternate tree is significantly better: swap it in
                    rhat._active_leaf_node_cnt -= self.number_leaves()
                    rhat._active_leaf_node_cnt += self._alternate_tree.number_leaves()
                    self.kill_tree_children(rhat)

                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                    else:
                        rhat._tree_root = rhat._tree_root._alternate_tree
                    rhat.switch_alternate_trees_cnt += 1
                elif bound < alt_error_rate - old_error_rate:
                    # The alternate tree is significantly worse: prune it
                    if isinstance(self._alternate_tree, ActiveLearningNode):
                        self._alternate_tree = None
                    # This branch used to repeat the ActiveLearningNode test,
                    # so an inactive alternate leaf fell through to the
                    # kill_tree_children call below.
                    elif isinstance(self._alternate_tree, InactiveLearningNode):
                        self._alternate_tree = None
                    else:
                        self._alternate_tree.kill_tree_children(rhat)
                    rhat.pruned_alternate_trees_cnt += 1  # hat.pruned_alternate_trees_cnt to check

        # Learn_From_Instance alternate Tree and Child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_from_instance(X, y, weight, rhat, parent,
                                                     parent_branch)
        child_branch = self.instance_child_index(X)
        child = self.get_child(child_branch)
        if child is not None:
            child.learn_from_instance(X, y, weight, rhat, self, child_branch)
        # Instance contains a categorical value previously unseen by the split
        # node
        elif isinstance(self.get_split_test(), NominalAttributeMultiwayTest) and \
                self.get_split_test().branch_for_instance(X) < 0:
            # Creates a new learning node to encompass the new observed feature
            # value
            leaf_node = rhat._new_learning_node()
            branch_id = self.get_split_test().add_new_branch(
                X[self.get_split_test().get_atts_test_depends_on()[0]])
            self.set_child(branch_id, leaf_node)
            rhat._active_leaf_node_cnt += 1
            leaf_node.learn_from_instance(X, y, weight, rhat, parent, parent_branch)

    # Override AdaNode
    def kill_tree_children(self, rhat):
        """Walk the children and update the tree's leaf counters.

        NOTE(review): ``child = None`` only rebinds the loop variable, it does
        not remove the child from ``self._children``; and
        ``self._pruned_alternate_trees`` is not initialised in this class's
        ``__init__`` - confirm that a base class provides it.
        """
        for child in self._children:
            if child is not None:
                # Delete alternate tree if it exists
                if isinstance(child, rhat.AdaSplitNodeForRegression) \
                        and child._alternate_tree is not None:
                    self._pruned_alternate_trees += 1
                # Recursive delete of SplitNodes
                if isinstance(child, rhat.AdaSplitNodeForRegression):
                    child.kill_tree_children(rhat)

                if isinstance(child, ActiveLearningNode):
                    child = None
                    rhat._active_leaf_node_cnt -= 1
                elif isinstance(child, InactiveLearningNode):
                    child = None
                    rhat._inactive_leaf_node_cnt -= 1

    # override AdaNode
    def filter_instance_to_leaves(self, X, y, weight, parent, parent_branch,
                                  update_splitter_counts=False, found_nodes=None):
        """Collect every leaf reached by ``X``, including alternate trees.

        Results are appended to ``found_nodes``. When
        ``update_splitter_counts`` is truthy, the node's target statistics are
        updated with the sample first.
        """
        if found_nodes is None:
            found_nodes = []
        if update_splitter_counts:
            try:
                self._observed_class_distribution[0] += weight
                self._observed_class_distribution[1] += y * weight
                self._observed_class_distribution[2] += y * y * weight
            except KeyError:
                # First sample seen by this node: create the three entries
                self._observed_class_distribution[0] = weight
                self._observed_class_distribution[1] = y * weight
                self._observed_class_distribution[2] = y * y * weight

        child_index = self.instance_child_index(X)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                child.filter_instance_to_leaves(X, y, weight, parent, parent_branch,
                                                update_splitter_counts, found_nodes)
            else:
                found_nodes.append(FoundNode(None, self, child_index))
        if self._alternate_tree is not None:
            # -999 is passed as the branch index for the alternate tree
            self._alternate_tree.filter_instance_to_leaves(
                X, y, weight, self, -999, update_splitter_counts, found_nodes)
class AdaLearningNodeForRegression(LearningNodePerceptron, NewNode):
    """Perceptron leaf that tracks its normalized error with ADWIN."""

    def __init__(self, initial_class_observations, perceptron_weight, random_state=None):
        super().__init__(initial_class_observations, perceptron_weight, random_state)
        self._estimation_error_weight = ADWIN()
        self._error_change = False
        self._randomSeed = 1
        self._classifier_random = check_random_state(self._randomSeed)

    def calc_byte_size(self):
        """Return ``__sizeof__()`` plus the detector's length estimation."""
        detector = self._estimation_error_weight
        if detector is None:
            return self.__sizeof__()
        return self.__sizeof__() + detector.get_length_estimation()

    # Override NewNode
    def number_leaves(self):
        # A leaf always counts as exactly one leaf.
        return 1

    # Override NewNode
    def get_error_estimation(self):
        return self._estimation_error_weight.estimation

    # Override NewNode
    def get_error_width(self):
        return self._estimation_error_weight.width

    # Override NewNode
    def is_null_error(self):
        return self._estimation_error_weight is None

    def kill_tree_children(self, hat):
        """Leaves have no children; nothing to do."""
        pass

    # Override NewNode
    def learn_from_instance(self, X, y, weight, rhat, parent, parent_branch):
        """Train the leaf, update the error monitor and maybe try a split."""
        super().learn_from_instance(X, y, weight, rhat)

        predicted = rhat.predict([X])[0]
        error = rhat.get_normalized_error(predicted, y)

        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        previous_error = self.get_error_estimation()

        # Feed the detector, then ask whether the error distribution changed
        self._estimation_error_weight.add_element(error)
        self._error_change = self._estimation_error_weight.detected_change()

        # A change towards a smaller error is not treated as a change
        if self._error_change is True and previous_error > self.get_error_estimation():
            self._error_change = False

        # call ActiveLearningNode
        seen = self.get_weight_seen()
        since_last_attempt = seen - self.get_weight_seen_at_last_split_evaluation()
        if since_last_attempt >= rhat.grace_period:
            rhat._attempt_to_split(self, parent, parent_branch)
            self.set_weight_seen_at_last_split_evaluation(seen)

    # Override NewNode, New for option votes
    def filter_instance_to_leaves(self, X, y, weight, parent, parent_branch,
                                  update_splitter_counts, found_nodes=None):
        """Append this leaf (wrapped in a FoundNode) to ``found_nodes``."""
        if found_nodes is None:
            found_nodes = []
        reached = HoeffdingTree.FoundNode(self, parent, parent_branch)
        found_nodes.append(reached)
# One step of a predict / detect-drift / retrain cycle over three-month test
# windows.
# NOTE(review): this fragment was recovered from a flattened source; the
# enclosing loop header is not visible and the nesting below (in particular
# the position of adwin.reset()) is reconstructed - confirm against the
# original layout.
end_test_date = start_test_date + pd.DateOffset(months = 3)
X_train, y_train, X_test, y_test = xgboost_model.generate_data(data, start_train_date, end_train_date, start_test_date, end_test_date, verbose=1)
# Only refit when the previous step asked for it
if training_flag:
    xgboost_model.fit_model(X_train, y_train)
results_dict = xgboost_model.compute_predictions(X_test, y_test)
label_transformed = transform_label(results_dict['y_true'][-1], results_dict['Predictions'][-1])
# Dates at which ADWIN flagged a change within this test window
temp_drifts = []
for i in range(label_transformed.shape[0]):
    adwin.add_element(label_transformed['conversion'].iloc[i])
    if adwin.detected_change():
        print('Change detected ADWIN in data: ' + str(results_dict['y_true'][-1].iloc[i]) + ' - at date: ' + str(results_dict['Date'][-1].iloc[i]))
        temp_drifts.append(results_dict['Date'][-1].iloc[i])
        adwin.reset()
if not temp_drifts:
    # No drift: keep the current model and move the test window forward
    print('No Drift Detected - Predict next three months')
    start_test_date = start_test_date + pd.DateOffset(months = 3)
    training_flag = False
if temp_drifts:
    # Drift: record the first drift date and restart training two years
    # before it
    print('Drift detected - Retrain model')
    list_drift.append(temp_drifts[0])
    start_train_date = temp_drifts[0] - pd.DateOffset(years = 2)
# Simulate a data stream of size 1000 from a Standard normal distribution stream = np.random.randn(1000) stream[:10] # Output: #array([-1.0856306 , 0.99734545, 0.2829785 , -1.50629471, -0.57860025, # 1.65143654, -2.42667924, -0.42891263, 1.26593626, -0.8667404 ]) # Data concept are changed from index 599 to 999 for j in range(599, 1000): stream[j] = np.random.randint(5, high=9) # Stream elements are added to ADWIN and checking whether drift occured for j in range(1000): A.add_element(stream[j]) if A.detected_change(): print('Concept Drift detected in data: ' + str(stream[j]) + ' - at index: ' + str(j)) ### Output: #Concept Drift detected in data: 8.0 - at index: 607 #Concept Drift detected in data: 5.0 - at index: 639 #Concept Drift detected in data: 6.0 - at index: 671 ######## ### DDM code import numpy as np from skmultiflow.drift_detection import DDM # call the DDM object
class AdaSplitNodeRegressor(AdaSplitNode):
    """ Node that splits the data in a Hoeffding Adaptive Tree regressor.

    Parameters
    ----------
    split_test: skmultiflow.split_test.InstanceConditionalTest
        Split test.
    stats: dict
        In regression tasks this dictionary carries the sufficient to perform
        online variance calculation. They refer to the number of observations
        (key '0'), the sum of the target values (key '1'), and the sum of the
        squared target values (key '2').
    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    """

    def __init__(self, split_test, stats=None, random_state=None):
        super().__init__(split_test, stats, random_state)
        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._n = 0

    # Override AdaSplitNode
    def learn_one(self, X, y, weight, tree, parent, parent_branch):
        """Update the error monitor, manage the alternate tree and recurse.

        Parameters
        ----------
        X, y, weight
            The sample, its target and its weight.
        tree
            The tree object; used for counters, thresholds and node creation.
        parent, parent_branch
            The parent of this node and the branch index leading to it
            (``parent`` is None for the root).
        """
        normalized_error = 0.0

        leaf = self.filter_instance_to_leaf(X, parent, parent_branch).node
        if leaf is not None:
            y_pred = leaf.predict_one(X, tree=tree)
            normalized_error = get_normalized_error(y, y_pred, self)
        if self._adwin is None:
            self._adwin = ADWIN()

        old_error = self.error_estimation

        # Add element to change detector
        self._adwin.add_element(normalized_error)

        # Detect change
        self.error_change = self._adwin.detected_change()

        # A change towards a smaller error is not treated as a change
        if self.error_change and old_error > self.error_estimation:
            self.error_change = False

        # Check condition to build a new alternate tree
        if self.error_change:
            self._alternate_tree = tree._new_learning_node()
            tree.alternate_trees_cnt += 1

        # Condition to replace alternate tree
        elif self._alternate_tree is not None and not self._alternate_tree.error_is_null():
            if self.error_width > tree._ERROR_WIDTH_THRESHOLD \
                    and self._alternate_tree.error_width > tree._ERROR_WIDTH_THRESHOLD:
                old_error_rate = self.error_estimation
                alt_error_rate = self._alternate_tree.error_estimation
                fDelta = .05
                fN = 1.0 / self._alternate_tree.error_width + 1.0 / self.error_width

                # Guard the square root: the radicand is negative whenever the
                # error estimate falls outside [0, 1]
                sq_term = 2.0 * old_error_rate * (1.0 - old_error_rate) * math.log(2.0 / fDelta) \
                    * fN
                bound = math.sqrt(sq_term) if sq_term > 0 else 0.0

                if bound < (old_error_rate - alt_error_rate):
                    # The alternate tree is significantly better: swap it in
                    tree._active_leaf_node_cnt -= self.n_leaves
                    tree._active_leaf_node_cnt += self._alternate_tree.n_leaves
                    self.kill_tree_children(tree)

                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                    else:
                        tree._tree_root = tree._tree_root._alternate_tree
                    tree.switch_alternate_trees_cnt += 1
                elif bound < alt_error_rate - old_error_rate:
                    # The alternate tree is significantly worse: prune it
                    if isinstance(self._alternate_tree, ActiveLeaf):
                        self._alternate_tree = None
                    elif isinstance(self._alternate_tree, InactiveLeaf):
                        self._alternate_tree = None
                    else:
                        self._alternate_tree.kill_tree_children(tree)
                    tree.pruned_alternate_trees_cnt += 1  # hat.pruned_alternate_trees_cnt to check

        # Learn one sample in alternate tree and child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_one(X, y, weight, tree, parent, parent_branch)
        child_branch = self.instance_child_index(X)
        child = self.get_child(child_branch)
        if child is not None:
            child.learn_one(X, y, weight, tree, parent=self, parent_branch=child_branch)
        # Instance contains a categorical value previously unseen by the split node
        else:
            # Creates a new learning node to encompass the new observed feature
            # value
            leaf_node = tree._new_learning_node()
            branch_id = self.split_test.add_new_branch(
                X[self.split_test.get_atts_test_depends_on()[0]])
            self.set_child(branch_id, leaf_node)
            tree._active_leaf_node_cnt += 1
            leaf_node.learn_one(X, y, weight, tree, parent, parent_branch)

    def predict_one(self, X, *, tree=None):
        """Return the mean target seen by this node, or 0.0 without data.

        Called in case an emerging categorical feature has no path down the
        split node to be sorted.
        """
        # Also require a positive observation weight so that an initialised
        # but empty statistics dict cannot cause a division by zero.
        if len(self.stats) > 0 and self.stats[0] > 0:
            return self.stats[1] / self.stats[0]
        return 0.0

    # override AdaNode
    def filter_instance_to_leaves(self, X, y, weight, parent, parent_branch,
                                  update_splitter_counts=False, found_nodes=None):
        """Collect every leaf reached by ``X``, including alternate trees.

        Results are appended to ``found_nodes``. When
        ``update_splitter_counts`` is truthy, the node's target statistics are
        updated with the sample first.
        """
        if found_nodes is None:
            found_nodes = []
        if update_splitter_counts:
            try:
                self._stats[0] += weight
                self._stats[1] += y * weight
                self._stats[2] += y * y * weight
            except KeyError:
                # First sample seen by this node: create the three entries
                self._stats[0] = weight
                self._stats[1] = y * weight
                self._stats[2] = y * y * weight

        child_index = self.instance_child_index(X)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                child.filter_instance_to_leaves(X, y, weight, parent, parent_branch,
                                                update_splitter_counts, found_nodes)
            else:
                found_nodes.append(FoundNode(None, self, child_index))
        if self._alternate_tree is not None:
            # -999 is passed as the branch index for the alternate tree
            self._alternate_tree.filter_instance_to_leaves(
                X, y, weight, self, -999, update_splitter_counts, found_nodes)
class SADWINIsolationForestStream(BaseSKMObject, ClassifierMixin):
    """ Isolation-forest anomaly detector for streams with ADWIN-driven updates.

    This code implements Anomaly Detection Approach Based on Isolation Forest
    Algorithm for Streaming Data Using Sliding Window (Ding & Fei, 2013) [3]

    Each sample has an anomaly score is computed based on Isolation Forest
    anomaly based approach [2]. The concept of Isolation forest [1] consists on
    isolating observations by randomly selecting a feature and then randomly
    selecting a split value between the maximum and minimum values of the
    selected feature.

    The stream is cut into consecutive windows of ``window_size`` samples. The
    first complete window is used to fit the forest. Afterwards, when
    ``version == "SADWIN"``, the anomaly scores of each completed window are
    fed to an ADWIN detector; if ADWIN reports a change, the previous model is
    completely discarded and a new isolation forest is built on that window.

    Parameters
    ---------
    window_size: int, optional (default=100)
        The window size of the stream. 'Psi' in the original paper.

    n_estimators: int, optional (default=25)
        Number of trees in the ensemble. 't' in the original paper.

    anomaly_threshold: double, optional (default=0.5)
        The threshold for declaring anomalies. Any instance prediction
        probability above this threshold will be declared as an anomaly.

    drift_threshold: double, optional (default=0.5)
        Stored on the instance; the SADWIN update path implemented here relies
        on ADWIN rather than on this threshold.

    random_state: optional (default=None)
        Passed to the isolation tree ensemble.

    version: str, optional (default="AnomalyRate")
        Only the value "SADWIN" enables the ADWIN-based model update.

    n_estimators_updated: float, optional (default=0.5)
        Fraction (> 0 and <= 1) of the trees to recompute on update; stored as
        an absolute number of trees.

    updated_randomly: bool, optional (default=True)
        True to choose the trees randomly, False to pick the first
        (n_estimators - int(n_estimators * n_estimators_updated)) trees.

    alpha, data, n_dimensions, n_tested_samples, fixed_checked_dimension,
    fixed_checked_sample:
        Parameters for NDKSWIN. All but ``data`` are stored on the instance;
        none of them is used by the methods of this class.

    Other Attributes
    ----------------
    ensemble : Isolation Tree Ensemble
        Contain an Isolation Tree Ensemble object, current model for
        IsolationForestStream

    samples_seen : int
        Number of samples processed so far

    anomaly_rate : float
        Rate of the anomalies in the previous sliding window (AnomalyRate in
        the original paper iForestASD)

    prec_window & window : numpy.ndarray of shape (n_samples, n_features)
        The previous and current window of data

    model_update, model_update_windows : list
        After a first label entry, one entry per evaluated window: 1/0
        depending on whether the model was rebuilt, and the number of samples
        seen at that point.

    References
    ----------
    [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
        Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.

    [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based
        anomaly detection." ACM Transactions on Knowledge Discovery from Data
        (TKDD) 6.1 (2012)

    [3] Ding, Zhiguo. (2013) An Anomaly Detection Approach Based on Isolation
        Forest Algorithm for Streaming Data Using Sliding Window. 12-17.
        10.3182/20130902-3-CN-3020.00044.
    """

    def __init__(
            self,
            window_size=100,
            n_estimators=25,
            anomaly_threshold=0.5,
            drift_threshold=0.5,
            random_state=None,
            version="AnomalyRate",
            # Parameters for partial model update
            n_estimators_updated=0.5,
            updated_randomly=True,
            # Parameters for NDKSWIN
            alpha=0.01,
            data=None,
            n_dimensions=1,
            n_tested_samples=0.1,
            fixed_checked_dimension=False,
            fixed_checked_sample=False):

        super().__init__()

        # Validate before the value is used to size the partial update.
        if n_estimators_updated <= 0.0 or n_estimators_updated > 1.0:
            raise ValueError("n_estimators_updated must be > 0 and <= 1")

        self.n_estimators = n_estimators
        self.ensemble = None
        self.random_state = random_state
        self.window_size = window_size
        self.samples_seen = 0
        self.anomaly_rate = 0.20
        self.anomaly_threshold = anomaly_threshold
        self.drift_threshold = drift_threshold
        self.window = None
        self.prec_window = None
        self.cpt = 0
        self.version = version

        # Update history: the first entry of each list is a label identifying
        # the version, later entries are appended once per evaluated window.
        self.model_update = []
        self.model_update_windows = []
        self.model_update.append(version)
        self.model_update_windows.append("samples_seen_" + version)

        # Number of new trees to compute when updating on a new window
        self.n_estimators_updated = int(self.n_estimators * n_estimators_updated)
        # True to choose the trees randomly, False to pick the first
        # (n_estimators - int(n_estimators * n_estimators_updated)) trees
        self.updated_randomly = updated_randomly

        self.alpha = alpha
        self.n_dimensions = n_dimensions
        self.n_tested_samples = n_tested_samples
        self.fixed_checked_dimension = fixed_checked_dimension
        self.fixed_checked_sample = fixed_checked_sample
        self.first_time_fit = True

        # TODO Maurras 27112020: Find a way to optimize the use of ADWIN()
        self.adwin = ADWIN()

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially (incrementally) fit the model.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            The features to train the model.

        y: numpy.ndarray of shape (n_samples) or None
            An array-like with the class labels of all samples in X. The
            labels are not used for training, so None is accepted.

        classes: None
            Not used by this method.

        sample_weight: None
            Not used by this method.

        Returns
        -------
        self
        """
        # get the number of observations
        number_instances, _ = X.shape

        if self.samples_seen == 0:
            # ToDo ? Give a sample of self.window_size in attribute of iForest
            iforest = IsolationTreeEnsemble(self.window_size, self.n_estimators,
                                            self.random_state)
            self.ensemble = iforest

        for i in range(number_instances):
            # The model is unsupervised: tolerate a missing label array
            # instead of failing on None[i].
            label = y[i] if y is not None else None
            self._partial_fit(X[i], label)

        return self

    def _partial_fit(self, X, y):
        """ Process one sample: maintain the windows and update the model.

        The sample is appended to the current window. Whenever a window has
        just been completed, the forest is fitted on it (first window) or, for
        the "SADWIN" version, its anomaly scores are fed to ADWIN and the
        model is rebuilt if a change is detected.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_features,)
            Instance attributes.

        y: int
            Class label for sample X. Not used in this implementation, which
            is unsupervised.
        """
        X = np.reshape(X, (1, len(X)))

        if self.samples_seen % self.window_size == 0:
            # Start a new window; the completed one becomes the previous window
            self.prec_window = self.window
            self.window = X
        else:
            self.window = np.concatenate((self.window, X))

        if self.samples_seen % self.window_size == 0 and self.samples_seen != 0:
            if self.first_time_fit:
                # It is the first complete window
                self.ensemble.fit(self.prec_window)
                self.first_time_fit = False
            elif self.version == "SADWIN":
                prec_window_scores = self.ensemble.anomaly_score(self.prec_window)
                drift_detected = False
                for score in prec_window_scores:
                    self.adwin.add_element(score[0])
                    if self.adwin.detected_change():
                        drift_detected = True
                        # One detection is enough to trigger the update
                        break

                if drift_detected:
                    self.model_update.append(1)
                    self.model_update_windows.append(self.samples_seen)
                    self.update_model(self.prec_window)
                    # Start monitoring the new model from scratch
                    self.adwin.reset()
                else:
                    self.model_update.append(0)
                    self.model_update_windows.append(self.samples_seen)

        self.samples_seen += 1

    def update_model(self, window):
        """ Discard the current forest and fit a new one on ``window``.

        Parameters
        ----------
        window: numpy.ndarray of shape (self.window_size, n_features)
            The data used to build the new isolation forest. The whole window
            is given to the ensemble.
        """
        self.is_learning_phase_on = True
        iforest = IsolationTreeEnsemble(self.window_size, self.n_estimators,
                                        self.random_state)
        self.ensemble = iforest
        self.ensemble.fit(window)
        print("")
        print(
            "The model was updated by training a new iForest with the version : "
            + self.version)

    def anomaly_scores_rate(self, window):
        """ Return the fraction of instances of ``window`` scored as anomalies.

        An instance counts as an anomaly when its score is strictly greater
        than ``self.anomaly_threshold``.

        Parameters
        ----------
        window: numpy.ndarray of shape (self.window_size, n_features)
        """
        score_tab = 2.0**(-1.0 * self.ensemble.path_length(window) / c(len(window)))
        score = 0
        for x in score_tab:
            if x > self.anomaly_threshold:
                score += 1
        return score / len(score_tab)

    # MAJ : 21112020 By : Maurras
    # Add new function to classify instances (anomaly or normal)
    def predict_simple(self, X):
        """ Classify every instance of ``X`` (1 or 0) from its anomaly score.

        Uses ``predict_from_instances_scores`` of the current ensemble with
        ``self.anomaly_threshold``.
        """
        prediction = self.ensemble.predict_from_instances_scores(
            self.ensemble.anomaly_score(X), self.anomaly_threshold)
        return prediction

    def predict(self, X):
        """ Predict whether the given instance is an anomaly (1 or 0).

        The instance is appended to the previous window and the prediction is
        based on the score of the last element of that window. Returns
        ``[-1]`` while no more than ``window_size`` samples have been seen.
        """
        if self.samples_seen <= self.window_size:
            return [-1]

        X = np.reshape(X, (1, len(X[0])))
        # Append the instance to the sliding window
        self.prec_window = np.concatenate((self.prec_window, X))

        prediction = self.ensemble.predict_from_anomaly_scores(
            self.predict_proba(self.prec_window), self.anomaly_threshold)

        return [prediction]

    def predict_proba(self, X):
        """ Return the anomaly score of the last instance of the previous window.

        Returns ``[-1]`` while no more than ``window_size`` samples have been
        seen. Note that the score is computed on ``self.prec_window``; the
        ``X`` argument itself is not read.

        Parameters
        ----------
        X: numpy.ndarray of shape (self.window_size, n_features)
        """
        if self.samples_seen <= self.window_size:
            return [-1]
        # anomaly_score returns the scores of all rows; [-1] takes the last one
        return self.ensemble.anomaly_score(self.prec_window)[-1]
class AdaSplitNode(SplitNode, NewNode):
    """Split node that tracks its own error with ADWIN and may grow an
    alternate subtree.

    Every instance routed through the node updates an ADWIN estimator with a
    0/1 error. When ADWIN signals a change (and the error did not decrease),
    a new learning node is started as alternate tree. Once both estimators
    have seen enough data, the alternate tree either replaces this node or is
    pruned, depending on a bound computed from the two error rates.

    Parameters
    ----------
    split_test:
        Split test forwarded to SplitNode.
    class_observations:
        Class observations forwarded to SplitNode.
    """

    def __init__(self, split_test, class_observations):
        super().__init__(split_test, class_observations)
        self._estimation_error_weight = ADWIN()
        self._alternate_tree = None
        self.error_change = False
        self._random_seed = 1
        self._classifier_random = check_random_state(self._random_seed)

    # Override NewNode
    def number_leaves(self):
        """Return the number of leaves below this node (None children are skipped)."""
        return sum(child.number_leaves() for child in self._children
                   if child is not None)

    # Override NewNode
    def get_error_estimation(self):
        """Return the current ADWIN estimation of this node's error."""
        return self._estimation_error_weight.estimation

    # Override NewNode
    def get_error_width(self):
        """Return the ADWIN window width, or 0.0 when no estimator is set."""
        if self.is_null_error():
            return 0.0
        return self._estimation_error_weight.width

    # Override NewNode
    def is_null_error(self):
        """Return True when the node has no error estimator."""
        return self._estimation_error_weight is None

    # Override NewNode
    def learn_from_instance(self, X, y, weight, hat, parent, parent_branch):
        """Update the error estimator with (X, y), manage the alternate tree
        and propagate the instance to the alternate tree and matching child.

        Parameters
        ----------
        X: instance attributes.
        y: true class of the instance.
        weight: instance weight, forwarded unchanged.
        hat: the owning tree; its counters and root are updated here.
        parent: parent of this node, or None when this node is the root.
        parent_branch: branch index of this node in `parent`.
        """
        true_class = y
        class_prediction = 0

        leaf = self.filter_instance_to_leaf(X, parent, parent_branch)
        if leaf.node is not None:
            class_prediction = get_max_value_key(
                leaf.node.get_class_votes(X, hat))

        bl_correct = (true_class == class_prediction)

        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        old_error = self.get_error_estimation()

        # Add element to ADWIN.
        # Truthiness, not `is True`: comparing NumPy scalars yields
        # numpy.bool_, for which `is True` is always False, so every instance
        # would be recorded as an error.
        add = 0.0 if bl_correct else 1.0
        self._estimation_error_weight.add_element(add)

        # Detect change with ADWIN; a change towards a lower error is ignored
        self.error_change = self._estimation_error_weight.detected_change()
        if self.error_change and old_error > self.get_error_estimation():
            self.error_change = False

        # Check condition to build a new alternate tree
        if self.error_change:
            self._alternate_tree = hat._new_learning_node()
            hat.alternate_trees_cnt += 1

        # Condition to replace alternate tree
        elif self._alternate_tree is not None \
                and self._alternate_tree.is_null_error() is False:
            # error_width_threshold is a name defined outside this class
            if self.get_error_width() > error_width_threshold \
                    and self._alternate_tree.get_error_width() > error_width_threshold:
                old_error_rate = self.get_error_estimation()
                alt_error_rate = self._alternate_tree.get_error_estimation()
                fDelta = .05
                fN = 1.0 / self._alternate_tree.get_error_width() + 1.0 / (
                    self.get_error_width())

                bound = math.sqrt(2.0 * old_error_rate *
                                  (1.0 - old_error_rate) *
                                  math.log(2.0 / fDelta) * fN)
                # To check, bound never less than (old_error_rate - alt_error_rate)
                if bound < (old_error_rate - alt_error_rate):
                    # The alternate tree is clearly better: swap it in.
                    hat._active_leaf_node_cnt -= self.number_leaves()
                    hat._active_leaf_node_cnt += \
                        self._alternate_tree.number_leaves()
                    self.kill_tree_children(hat)

                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                    else:
                        # Switch tree root. The attribute is `_alternate_tree`;
                        # the former `alternateTree` does not exist on this
                        # class and raised AttributeError.
                        hat._tree_root = hat._tree_root._alternate_tree
                    hat.switch_alternate_trees_cnt += 1
                elif bound < alt_error_rate - old_error_rate:
                    # The alternate tree is clearly worse: prune it.
                    if isinstance(
                            self._alternate_tree,
                            (GRF_HoeffdingAdaptiveTree.ActiveLearningNode,
                             GRF_HoeffdingAdaptiveTree.InactiveLearningNode)):
                        self._alternate_tree = None
                    else:
                        self._alternate_tree.kill_tree_children(hat)
                    hat.pruned_alternate_trees_cnt += 1  # hat.pruned_alternate_trees_cnt to check

        # Learn_From_Instance alternate Tree and Child nodes
        # NOTE(review): both calls below forward this node's own
        # parent/parent_branch rather than (self, child_branch); left
        # unchanged - confirm that this is intended.
        if self._alternate_tree is not None:
            self._alternate_tree.learn_from_instance(X, y, weight, hat,
                                                     parent, parent_branch)
        child_branch = self.instance_child_index(X)
        child = self.get_child(child_branch)
        if child is not None:
            child.learn_from_instance(X, y, weight, hat, parent,
                                      parent_branch)

    # Override NewNode
    def kill_tree_children(self, hat):
        """Recursively account for the removal of every node below this one.

        Leaf counters on `hat` are decremented for each learning-node child;
        alternate trees hanging off split-node children are killed as well.
        """
        for child in self._children:
            if child is None:
                continue
            if isinstance(child, GRF_HoeffdingAdaptiveTree.AdaSplitNode):
                # Delete alternate tree if it exists
                if child._alternate_tree is not None:
                    child._alternate_tree.kill_tree_children(hat)
                    # Counted on the tree, like the other pruning site in this
                    # class; `self._pruned_alternate_trees` is never
                    # initialised by this class.
                    hat.pruned_alternate_trees_cnt += 1
                # Recursive delete of SplitNodes
                child.kill_tree_children(hat)

            if isinstance(child,
                          GRF_HoeffdingAdaptiveTree.ActiveLearningNode):
                hat._active_leaf_node_cnt -= 1
            elif isinstance(child,
                            GRF_HoeffdingAdaptiveTree.InactiveLearningNode):
                hat._inactive_leaf_node_cnt -= 1

    # override NewNode
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts=False,
                                  found_nodes=None):
        """Collect into `found_nodes` every leaf reached by X, including
        those of the alternate tree.

        Results are appended to `found_nodes` in place; nothing is returned.
        When `update_splitter_counts` is true, `weight` is added to this
        node's observed class distribution for class `y`.
        """
        if found_nodes is None:
            found_nodes = []
        if update_splitter_counts:
            try:
                self._observed_class_distribution[y] += weight  # Dictionary (class_value, weight)
            except KeyError:
                self._observed_class_distribution[y] = weight
        child_index = self.instance_child_index(X)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                child.filter_instance_to_leaves(X, y, weight, parent,
                                                parent_branch,
                                                update_splitter_counts,
                                                found_nodes)
            else:
                found_nodes.append(
                    HoeffdingTree.FoundNode(None, self, child_index))
        if self._alternate_tree is not None:
            self._alternate_tree.filter_instance_to_leaves(
                X, y, weight, self, -999, update_splitter_counts, found_nodes)
# Purpose: detect concept drifts in the case execution time, in order to
# decide the period covered by the training set.
import pandas as pd
from matplotlib import pyplot as plt
from skmultiflow.drift_detection.adwin import ADWIN

df = pd.read_csv('data/bpic2012_cet.csv')

# Feed the execution times to ADWIN one row at a time and remember the index
# of every row at which a change is reported.
adwin = ADWIN()
drift_ind = []
for idx, row in df.iterrows():
    cet = row['case_execution_time_seconds']
    adwin.add_element(cet)
    if not adwin.detected_change():
        continue
    print('Change detected in data: {} - at index: {}'.format(cet, idx))
    drift_ind.append(idx)

# Plot the series and mark each detected drift with a dashed vertical line.
plt.plot(df['case_execution_time_seconds'])
for i in drift_ind:
    plt.axvline(i, color='black', linestyle='--', linewidth=1)
plt.show()
class AdaLearningNodeForRegression(ActiveLearningNodePerceptron, AdaNode):
    """Learning node of the regression Hoeffding Adaptive Tree that always
    answers with a linear perceptron model.

    Parameters
    ----------
    initial_class_observations: dict
        Sufficient statistics for online variance calculation: number of
        observations (key '0'), sum of the target values (key '1') and sum
        of the squared target values (key '2').
    parent_node: AdaLearningNodeForRegression (default=None)
        A node containing statistics about observed data.
    random_state: int, RandomState instance or None, optional (default=None)
        If int, the seed used by the random number generator; if RandomState
        instance, the random number generator itself; if None, the
        RandomState instance used by `np.random`.
    """

    def __init__(self,
                 initial_class_observations,
                 parent_node,
                 random_state=None):
        super().__init__(initial_class_observations, parent_node,
                         random_state)
        self._estimation_error_weight = ADWIN()
        self._error_change = False

        # Running extremes of the absolute error, used to map observed
        # errors into the [0, 1] range
        self._min_error = float('Inf')
        self._max_error = float('-Inf')

    # Override AdaNode
    def number_leaves(self):
        """A learning node is a single leaf."""
        return 1

    # Override AdaNode
    def get_error_estimation(self):
        """Return the current ADWIN estimation of the node's error."""
        return self._estimation_error_weight.estimation

    # Override AdaNode
    def get_error_width(self):
        """Return the width of the ADWIN window."""
        return self._estimation_error_weight.width

    # Override AdaNode
    def is_null_error(self):
        """Return True when the node has no error estimator."""
        return self._estimation_error_weight is None

    def kill_tree_children(self, hat):
        """Nothing to do: this node has no children."""
        pass

    # Override AdaNode
    def learn_from_instance(self, X, y, weight, rhat, parent, parent_branch):
        """Train on (X, y), feed the normalized error to ADWIN and attempt a
        split once `rhat.grace_period` more weight has been seen."""
        super().learn_from_instance(X, y, weight, rhat)

        prediction = rhat.predict([X])[0]
        error = self.get_normalized_error(y, prediction)

        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        error_before = self.get_error_estimation()

        # Add element to Adwin
        detector = self._estimation_error_weight
        detector.add_element(error)

        # Detect change with Adwin; a change towards a lower error is ignored
        changed = detector.detected_change()
        if changed and error_before > self.get_error_estimation():
            changed = False
        self._error_change = changed

        # call ActiveLearningNode
        seen = self.get_weight_seen()
        since_last_eval = seen - self.get_weight_seen_at_last_split_evaluation()
        if since_last_eval >= rhat.grace_period:
            rhat._attempt_to_split(self, parent, parent_branch)
            self.set_weight_seen_at_last_split_evaluation(seen)

    # Override AdaNode, New for option votes
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts,
                                  found_nodes=None):
        """Append this node (as a FoundNode) to `found_nodes`."""
        nodes = [] if found_nodes is None else found_nodes
        nodes.append(FoundNode(self, parent, parent_branch))

    def get_normalized_error(self, y, y_pred):
        """Return |y - y_pred| scaled by the error range observed so far;
        0.0 while the minimum and maximum observed errors coincide."""
        deviation = abs(y - y_pred)

        # Incremental maintenance of the normalization ranges
        if deviation < self._min_error:
            self._min_error = deviation
        if deviation > self._max_error:
            self._max_error = deviation

        if self._min_error == self._max_error:
            return 0.0
        return (deviation - self._min_error) / (self._max_error -
                                                self._min_error)
eda_discretizado = discretizar(ppa_result)
# Only the line style goes in the format string. The former 'k-' also named a
# colour (black) that conflicted with color='blue' and made matplotlib warn;
# the line was, and still is, drawn in blue.
plt.plot(eda_discretizado, '-', color='blue', label="Discretizado (SAX)")
plt.title("Señal EDA (Discretizado)")
plt.xlabel("Tiempo (Minutos)")
plt.ylabel("µS")
plt.legend(loc="upper right")
#plt.show()

# 4) Change detection (classification)
from skmultiflow.drift_detection.adwin import ADWIN

adwin = ADWIN(delta=0.01)
cambios_detectados_x = []
cambios_detectados_y = []
for i in range(len(eda_discretizado)):
    valor = eda_discretizado[i]  # read once instead of indexing three times
    adwin.add_element(valor)
    if adwin.detected_change():
        print('Change detected in data: ' + str(valor) + ' - at index: ' +
              str(i))
        cambios_detectados_x.append(i)
        cambios_detectados_y.append(valor)

# Mark every detected change on top of the discretized signal.
plt.plot(cambios_detectados_x,
         cambios_detectados_y,
         'x',
         label="Cambio detectado")
plt.legend(loc="upper right")
plt.show()
# Add date of updating to update_dates xgboost_model.results['Update'].append(end_train_date) results_dict = xgboost_model.compute_predictions(X_test, y_test) temp_drifts = [] df_results = pd.DataFrame({ 'y_true': results_dict['y_true'][-1], 'y_pred': results_dict['Predictions'][-1] }) df_results['Correct'] = (df_results['y_true'] == df_results['y_pred']) for i in range(df_results.shape[0]): adwin.add_element(df_results['Correct'].iloc[i]) if adwin.detected_change(): print('Change detected ADWIN in data: ' + str(df_results['Correct'].iloc[i]) + ' - at date: ' + str(results_dict['Date'][-1].iloc[i])) temp_drifts.append(results_dict['Date'][-1].iloc[i]) adwin.reset() if not temp_drifts: print('No Drift Detected - Predict next three months') start_test_date = start_test_date + pd.DateOffset(months=3) training_flag = False update_flag = False if temp_drifts: print('Drift detected - Choice on model')
class KNNAdwin(KNN):
    """ K-Nearest Neighbors Classifier with ADWIN Change detector

    This Classifier is an improvement from the regular KNN classifier,
    as it is resistant to concept drift. It utilises the ADWIN change
    detector to decide which samples to keep and which ones to forget,
    and by doing so it regulates the sample window size.

    To know more about the ADWIN change detector, please visit
    skmultiflow.classification.core.drift_detection.adwin

    It uses the regular KNN Classifier as a base class, with the
    major difference that this class keeps a variable size window,
    instead of a fixed size one and also it updates the adwin algorithm
    at each partial_fit call.

    Parameters
    ----------
    k: int
        The number of nearest neighbors to search for.

    max_window_size: int
        The maximum size of the window storing the last viewed samples.

    leaf_size: int
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch for a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    categorical_list: An array-like or None (default=None)
        Each entry is the index of a categorical feature. May be requested
        further filtering. None is treated as an empty list.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
    implemented since they have no application in this context.

    ValueError: A ValueError is raised if the predict function is called
    before at least k samples have been analyzed by the algorithm.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.classification.lazy.knn_adwin import KNNAdwin
    >>> from skmultiflow.classification.lazy.knn import KNN
    >>> from skmultiflow.data.file_stream import FileStream
    >>> # Setting up the stream
    >>> stream = FileStream('skmultiflow/datasets/covtype.csv', -1, 1)
    >>> stream.prepare_for_use()
    >>> # Setting up the KNNAdwin classifier
    >>> knn_adwin = KNNAdwin(k=8, leaf_size=40, max_window_size=2000)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn_adwin = knn_adwin.partial_fit(X, y)
    >>> # Keeping track of sample count and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     pred = knn_adwin.predict(X)
    ...     if y[0] == pred[0]:
    ...         corrects += 1
    ...     knn_adwin = knn_adwin.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying the results
    >>> print('KNN usage example')
    >>> print(str(n_samples) + ' samples analyzed.')
    5000 samples analyzed.
    >>> print("KNNAdwin's performance: " + str(corrects/n_samples))
    KNNAdwin's performance: 0.7798

    """

    def __init__(self,
                 k=5,
                 max_window_size=sys.maxsize,
                 leaf_size=30,
                 categorical_list=None):
        # A literal `[]` default would be one list object shared by every
        # instance built without the argument; create a fresh list instead.
        if categorical_list is None:
            categorical_list = []
        super().__init__(k=k,
                         max_window_size=max_window_size,
                         leaf_size=leaf_size,
                         categorical_list=categorical_list)
        self.adwin = ADWIN()
        self.window = None

    def reset(self):
        """ reset

        Resets the adwin algorithm as well as the base model
        kept by the KNN base class.

        Returns
        -------
        KNNAdwin
            self

        """
        self.adwin = ADWIN()
        return super().reset()

    def partial_fit(self, X, y, classes=None, weight=None):
        """ partial_fit

        Partially fits the model. This is done by updating the window
        with new samples while also updating the adwin algorithm. Then
        we verify if a change was detected, and if so, the window is
        correctly split at the drift moment.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: Not used.

        weight: Not used.

        Returns
        -------
        KNNAdwin
            self

        """
        r, _ = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.max_window_size)

        for i in range(r):
            # Samples are stored one at a time, whatever the batch size.
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            if self.window._num_samples >= self.k:
                # Feed ADWIN 1 for a correct prediction, 0 otherwise.
                add = 1 if self.predict(np.asarray([X[i]])) == y[i] else 0
                self.adwin.add_element(add)
            else:
                # Fewer than k samples stored: no prediction is made.
                self.adwin.add_element(0)

        if self.window._num_samples >= self.k:
            changed = self.adwin.detected_change()

            if changed:
                if self.adwin._width < self.window._num_samples:
                    # Shrink the sample window to ADWIN's current width by
                    # calling delete_element once per surplus sample.
                    for _ in range(self.window._num_samples,
                                   self.adwin._width, -1):
                        self.window.delete_element()
        return self
class AdaLearningNode(LearningNodeNBAdaptive, NewNode):
    """Adaptive learning node that monitors its own 0/1 error with ADWIN.

    Parameters
    ----------
    initial_class_observations:
        Initial class observations forwarded to LearningNodeNBAdaptive.
    """

    def __init__(self, initial_class_observations):
        super().__init__(initial_class_observations)
        self._estimation_error_weight = ADWIN()
        self.error_change = False
        self._randomSeed = 1
        self._classifier_random = check_random_state(self._randomSeed)

    # Override NewNode
    def number_leaves(self):
        """A learning node is a single leaf."""
        return 1

    # Override NewNode
    def get_error_estimation(self):
        """Return the current ADWIN estimation of the node's error."""
        return self._estimation_error_weight.estimation

    # Override NewNode
    def get_error_width(self):
        """Return the width of the ADWIN window."""
        return self._estimation_error_weight.width

    # Override NewNode
    def is_null_error(self):
        """Return True when the node has no error estimator."""
        return self._estimation_error_weight is None

    def kill_tree_children(self, hat):
        """Nothing to do: this node has no children."""
        pass

    # Override NewNode
    def learn_from_instance(self, X, y, weight, hat, parent, parent_branch):
        """Update the error estimator and the node statistics with (X, y),
        then attempt a split once `hat.grace_period` more weight was seen.

        Parameters
        ----------
        X: instance attributes.
        y: true class of the instance.
        weight: instance weight.
        hat: the owning tree.
        parent: parent of this node.
        parent_branch: branch index of this node in `parent`.
        """
        true_class = y

        # k = self._classifier_random.poisson(1.0)
        # if k > 0:
        #     weight = weight * k

        tmp = self.get_class_votes(X, hat)

        class_prediction = get_max_value_key(tmp)

        bl_correct = (true_class == class_prediction)

        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        old_error = self.get_error_estimation()

        # Add element to Adwin.
        # Truthiness, not `is True`: comparing NumPy scalars yields
        # numpy.bool_, for which `is True` is always False, so every instance
        # would be recorded as an error.
        add = 0.0 if bl_correct else 1.0

        self._estimation_error_weight.add_element(add)
        # Detect change with Adwin; a change towards a lower error is ignored
        self.error_change = self._estimation_error_weight.detected_change()

        if self.error_change and old_error > self.get_error_estimation():
            self.error_change = False

        # Update statistics
        super().learn_from_instance(X, y, weight, hat)

        # call ActiveLearningNode
        weight_seen = self.get_weight_seen()

        if weight_seen - self.get_weight_seen_at_last_split_evaluation(
        ) >= hat.grace_period:
            hat._attempt_to_split(self, parent, parent_branch)
            self.set_weight_seen_at_last_split_evaluation(weight_seen)

    # Override LearningNodeNBAdaptive
    def get_class_votes(self, X, ht):
        """Return the class votes for X, scaled by the node's error estimate.

        The base distribution depends on `ht.leaf_prediction`: the observed
        class distribution (majority class), a naive Bayes prediction, or
        whichever of the two has the larger correct weight so far. When
        sum(votes) * error**2 is positive, the votes are normalized by it.
        """
        prediction_option = ht.leaf_prediction
        # MC
        if prediction_option == MAJORITY_CLASS:
            dist = self.get_observed_class_distribution()
        # NB
        elif prediction_option == NAIVE_BAYES:
            dist = do_naive_bayes_prediction(
                X, self._observed_class_distribution,
                self._attribute_observers)
        # NBAdaptive
        else:
            if self._mc_correct_weight > self._nb_correct_weight:
                dist = self.get_observed_class_distribution()
            else:
                dist = do_naive_bayes_prediction(
                    X, self._observed_class_distribution,
                    self._attribute_observers)

        dist_sum = sum(dist.values())  # sum all values in dictionary
        error_estimation = self.get_error_estimation()
        normalization_factor = dist_sum * error_estimation * error_estimation

        if normalization_factor > 0.0:
            # Normalize a copy: `dist` may be the node's own observed class
            # distribution (get_observed_class_distribution is not visible
            # here and may return the live dict), and rescaling that in
            # place would corrupt the statistics used for learning.
            dist = dict(dist)
            normalize_values_in_dict(dist, normalization_factor)

        return dist

    # Override NewNode, New for option votes
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts,
                                  found_nodes=None):
        """Append this node (as a FoundNode) to `found_nodes`."""
        if found_nodes is None:
            found_nodes = []
        found_nodes.append(
            HoeffdingTree.FoundNode(self, parent, parent_branch))
class AdaLearningNodeForRegression(ActiveLearningNodePerceptron, AdaNode):
    """Learning node of the regression Hoeffding Adaptive Tree that always
    answers with a linear perceptron model.

    Parameters
    ----------
    initial_class_observations: dict
        Sufficient statistics for online variance calculation: number of
        observations (key '0'), sum of the target values (key '1') and sum
        of the squared target values (key '2').
    perceptron_weight: np.ndarray(n_features) or None, optional (default=None)
        The weights for the linear models. If not passed, uniform values in
        the range [-1, 1] are used.
    random_state: int, RandomState instance or None, optional (default=None)
        If int, the seed used by the random number generator; if RandomState
        instance, the random number generator itself; if None, the
        RandomState instance used by `np.random`.
    """

    def __init__(self,
                 initial_class_observations,
                 perceptron_weight,
                 random_state=None):
        super().__init__(initial_class_observations, perceptron_weight,
                         random_state)
        self._estimation_error_weight = ADWIN()
        self._error_change = False
        self._randomSeed = 1
        self._classifier_random = check_random_state(self._randomSeed)

    # Override AdaNode
    def number_leaves(self):
        """A learning node is a single leaf."""
        return 1

    # Override AdaNode
    def get_error_estimation(self):
        """Return the current ADWIN estimation of the node's error."""
        return self._estimation_error_weight.estimation

    # Override AdaNode
    def get_error_width(self):
        """Return the width of the ADWIN window."""
        return self._estimation_error_weight.width

    # Override AdaNode
    def is_null_error(self):
        """Return True when the node has no error estimator."""
        return self._estimation_error_weight is None

    def kill_tree_children(self, hat):
        """Nothing to do: this node has no children."""
        pass

    # Override AdaNode
    def learn_from_instance(self, X, y, weight, rhat, parent, parent_branch):
        """Train on (X, y), feed the tree-normalized error to ADWIN and
        attempt a split once `rhat.grace_period` more weight was seen."""
        super().learn_from_instance(X, y, weight, rhat)

        prediction = rhat.predict([X])[0]
        error = rhat.get_normalized_error(prediction, y)

        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        error_before = self.get_error_estimation()

        # Add element to Adwin
        detector = self._estimation_error_weight
        detector.add_element(error)

        # Detect change with Adwin; a change towards a lower error is ignored
        changed = detector.detected_change()
        if changed is True and error_before > self.get_error_estimation():
            changed = False
        self._error_change = changed

        # call ActiveLearningNode
        seen = self.get_weight_seen()
        since_last_eval = seen - self.get_weight_seen_at_last_split_evaluation()
        if since_last_eval >= rhat.grace_period:
            rhat._attempt_to_split(self, parent, parent_branch)
            self.set_weight_seen_at_last_split_evaluation(seen)

    # Override AdaNode, New for option votes
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts,
                                  found_nodes=None):
        """Append this node (as a FoundNode) to `found_nodes`."""
        nodes = [] if found_nodes is None else found_nodes
        nodes.append(FoundNode(self, parent, parent_branch))
    gdf_ais.within(port_area.values[0].envelope), :]
# NOTE(review): the line above is the end of a statement that starts outside
# this view; it is reproduced unchanged.

# Number of signals inside the port area for every distinct hour
gdf_aisByHourPortOfInterest = []
for hour in uniqueHours:
    gdf_aisByHourPortOfInterest.append(signalsWithinPortArea.loc[
        signalsWithinPortArea['hour'] == hour].shape[0])

# x axis: position of each hour in uniqueHours
hours = range(0, len(uniqueHours), 1)
plt.plot(hours, gdf_aisByHourPortOfInterest)
plt.show()

## Q5
print(
    "******************************* Q5 *************************************")
## reference - lab 7
import numpy as np
from skmultiflow.drift_detection.adwin import ADWIN

# Feed the hourly counts to ADWIN and report every index at which it signals
# a change
adwin = ADWIN()
for i in hours:
    adwin.add_element(gdf_aisByHourPortOfInterest[i])
    if adwin.detected_change():
        print('Change detected in data: ' +
              str(gdf_aisByHourPortOfInterest[i]) + ' - at index: ' + str(i))

## Q6
print(
    "******************************* Q6 *************************************")
# Clustering ports based on message density. We are using data from Q1 where
from sklearn.cluster import DBSCAN
class AdaSplitNodeForRegression(SplitNode, NewNode):
    """Regression split node that tracks its own error with ADWIN and may
    grow an alternate subtree.

    Every instance routed through the node updates an ADWIN estimator with a
    normalized error. When ADWIN signals a change (and the error did not
    decrease), a new learning node is started as alternate tree. Once both
    estimators have seen enough data, the alternate tree either replaces this
    node or is pruned, depending on a bound computed from the two error
    rates.

    Parameters
    ----------
    split_test:
        Split test forwarded to SplitNode.
    class_observations:
        Observations forwarded to SplitNode.
    """

    def __init__(self, split_test, class_observations):
        super().__init__(split_test, class_observations)
        self._estimation_error_weight = ADWIN()
        self._alternate_tree = None
        self.error_change = False
        self._random_seed = 1
        self._classifier_random = check_random_state(self._random_seed)

    # Override SplitNode
    def calc_byte_size_including_subtree(self):
        """Return the size of this node plus its alternate tree, its error
        estimator and all of its children."""
        byte_size = self.__sizeof__()
        if self._alternate_tree is not None:
            byte_size += \
                self._alternate_tree.calc_byte_size_including_subtree()
        if self._estimation_error_weight is not None:
            byte_size += self._estimation_error_weight.get_length_estimation()
        for child in self._children:
            if child is not None:
                byte_size += child.calc_byte_size_including_subtree()
        return byte_size

    # Override NewNode
    def number_leaves(self):
        """Return the number of leaves below this node (None children are skipped)."""
        return sum(child.number_leaves() for child in self._children
                   if child is not None)

    # Override NewNode
    def get_error_estimation(self):
        """Return the current ADWIN estimation of this node's error."""
        return self._estimation_error_weight.estimation

    # Override NewNode
    def get_error_width(self):
        """Return the ADWIN window width, or 0.0 when no estimator is set."""
        if self.is_null_error():
            return 0.0
        return self._estimation_error_weight.width

    # Override NewNode
    def is_null_error(self):
        """Return True when the node has no error estimator."""
        return self._estimation_error_weight is None

    # Override NewNode
    def learn_from_instance(self, X, y, weight, rhat, parent, parent_branch):
        """Update the error estimator with (X, y), manage the alternate tree
        and propagate the instance to the alternate tree and matching child.

        Parameters
        ----------
        X: instance attributes.
        y: true target of the instance.
        weight: instance weight, forwarded unchanged.
        rhat: the owning tree; its counters and root are updated here.
        parent: parent of this node, or None when this node is the root.
        parent_branch: branch index of this node in `parent`.
        """
        true_target = y
        normalized_error = 0.0

        if self.filter_instance_to_leaf(X, parent,
                                        parent_branch).node is not None:
            target_prediction = rhat.predict([X])[0]
            normalized_error = rhat.get_normalized_error(
                target_prediction, true_target)
        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        old_error = self.get_error_estimation()

        # Add element to Change detector
        self._estimation_error_weight.add_element(normalized_error)

        # Detect change; a change towards a lower error is ignored
        self.error_change = self._estimation_error_weight.detected_change()

        if self.error_change and old_error > self.get_error_estimation():
            self.error_change = False

        # Check condition to build a new alternate tree
        if self.error_change:
            self._alternate_tree = rhat._new_learning_node()
            rhat.alternate_trees_cnt += 1

        # Condition to replace alternate tree.
        # A debug print used to sit here; it fired for every instance while an
        # alternate tree existed, whether or not a replacement happened.
        elif self._alternate_tree is not None \
                and self._alternate_tree.is_null_error() is False:
            # error_width_threshold is a name defined outside this class
            if self.get_error_width() > error_width_threshold \
                    and self._alternate_tree.get_error_width() > error_width_threshold:
                old_error_rate = self.get_error_estimation()
                alt_error_rate = self._alternate_tree.get_error_estimation()
                fDelta = .05
                fN = 1.0 / self._alternate_tree.get_error_width() + 1.0 / (
                    self.get_error_width())

                bound = math.sqrt(2.0 * old_error_rate *
                                  (1.0 - old_error_rate) *
                                  math.log(2.0 / fDelta) * fN)
                # To check, bound never less than (old_error_rate - alt_error_rate)
                if bound < (old_error_rate - alt_error_rate):
                    # The alternate tree is clearly better: swap it in.
                    rhat._active_leaf_node_cnt -= self.number_leaves()
                    rhat._active_leaf_node_cnt += \
                        self._alternate_tree.number_leaves()
                    self.kill_tree_children(rhat)

                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                    else:
                        rhat._tree_root = rhat._tree_root._alternate_tree
                    rhat.switch_alternate_trees_cnt += 1
                elif bound < alt_error_rate - old_error_rate:
                    # The alternate tree is clearly worse: prune it. The
                    # second branch used to test ActiveLearningNode twice, so
                    # an inactive learning node was never dropped.
                    if isinstance(self._alternate_tree,
                                  (HoeffdingTree.ActiveLearningNode,
                                   HoeffdingTree.InactiveLearningNode)):
                        self._alternate_tree = None
                    else:
                        self._alternate_tree.kill_tree_children(rhat)
                    rhat.pruned_alternate_trees_cnt += 1  # hat.pruned_alternate_trees_cnt to check

        # Learn_From_Instance alternate Tree and Child nodes
        # NOTE(review): both calls below forward this node's own
        # parent/parent_branch rather than (self, child_branch); left
        # unchanged - confirm that this is intended.
        if self._alternate_tree is not None:
            self._alternate_tree.learn_from_instance(X, y, weight, rhat,
                                                     parent, parent_branch)
        child_branch = self.instance_child_index(X)
        child = self.get_child(child_branch)
        if child is not None:
            child.learn_from_instance(X, y, weight, rhat, parent,
                                      parent_branch)

    # Override NewNode
    def kill_tree_children(self, rhat):
        """Recursively account for the removal of every node below this one.

        Leaf counters on `rhat` are decremented for each learning-node child
        and one pruning is counted per split-node child that owns an
        alternate tree.
        """
        for child in self._children:
            if child is None:
                continue
            if isinstance(child, rhat.AdaSplitNodeForRegression):
                # Delete alternate tree if it exists
                if child._alternate_tree is not None:
                    # Counted on the tree, like the other pruning site in this
                    # class; `self._pruned_alternate_trees` is never
                    # initialised by this class.
                    rhat.pruned_alternate_trees_cnt += 1
                # Recursive delete of SplitNodes
                child.kill_tree_children(rhat)

            if isinstance(child, HoeffdingTree.ActiveLearningNode):
                rhat._active_leaf_node_cnt -= 1
            elif isinstance(child, HoeffdingTree.InactiveLearningNode):
                rhat._inactive_leaf_node_cnt -= 1

    # override NewNode
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts=False,
                                  found_nodes=None):
        """Collect into `found_nodes` every leaf reached by X, including
        those of the alternate tree.

        Results are appended to `found_nodes` in place; nothing is returned.
        When `update_splitter_counts` is true, the node statistics are
        updated: key 0 accumulates the weight, key 1 the weighted target and
        key 2 the weighted squared target.
        """
        if found_nodes is None:
            found_nodes = []
        if update_splitter_counts:
            try:
                self._observed_class_distribution[0] += weight
                self._observed_class_distribution[1] += y * weight
                self._observed_class_distribution[2] += y * y * weight
            except KeyError:
                # A key is missing: (re)initialise all three statistics
                self._observed_class_distribution[0] = weight
                self._observed_class_distribution[1] = y * weight
                self._observed_class_distribution[2] = y * y * weight
        child_index = self.instance_child_index(X)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                child.filter_instance_to_leaves(X, y, weight, parent,
                                                parent_branch,
                                                update_splitter_counts,
                                                found_nodes)
            else:
                found_nodes.append(
                    HoeffdingTree.FoundNode(None, self, child_index))
        if self._alternate_tree is not None:
            self._alternate_tree.filter_instance_to_leaves(
                X, y, weight, self, -999, update_splitter_counts, found_nodes)
def _partial_fit(self, X, y):
    """ Train the model on one sample X (private worker where the actual
    training is carried out).

    X is reshaped to a single row and added to the current window. At every
    multiple of self.window_size samples the current window becomes the
    previous window; from the second such boundary on, the ensemble may be
    fitted on the previous window and a drift check selected by
    self.version ("AnomalyRate" or "ADWIN") may rebuild the model.

    NOTE(review): the original indentation was lost; the nesting below is a
    reconstruction - confirm against the full file.

    Parameters
    ----------
    X: numpy.ndarray of shape (1, n_features)
        Instance attributes.
    y: int
        Class label for sample X. Not used here (the method is unsupervised).
    """
    """ Reshape X and add it to our window if it isn't full. If it's full, give window to our precedent_window. If we are at the end our window, fit if we're learning Check the anomaly score of our window Update if self.anomaly_rate > self.drift_threshold """
    X = np.reshape(X,(1,len(X)))

    if self.samples_seen % self.window_size == 0:
        # Window boundary: the current window becomes the previous one and a
        # new window is started with X
        self.prec_window = self.window
        self.window = X
    else:
        self.window = np.concatenate((self.window,X))

    # Window boundary, except for the very first sample
    if self.samples_seen % self.window_size == 0 and self.samples_seen !=0:
        # Fit the ensemble on the previous window while fewer than
        # n_estimators fits have been counted in self.cpt
        if(self.cpt<self.n_estimators):
            self.ensemble.fit(self.prec_window)
            self.cpt += 1

        if(self.version == "AnomalyRate"):
            # Update the current anomaly rate from the previous window
            self.anomaly_rate = self.anomaly_scores_rate(self.prec_window)
            # Rebuild the model if the anomaly rate is greater than the
            # threshold (u in the original paper [3])
            if self.anomaly_rate > self.drift_threshold:
                self.update_model(self.prec_window) # discards the old model completely and creates a new one

        elif(self.version == "ADWIN"):
            print('start adwin version')
            # TODO (Maurras, 04112020): modify the way the concept drift is
            # detected using the ADWIN() function available in scikit-multiflow
            from skmultiflow.drift_detection.adwin import ADWIN
            # A fresh detector is created on each pass through this branch
            adwin = ADWIN()
            prec_window_scores = self.ensemble.anomaly_score(self.prec_window)
            print('Before add element to adwin')
            drift_detected = False
            ind = 0  # position of the current score, used only for logging
            for score in prec_window_scores:
                print("added score = "+ str(score) + " on index = "+ str(ind))
                adwin.add_element(score)
                print('start change detection')
                if adwin.detected_change():
                    print('Change detected on index = '+ str(ind))
                    drift_detected = True
                    # Stop at the first detected change
                    break;
                ind = ind + 1
            if(drift_detected):
                print('start model updating')
                self.update_model(self.prec_window)

    self.samples_seen += 1
import numpy as np
from skmultiflow.drift_detection.adwin import ADWIN
from skmultiflow.drift_detection.eddm import EDDM
import matplotlib.pyplot as plt
import math
from random import gauss

adwin = ADWIN()

# Simulate a data stream of 500 Gaussian samples with the mean and variance
# given below
my_mean = 10
my_variance = 0.1
sigma = math.sqrt(my_variance)
data_stream = [gauss(my_mean, sigma) for _ in range(500)]

# Show the sample mean and the raw stream before running the detector
print(np.mean(data_stream))
plt.plot(data_stream)
plt.show()

# Add the stream elements to ADWIN one by one and report every index at
# which a drift is signalled
for i, value in enumerate(data_stream):
    adwin.add_element(value)
    if not adwin.detected_change():
        continue
    print('Change detected in data: {} - at index: {}'.format(value, i))