def test_page_hinkley(test_path):
    """
    Page-Hinkley drift detection test.

    The first half of the stream contains a sequence corresponding to a
    normal distribution of integers from 0 to 1. From index 999 to 1999 the
    sequence is a normal distribution of integers from 0 to 7.

    The stream is loaded from ``drift_stream.npy`` inside ``test_path`` and
    fed one element at a time to a default-configured ``PageHinkley``
    detector; the indices at which a change is reported must match the
    reference list exactly.
    """
    detector = PageHinkley()
    stream_path = os.path.join(test_path, 'drift_stream.npy')
    samples = np.load(stream_path)

    # Reference positions at which the default detector reports a change.
    reference = [
        28, 57, 86, 115, 145, 174, 203, 232, 262, 292, 322, 352, 382, 411,
        441, 471, 500, 530, 560, 589, 618, 648, 678, 708, 737, 767, 796, 826,
        856, 885, 914, 943, 973, 1002, 1031, 1060, 1090, 1120, 1150, 1179,
        1208, 1237, 1266, 1295, 1325, 1354, 1383, 1413, 1443, 1472, 1502,
        1532, 1562, 1591, 1620, 1649, 1678, 1708, 1738, 1768, 1798, 1828,
        1857, 1887, 1916, 1946, 1975,
    ]

    flagged = []
    for position in range(samples.size):
        detector.add_element(samples[position])
        if not detector.detected_change():
            continue
        flagged.append(position)

    assert flagged == reference
def _collect_drift_rows(values, drift_detector, window):
    """Feed ``values`` to ``drift_detector`` and return the post-baseline drift rows.

    Every value is converted to ``float`` and passed to ``add_element``. A
    change reported at position ``p`` is kept only when ``p >= window`` and is
    recorded as ``p - window``, i.e. relative to the first row that follows
    the baseline.
    """
    drift_rows = []
    for position, value in enumerate(values):
        drift_detector.add_element(float(value))
        if drift_detector.detected_change() and position >= window:
            drift_rows.append(position - window)
    return drift_rows


def perform_drift_detection(predict_dataframe, dataframe, feature_names,
                            detector, drift_notification, token="") -> str:
    """Run a per-feature drift detector over baseline + prediction data.

    The rows of ``dataframe`` (baseline) are followed by the rows of
    ``predict_dataframe``; for each feature a fresh detector is fed the whole
    column, and only changes flagged in the prediction part are reported.

    Parameters
    ----------
    predict_dataframe : DataFrame whose rows are checked for drift.
    dataframe : DataFrame providing the baseline rows.
    feature_names : column names used for the combined data.
    detector : one of ``"HDDM"``, ``"Page Hinkley"`` or ``"ADWIN"``.
    drift_notification : when truthy, a web notification is submitted for
        every feature in which drift was detected.
    token : value forwarded to ``log``, ``get_token_user`` and
        ``submit_web_notification``.

    Returns
    -------
    str
        JSON object mapping each drifted feature to the list of row indices
        (relative to the prediction data) at which a change was reported.
        An unsupported ``detector`` yields an empty JSON object.
    """
    log("[INFO] Calling perform_drift_detection", token)
    log("[INFO] Selected data drift detection method: " + detector)

    baseline_data = dataframe.values.tolist()
    predict_data = predict_dataframe.values.tolist()
    overall_dataframe = pd.DataFrame(baseline_data + predict_data,
                                     columns=feature_names)

    drifts = dict()
    window = len(baseline_data)

    # One table instead of three copy-pasted loops.
    detector_classes = {
        "HDDM": HDDM_W,
        "Page Hinkley": PageHinkley,
        "ADWIN": ADWIN,
    }
    detector_class = detector_classes.get(detector)
    if detector_class is None:
        # Previously this case silently produced an empty result.
        log("[ERROR] Unsupported data drift detection method: " + detector,
            token)
        return json.dumps(drifts, cls=NpEncoder)

    for feature in feature_names:
        # A new detector per feature so that no state leaks between columns.
        detected_drifts_indices = _collect_drift_rows(
            overall_dataframe[feature], detector_class(), window)

        if not detected_drifts_indices:
            continue

        log("[INFO] Data drift detected in feature: " + feature)
        log("[INFO] The drifted rows are: " + str(detected_drifts_indices))
        drifts[feature] = detected_drifts_indices

        if drift_notification:
            log("[INFO] Sending a web notification", token)
            message = ("MaaS data drift detected from " + get_token_user(token)
                       + " (" + token + ")")
            if submit_web_notification(message, token):
                log("[INFO] Web notification sent!")
            else:
                log("[ERROR] Error occurred while sending a web notification")

    return json.dumps(drifts, cls=NpEncoder)
def test_page_hinkley(test_path):
    """
    Page-Hinkley drift detection test.

    The first half of the stream contains a sequence corresponding to a
    normal distribution of integers from 0 to 1. From index 999 to 1999 the
    sequence is a normal distribution of integers from 0 to 7.

    Each element of ``drift_stream.npy`` (looked up under ``test_path``) is
    added to a default ``PageHinkley`` instance, and every index at which the
    detector reports a change is compared against the expected indices.
    """
    page_hinkley = PageHinkley()
    values = np.load(os.path.join(test_path, 'drift_stream.npy'))

    change_points = []
    for idx in range(values.size):
        page_hinkley.add_element(values[idx])
        if page_hinkley.detected_change():
            change_points += [idx]

    # All expected changes lie after the distribution switch around index 999.
    assert change_points == [1013, 1335, 1505, 1758]
def initialize_detectors(detector_type):
    """Return a newly constructed, pre-configured drift detector.

    Only the requested detector is instantiated; the other configurations
    are kept as zero-argument factories and never built.

    Parameters
    ----------
    detector_type : str
        One of ``'HDDDM'``, ``'HDDDM_diff'``, ``'STEPD'``, ``'MK'``,
        ``'MK_diff'``, ``'ADWIN'``, ``'PH'`` or ``'PH_diff'``.

    Raises
    ------
    KeyError
        If ``detector_type`` is not one of the names above.
    """
    # note PH test uses differenced raw data! [168,24]
    # note MK_diff test uses differenced raw data! [168,24]
    # Note: HDDDM_diff is configured exactly like "HDDDM" but it is important
    # to name it differently for the retrain function, which looks for the
    # "diff" term in the name.
    min_instances = 3 * 4 * 168

    detector_factories = {
        'HDDDM': lambda: HDDDM(min_instances, gamma=1.5),
        'HDDDM_diff': lambda: HDDDM(min_instances, gamma=1.5),
        'STEPD': lambda: STEPD(min_instances),
        'MK': lambda: MannKendall(min_instances=min_instances,
                                  instances_step=168,
                                  test_type='seasonal',
                                  alpha=0.01,
                                  period=52,
                                  slope_threshold=0.05),
        'MK_diff': lambda: MannKendall(min_instances=min_instances,
                                       instances_step=168,
                                       test_type='original_mk',
                                       alpha=0.05,
                                       slope_threshold=0.00),
        'ADWIN': lambda: ADWIN(delta=0.0007),
        'PH': lambda: PageHinkley(min_instances=min_instances,
                                  threshold=700,
                                  delta=900),
        'PH_diff': lambda: PageHinkley(min_instances=min_instances,
                                       threshold=1200,
                                       delta=1000),
    }
    # Lookup first (KeyError for unknown names), then build just that one.
    return detector_factories[detector_type]()
def skmultiflow_detector(drift_detector_type: str) -> BaseDriftDetector:
    """Instantiate the drift detector selected by ``drift_detector_type``.

    Recognised values are ``"SKMULTIFLOW_EDDM"``,
    ``"SKMULTIFLOW_PageHinkley"``, ``"SKMULTIFLOW_DDM"`` and
    ``"SKMULTIFLOW_ADWIN"``; each returns a detector built with its default
    constructor arguments.

    Raises:
        Exception: if ``drift_detector_type`` is none of the values above.
    """
    if drift_detector_type == "SKMULTIFLOW_EDDM":
        return EDDM()
    if drift_detector_type == "SKMULTIFLOW_PageHinkley":
        return PageHinkley()
    if drift_detector_type == "SKMULTIFLOW_DDM":
        return DDM()
    if drift_detector_type == "SKMULTIFLOW_ADWIN":
        return ADWIN()
    raise Exception("Drift detector %s not implemented" % drift_detector_type)
# Parameter grids for the two Page-Hinkley variants built in the loop below:
# entry i of ph_param1 is used as `threshold`, entry i of ph_param2 as `delta`.
ph_param1 = [25, 50, 75]
ph_param2 = [0.005, 0.01, 0.02]
knn = KNNClassifier()
stream = driftStreams[0]
# One pass per parameter setting (three entries in each grid).
# NOTE(review): adwin_param, ddm_param and ks_param1 are defined outside this
# excerpt; presumably they are three-element lists as well -- confirm.
for i in range(0, 3):
    # Draw 2000 samples and incrementally fit the shared classifier on them.
    trainX, trainY = stream.next_sample(2000)
    knn.partial_fit(trainX, trainY)
    # Fresh detectors for this pass, each taking the i-th value of its grid.
    adwin = ADWIN(delta=adwin_param[i])
    ddm = DDM(out_control_level=ddm_param[i])
    kswin1 = KSWIN(window_size=ks_param1[i])
    # kswin2 = KSWIN(stat_size=ks_param2[i])
    ph1 = PageHinkley(threshold=ph_param1[i])
    ph2 = PageHinkley(delta=ph_param2[i])
    # Per-detector result buffers, reset on every pass. kswin2_results is
    # still created although the kswin2 detector above is commented out.
    adwin_results = []
    ddm_results = []
    kswin1_results = []
    kswin2_results = []
    ph1_results = []
    ph2_results = []
    n_samples = 0
    corrects = 0
    coldstartData = []
    # NOTE(review): the remainder of this loop body lies outside the excerpt;
    # whatever advances n_samples is not visible here.
    while n_samples < 2000:
        X, y = stream.next_sample()
return stream def drift_flow(stream, method, name, beginning_stream, end_tables): detected_change = [] detected_warning = [] number_of_changes = 0 for i in range(len(stream)): method.add_element(stream[i]) if method.detected_warning_zone(): print(f'Warning zone has been detected in data: {stream[i]} - of index: {i}') detected_warning.append((stream[i])) if method.detected_change(): detected_change.append(stream[i]) print(f'Change has been detected in data: {stream[i]} - of index: {i}') number_of_changes += 1 else: detected_change.append(None) print(f'{name} Detected changes: {number_of_changes}') print(f'{name} Detected warning zones: {str(len(detected_warning))}') plots(stream, detected_change, name, beginning_stream, end_tables) stream = make_stream(PATH) drift_flow(stream, EDDM(), 'EDDM', 0, 500) drift_flow(stream, HDDM_A(), 'HDDM_A', 0, 500) drift_flow(stream, HDDM_W(), 'HDDM_W', 0, 500) drift_flow(stream, PageHinkley(), 'PH', 0, 500) drift_flow(stream, DDM(), 'DDM', 0, 500)
# Continuation of an if/elif chain (its head is outside this excerpt) that
# picks the drift detector named by DETECTOR and appends that name to
# nama_model. NOTE(review): "nama_model" presumably means "model name".
elif DETECTOR == "HDDM_A":
    print ("HDDM_A")
    nama_model = nama_model+DETECTOR
    detector = HDDM_A()
elif DETECTOR == "HDDM_W":
    print ("HDDM_W")
    nama_model = nama_model+DETECTOR
    detector = HDDM_W()
elif DETECTOR == "KSWIN":
    print ("KSWIN")
    nama_model = nama_model+DETECTOR
    detector = KSWIN()
elif DETECTOR == "PageHinkley":
    print ("PageHinkley")
    nama_model = nama_model+DETECTOR
    detector = PageHinkley()
elif DETECTOR =="KD3":
    # KD3 is the only detector here configured from command-line arguments;
    # unlike the branches above it does not print its name.
    nama_model = nama_model+DETECTOR
    detector= KD3(window_size=args.window_size, accumulative_threshold=args.p2, detection_threshold=args.p1,bandwidth=0.75)
else:
    # No recognised detector name: nama_model is left unchanged.
    detector=None
# Replace the distinct values of the 'label' column, in order of first
# appearance, with the integers 0..n-1 (in place).
labels = train_dataset['label'].unique().tolist()
mapping = dict( zip(labels,range(len(labels))) )
train_dataset.replace({'label': mapping},inplace=True)
# Strip the "final_800_" and ".pickle" substrings from the dataset argument.
ds = args.dataset
ds = ds.replace("final_800_", "")
ds = ds.replace(".pickle", "")
def test_on_data_set(data_desc, D):
    """Evaluate several drift detectors on one data set.

    ``D["data"]`` supplies ``(X, Y)`` (both indexed as 2-D arrays) and
    ``D["drifts"]`` the reference drifts passed to ``evaluate``. The first
    200 rows are used for training; the rest is streamed to each detector.
    Two unsupervised detectors (HDDDM, SWIDD) see features and labels
    concatenated column-wise, while four supervised ones (EDDM, DDM, ADWIN,
    PageHinkley) wrap a Gaussian naive Bayes classifier.

    Returns a dict ``{data_desc: {detector_name: [scores]}}`` with one score
    entry per detector.
    """
    detector_names = ("HDDDM", "SWIDD", "EDDM", "DDM", "ADWIN", "PageHinkley")
    r = {data_desc: {detector_name: [] for detector_name in detector_names}}

    training_buffer_size = 100  # Size of training buffer of the drift detector
    n_train = 200  # Initial training set size

    concept_drifts = D["drifts"]
    X, Y = D["data"]
    data_stream = np.concatenate((X, Y.reshape(-1, 1)), axis=1)

    # Training portion
    X0, Y0 = X[:n_train, :], Y[:n_train, :]
    data0 = data_stream[:n_train, :]
    # Test portion
    X_next, Y_next = X[n_train:, :], Y[n_train:, :]
    data_next = data_stream[n_train:, :]

    def record(detector_name, changes_detected):
        # Score the detected changes against the reference drifts and store them.
        r[data_desc][detector_name].append(
            evaluate(concept_drifts, changes_detected))

    def run_unsupervised(detector_name, drift_detector, batch_size):
        runner = DriftDetectorUnsupervised(drift_detector,
                                           batch_size=batch_size)
        record(detector_name, runner.apply_to_stream(data_next))

    def run_supervised(detector_name, drift_detector, flip_score):
        clf = Classifier(model)
        if flip_score:
            clf.flip_score = True
        clf.fit(X0, Y0.ravel())
        runner = DriftDetectorSupervised(
            clf=clf,
            drift_detector=drift_detector,
            training_buffer_size=training_buffer_size)
        record(detector_name, runner.apply_to_stream(X_next, Y_next))

    # Run unsupervised drift detectors
    run_unsupervised("HDDDM", HDDDM(data0, gamma=None, alpha=0.005), 50)
    run_unsupervised("SWIDD",
                     SWIDD(max_window_size=300, min_window_size=100), 1)

    # Run supervised drift detectors; the same model object is wrapped anew
    # for every detector.
    model = GaussianNB()
    run_supervised("EDDM", EDDM(), True)
    run_supervised(
        "DDM",
        DDM(min_num_instances=30, warning_level=2.0, out_control_level=3.0),
        True)
    # ADWIN is the only supervised run that does not set flip_score.
    run_supervised("ADWIN", ADWIN(delta=2.), False)
    run_supervised("PageHinkley", PageHinkley(), True)

    return r
# Stream elements are added to DDM and checking whether drift occured for j in range(1000): HW.add_element(stream[j]) if HW.detected_change(): print('Concept drift detected in data: ' + str(stream[j]) + ' - at index: ' + str(j)) if HW.detected_warning_zone(): print('Warning detected in data: ' + str(stream[j]) + ' - at index: ' + str(j)) # page hinkley test import numpy as np from skmultiflow.drift_detection import PageHinkley # Initialize the PageHinkley object ph = PageHinkley() # set seed for reproducibility np.random.seed(123) # Simulate a data stream of size 1000 from a normal distribution # with mean=0 and standard deviation=0.1 stream = np.random.normal(0, 0.1, 1000) # Data concept are changed from index 299 to 799 for j in range(299, 800): stream[j] = np.random.randint(5, high=9) # Adding stream elements to the PageHinkley drift detector and verifying if drift occurred for j in range(1000): ph.add_element(stream[j])