def test_ddm():
    """Check DDM's detections on a seeded, synthetic 0/1 stream.

    Two batches of 1000 normal samples (sigma 0.1; mean 0 for the first
    batch, mean 0.5 for the second) are thresholded at zero and cast to
    int, giving a 2000-element stream of 0s and 1s. The test asserts the
    exact indices at which the detector reports a change, and the text
    returned by ``get_info()``.
    """
    ddm = DDM()

    # Seed before drawing so both batches, and hence the expected indices,
    # are reproducible. The two draws must stay in this order.
    np.random.seed(1)
    first_half = np.random.normal(0, 0.1, 1000) > 0
    second_half = np.random.normal(0.5, 0.1, 1000) > 0
    data_stream = np.concatenate((first_half, second_half)).astype(int)

    detected_indices = []
    for index, element in enumerate(data_stream):
        ddm.add_element(element)
        if ddm.detected_change():
            detected_indices.append(index)

    assert detected_indices == [103, 1060]
    assert ddm.get_info() == "DDM(min_num_instances=None, out_control_level=3.0, warning_level=2.0)"
def sim_ddm(input_stream, start_point=0):
    """Run a fresh DDM detector over ``input_stream`` and collect its alarms.

    Args:
        input_stream: Indexable sequence of values passed one by one to
            ``DDM.add_element``.
        start_point: Offset added to every reported position, so indices
            can be expressed relative to a larger stream.

    Returns:
        A ``(detected_warning, change_point)`` tuple of lists holding the
        offset positions at which the warning zone and a change were
        reported, respectively.
    """
    detector = DDM()
    warning_positions = []
    change_positions = []

    for idx in range(len(input_stream)):
        position = idx + start_point
        detector.add_element(input_stream[idx])
        if detector.detected_warning_zone():
            warning_positions.append(position)
        if detector.detected_change():
            change_positions.append(position)

    return warning_positions, change_positions
def ddm_test():
    """Feed ``data/stream_acc.npy`` to a DDM detector and print every alarm.

    Loads the array from disk, passes its entries to the detector in order
    (with a tqdm progress bar) and prints a line whenever the detector
    reports the warning zone or a change.
    """
    ddm = DDM()
    # NOTE(review): presumably the known drift position in this stream;
    # it is not used anywhere below.
    true_occur_position = 4443
    data_stream = np.load("data/stream_acc.npy")

    for i in tqdm(range(data_stream.shape[0])):
        sample = data_stream[i]
        ddm.add_element(sample)
        if ddm.detected_warning_zone():
            print(f'Warning zone has been detected in data: {sample!s} - of index: {i!s}')
        if ddm.detected_change():
            print(f'Change has been detected in data: {sample!s} - of index: {i!s}')
def test_ddm(test_path):
    """Check DDM against the stored ``drift_stream.npy`` fixture.

    The fixture is loaded from ``test_path`` and fed to the detector one
    element at a time; exactly one change is expected, at index 1009.

    NOTE(review): the fixture is described as integers in 0..1 for the
    first half and 0..7 from index 999 to 1999 -- this cannot be confirmed
    from the test itself.
    """
    detector = DDM()
    data_stream = np.load(os.path.join(test_path, 'drift_stream.npy'))

    detected_indices = []
    for idx in range(data_stream.size):
        detector.add_element(data_stream[idx])
        if detector.detected_change():
            detected_indices.append(idx)

    assert detected_indices == [1009]
def run(self):
    """Simulate the experiment by replaying ``self.X`` / ``self.y`` in order.

    Records before ``self.window_size`` are skipped. At the record equal to
    the window size, ``self.init_ofs_ol`` is called; if it fails, the window
    is enlarged by 50 records and initialisation is retried once that many
    records have passed. Every later record is predicted with
    ``self.ol.created_model``, and the running accuracy and the process RSS
    are appended to ``self.prequential_accuracy`` / ``self.memory_usage``.
    A DDM detector is updated after each prediction and, when it reports a
    change, ``self.concept_drift_detection`` is invoked.

    Raises:
        Exception: "OFS could not find features." when initialisation still
            fails after the window has grown beyond four times its starting
            size. The original failure is chained as ``__cause__``.
    """
    print(f"Starting Experiment:{self}")
    try:
        start_window_size = self.window_size
        num_of_correct_predictions, predictions_counter = 0, 0
        ddm = DDM()
        for record in range(self.X.shape[0]):
            x_record = np.array([self.X[record, :]])
            y_record = np.ravel(np.array([self.y[record]]))

            if record < self.window_size:
                # Still aggregating records for the initial window.
                continue
            elif record == self.window_size:
                # First initialisation of feature selection / learner.
                try:
                    self.init_ofs_ol(record)
                except Exception as e:
                    # OFS failed to find features: widen the window and
                    # retry later, unless it has already grown too far.
                    if self.window_size > start_window_size * 4:
                        # Chain the root cause so it is not lost.
                        raise Exception("OFS could not find features.") from e
                    self.window_size += 50
                    logging.info(f"Changed window size from {self.window_size - 50} to {self.window_size}")
                    continue

            # Predict, restricted to the selected features when OFS is used.
            if self.ofs is None:
                model_input = x_record
            else:
                model_input = x_record[:, self.current_selected_features]
            my_pred = self.ol.created_model.predict(model_input)

            predictions_counter += 1
            if y_record[0] == my_pred[0]:
                num_of_correct_predictions += 1
            accuracy = num_of_correct_predictions / predictions_counter

            # NOTE(review): the detector is fed the running accuracy, not a
            # per-sample 0/1 outcome -- confirm this is the intended signal.
            ddm.add_element(accuracy)
            self.prequential_accuracy.append(accuracy)
            self.memory_usage.append(psutil.Process(os.getpid()).memory_info().rss)

            if self.ol.lazy:
                # Lazy models are updated incrementally with each record.
                self.fit_lazy(x_record, y_record)

            if ddm.detected_change():
                self.concept_drift_detection(start_window_size, record)
            elif record != self.X.shape[0] - 1 and self.ofs:
                # No drift: carry the current feature selection forward.
                self.selected_features.append(self.selected_features[-1])
    except Experiment as e:
        # NOTE(review): `Experiment` must be an exception type for this
        # handler to work. If it is the experiment class itself, Python
        # raises TypeError here when any error propagates -- confirm
        # whether `Exception` was intended.
        logging.error(f"Error: {str(e)}")
print(corrects, n_samples)

while n_samples < 20000:
    driftDataX, driftDataY = stream.next_sample()
    my_pred = knn.predict(driftDataX)
    correct = driftDataY[0] == my_pred[0]
    if correct:
        corrects += 1
    n_samples += 1

    # Every detector below receives the same error signal (0 = correct,
    # 1 = wrong) and logs the sample count at which it reports a change.
    # The pairs are rebuilt each pass and visited in the original order.
    for detector, hits in (
        (adwin, adwin_results),
        (ddm, ddm_results),
        (ph1, ph1_results),
        (ph2, ph2_results),
    ):
        detector.add_element(0 if correct else 1)
        if detector.detected_change():
            hits.append(n_samples)

    # kswin1 is fed the running accuracy rather than the 0/1 error.
    kswin1.add_element(corrects / n_samples)
mine_threshold, pred_grace_ht, pred_grace_ht_p = [], [], []
ht_p = None
ML_accuracy = 0
ddm = DDM()
h = hpy()

while elec_stream.has_more_samples():
    n_global += 1
    X_test, y_test = elec_stream.next_sample()
    y_predict = ht.predict(X_test)

    # Time the detector update together with the accuracy bookkeeping.
    ddm_start_time = time.time()
    ddm.add_element(y_test != y_predict)
    if y_test == y_predict:
        ML_accuracy += 1
    ddm_running_time = time.time() - ddm_start_time
    RT_ddm.append(ddm_running_time)

    if n_global > grace_end:
        if n_global > detect_end:
            if ht_p is not None:
                drift_point = detect_end - 2 * grace
                accuracy_ht = np.mean(pred_grace_ht)
                accuracy_ht_p = np.mean(pred_grace_ht_p)
                print("Accuracy of ht: " + str(accuracy_ht))
                print("Accuracy of ht_p: " + str(accuracy_ht_p))
                if accuracy_ht_p > accuracy_ht:
                    # The candidate model scored higher over the grace
                    # period: record a true positive and switch to it.
                    print("TP detected at: " + str(drift_point))
                    TP_ddm.append(drift_point)
                    ht = ht_p
                else:
                    print("FP detected at: " + str(drift_point))
adwin = ADWIN()
eddm = EDDM()

# Simulate a data stream of 200 random integers, each 0 or 1
data_stream = np.random.randint(2, size=200)

# Change the data concept: indices 100-149 are overwritten with random
# integers in [4, 8)
for i in range(100, 150):
    data_stream[i] = np.random.randint(4, high=8)

# Plot the stream, then feed it to DDM and mark warnings (green) and
# detected changes (red) with vertical lines
plt.plot(data_stream)
fig = plt.gcf()
fig.set_size_inches(10, 5.5)
plt.ylabel('value')
plt.xlabel('Time')
for i, element in enumerate(data_stream):
    ddm.add_element(element)
    if ddm.detected_warning_zone():
        plt.axvline(i, color='g', linestyle='--', linewidth=0.7)
    if ddm.detected_change():
        plt.axvline(i, color='r', linestyle='--', linewidth=0.7)
plt.show()

# Same plot set-up again, this time for EDDM
plt.plot(data_stream)
fig = plt.gcf()
fig.set_size_inches(10, 5.5)
plt.ylabel('value')
plt.xlabel('Time')
for i in range(200):
    eddm.add_element(data_stream[i])
# Simulate a data stream of 1000 samples from a standard normal distribution
stream = np.random.randn(1000)
stream[:10]
## Output-
#array([-1.0856306 ,  0.99734545,  0.2829785 , -1.50629471, -0.57860025,
#        1.65143654, -2.42667924, -0.42891263,  1.26593626, -0.8667404 ])

# Change the data concept: indices 299-599 are overwritten with random
# integers in [5, 9)
for j in range(299, 600):
    stream[j] = np.random.randint(5, high=9)

# Add the stream elements to the detector one at a time, reporting every
# detected drift and every warning
for j, sample in enumerate(stream):
    d2m.add_element(sample)
    if d2m.detected_change():
        print(f'Concept drift detected in data: {sample!s} - at index: {j!s}')
    if d2m.detected_warning_zone():
        print(f'Warning detected in data: {sample!s} - at index: {j!s}')
### Output:
#Concept drift detected in data: 1.0693159694243486 - at index: 55
#Concept drift detected in data: 2.0871133595881854 - at index: 88
#Concept drift detected in data: 0.8123413299768204 - at index: 126
#Warning detected in data: 1.3772574828673068 - at index: 158
#Warning detected in data: -0.1431759743261871 - at index: 159
#Warning detected in data: 0.02031599823462459 - at index: 160
#Warning detected in data: -0.19396387055266243 - at index: 161
def _append_to_drift_log(text):
    """Append ``text`` to ``drifts.txt``, closing the file even if the write fails."""
    with open('drifts.txt', 'a+') as log_file:
        log_file.write(text)


def _score_detector(totals, drifts, n_samples):
    """Score one detector's drift indices, add the counts to ``totals`` and print them.

    All detectors share this helper so each running total is updated from
    its own counter. The former inline DDM code added ``tn`` to the
    false-negative total.
    """
    f1, tp, fp, tn, fn = confusion_matrix_stats(drifts, n_samples)
    totals['tp'] += tp
    totals['tn'] += tn
    totals['fp'] += fp
    totals['fn'] += fn
    print(f'F1-Score: {f1}')
    print(f'{tp} true positives, {fp} false positives')
    print(f'{tn} true negatives, {fn} false negatives')


def _print_overall(title, totals):
    """Print the confusion counts and F1 accumulated for one detector over all streams."""
    tp, fp, tn, fn = totals['tp'], totals['fp'], totals['tn'], totals['fn']
    print(50 * '-')
    print(f'{title}\n')
    print(f'Overall F1: {calc_f1(tp, fp, tn, fn)}')
    print(f'{tp} true positives, {fp} false positives')
    print(f'{tn} true negatives, {fn} false negatives')
    print(50 * '-')


def main():
    """Compare six drift detectors on every stream in ``streams``.

    For each stream, ``RANGE`` samples are drawn. Each sample is tiled
    ``multiply`` times along the feature axis, and a NaiveBayes model is
    evaluated then updated on it, yielding a 0/1 error signal. The stored
    samples are then replayed through Kernel-SWMEBWIN and SWMEBWIN
    (sample + label), ADWIN and KSWIN (one detector per feature) and
    EDDM and DDM (error signal). For each detector the run is timed, the
    detected indices are scored with ``confusion_matrix_stats``, and a
    line is appended to ``drifts.txt``. After all streams the accumulated
    counts and the overall F1 per detector are printed.
    """
    totals = {
        name: {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
        # This order is the order of the final report.
        for name in ('K-SWMEBWIN', 'SWMEBWIN', 'KSWIN', 'ADWIN', 'DDM', 'EDDM')
    }

    for stream in streams:
        print(stream.name)
        _append_to_drift_log(f'**{stream.name}**\n\n')
        stream.prepare_for_use()
        stream.next_sample()

        adwin = []
        kswin = []
        ddm = DDM(min_num_instances=30)
        eddm = EDDM()
        data = []
        labels = []
        predictions = []
        kswin_drifts = []
        adwin_drifts = []
        k_swmebwin_drifts = []
        swmebwin_drifts = []
        eddm_drifts = []
        ddm_drifts = []
        swmebwin = SWMEBWIN(classes=stream.target_values, w_size=80, epsilon=0.05)
        # gamma maybe 1.0 / stream.current_sample_x.shape[1]
        k_swmebwin = Kernel_SWMEBWIN(classes=stream.target_values, w_size=80, epsilon=0.05)

        RANGE = 1000000
        DIM = 50
        n_rand_dims = DIM - stream.current_sample_x.size
        multiply = n_rand_dims // stream.current_sample_x.size

        # Partial fit -> pretrain on the first sample.
        # NOTE(review): the buffer is reset on every pass, so pretraining
        # sees a single copy of the sample while the main loop below tiles
        # it `multiply` times -- confirm which is intended.
        for _m in range(multiply):
            current_sample_x = np.array([[]])
            current_sample_x = np.concatenate(
                (current_sample_x, stream.current_sample_x), axis=1)
        bayes = NaiveBayes()
        bayes.partial_fit(np.array(current_sample_x), list(stream.current_sample_y.ravel()))

        # One ADWIN and one KSWIN instance per (potential) feature dimension.
        for j in range(DIM):
            adwin.append(ADWIN(delta=0.002))
            kswin.append(KSWIN(w_size=300, stat_size=30, alpha=0.0001))

        # Add dims: tile each sample, test-then-train the classifier and
        # keep the sample, its label and the 0/1 prediction error.
        for i in range(RANGE):
            current_sample_x = np.array([[]])
            for _m in range(multiply):
                current_sample_x = np.concatenate(
                    (current_sample_x, stream.current_sample_x), axis=1)
            data.append(current_sample_x.ravel())
            labels.append(stream.current_sample_y.ravel()[0])
            predictions.append(0 if bayes.predict(current_sample_x) == labels[i] else 1)
            bayes.partial_fit(current_sample_x, list(stream.current_sample_y.ravel()))
            stream.next_sample()

        # Kernel SWMEBWIN
        start = time.time()
        for i in range(RANGE):
            k_swmebwin.add_element(value=data[i], label=labels[i])
            if k_swmebwin.change_detected is True:
                k_swmebwin_drifts.append(i)
        end = time.time() - start
        _score_detector(totals['K-SWMEBWIN'], k_swmebwin_drifts, RANGE)
        _append_to_drift_log(
            f'K-SWMEB detected {len(k_swmebwin_drifts)} drifts in {time.time() - start} {k_swmebwin_drifts}\n\n')
        print(f'K-SW-MEBWIN took {end} sec and detected {len(k_swmebwin_drifts)} drifts\n')

        # SWMEBWIN
        start = time.time()
        for i in range(RANGE):
            swmebwin.add_element(value=data[i], label=labels[i])
            if swmebwin.change_detected is True:
                swmebwin_drifts.append(i)
        end = time.time() - start
        _score_detector(totals['SWMEBWIN'], swmebwin_drifts, RANGE)
        _append_to_drift_log(
            f'SWMEB detected {len(swmebwin_drifts)} drifts in {time.time() - start} {swmebwin_drifts}\n\n')
        print(f'SW-MEBWIN took {end} sec and detected {len(swmebwin_drifts)} drifts\n')

        # ADWIN: every per-feature detector must see its value, so do not
        # stop at the first one that fires.
        start = time.time()
        for i in range(RANGE):
            adwin_detected = False
            for j in range(data[i].size):
                adwin[j].add_element(data[i][j])
                if adwin[j].detected_change():
                    adwin_detected = True
            if adwin_detected is True:
                adwin_drifts.append(i)
        end = time.time() - start
        _score_detector(totals['ADWIN'], adwin_drifts, RANGE)
        _append_to_drift_log(
            f'ADWIN detected {len(adwin_drifts)} drifts in {time.time() - start} at {adwin_drifts}\n\n')
        print(f'ADWIN took {end} sec and detected {len(adwin_drifts)} drifts\n')

        # KSWIN: same per-feature scheme as ADWIN.
        start = time.time()
        for i in range(RANGE):
            kswin_detected = False
            for j in range(data[i].size):
                kswin[j].add_element(data[i][j])
                if kswin[j].detected_change():
                    kswin_detected = True
            if kswin_detected is True:
                kswin_drifts.append(i)
        end = time.time() - start
        _score_detector(totals['KSWIN'], kswin_drifts, RANGE)
        _append_to_drift_log(
            f'KSWIN detected {len(kswin_drifts)} drifts in {time.time() - start} at {kswin_drifts}\n\n')
        print(f'KSWIN took {end} sec and detected {len(kswin_drifts)} drifts\n')

        # EDDM: driven by the classifier's 0/1 error signal.
        start = time.time()
        for i in range(RANGE):
            eddm.add_element(predictions[i])
            if eddm.detected_change():
                eddm_drifts.append(i)
        end = time.time() - start
        _score_detector(totals['EDDM'], eddm_drifts, RANGE)
        _append_to_drift_log(
            f'EDDM detected {len(eddm_drifts)} drifts in {time.time() - start} at {eddm_drifts}\n\n')
        print(f'EDDM took {end} sec and detected {len(eddm_drifts)} drifts\n')

        # DDM: driven by the classifier's 0/1 error signal.
        start = time.time()
        for i in range(RANGE):
            ddm.add_element(predictions[i])
            if ddm.detected_change():
                ddm_drifts.append(i)
        end = time.time() - start
        _score_detector(totals['DDM'], ddm_drifts, RANGE)
        _append_to_drift_log(
            f'DDM detected {len(ddm_drifts)} drifts in {time.time() - start} at {ddm_drifts}\n\n')
        print(f'DDM took {end} sec and detected {len(ddm_drifts)} drifts\n')

    # OVERALL STATISTICS
    for name, detector_totals in totals.items():
        _print_overall(name, detector_totals)