def test_naive_bayes(test_path):
    """Regression test for NaiveBayes.

    Runs a prequential (test-then-train) pass over 5000 SEA samples,
    checking predictions/probabilities every 100 samples against stored
    expectations, then verifies the info string and a batch fit/score.

    Fix: ``np.alltrue`` was deprecated in NumPy 1.25 and removed in 2.0;
    use ``np.all`` instead (identical semantics).
    """
    stream = SEAGenerator(random_state=1)
    stream.prepare_for_use()
    learner = NaiveBayes()

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,
        0, 1, 1, 1
    ])
    # np.alltrue is deprecated (removed in NumPy 2.0); np.all is equivalent.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'data_naive_bayes_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = 'NaiveBayes: nominal attributes: [] - '
    assert learner.get_info() == expected_info

    learner.reset()
    learner.fit(X=np.array(X_batch[:4500]), y=np.array(y_batch[:4500]))

    expected_score = 0.9378757515030061
    assert np.isclose(
        expected_score,
        learner.score(X=np.array(X_batch[4501:]), y=np.array(y_batch[4501:])))

    assert 'estimator' == learner.get_class_type()
    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
def test_clone():
    """Cloning a trained NaiveBayes must return an *untrained* copy.

    Trains a learner prequentially on 5000 SEA samples, clones it, and
    asserts the clone's class-distribution counters are empty while the
    original's are not.
    """
    stream = SEAGenerator(random_state=1)
    learner = NaiveBayes()

    max_samples = 5000
    wait_samples = 100
    y_pred = array('i')
    X_batch, y_batch, y_proba = [], [], []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=[0, 1])

    cloned = clone(learner)

    assert learner._observed_class_distribution != {} and cloned._observed_class_distribution == {}
class Bayes(IncrementalClassifier):
    """Thin incremental-classifier adapter around NaiveBayes."""

    def __init__(self):
        super().__init__()
        self.clf = NaiveBayes()

    def partial_fit(self, one_row):
        # one_row is a (features, label) pair; wrap each part in a
        # single-element batch as partial_fit expects sequences.
        features, label = one_row[0], one_row[1]
        self.clf.partial_fit([features], [label])

    def predict(self, x):
        return self.clf.predict(x)
class LDDDSDA(BaseDistributionDetector):
    """Local Drift Degree based drift detector with density resampling.

    Maintains a training window and a buffer window of equal size ``w``;
    once the buffer fills, ``ldd_dis`` compares the two windows' local
    densities (kNN-based) and, if drift is flagged, resamples a new
    training set and refits the base learner.

    Fixes vs. the original:
    - ``base_learner=NaiveBayes()`` was a mutable default evaluated once at
      definition time and therefore shared across instances; replaced with
      a ``None`` sentinel (backward-compatible).
    - In ``ldd_dis`` the final resampling sliced ``d1_inc`` where the
      symmetric pattern (and the two sibling lines) requires ``d2_inc``.
    - Removed a stray debug ``print`` and dead commented-out code.
    """

    def __init__(self, batch_size=100, train_size=100, rho=0.1, alpha=0.05,
                 base_learner=None):
        super().__init__()
        self.w = batch_size
        # Sentinel default: a fresh NaiveBayes per instance, never shared.
        self.l = base_learner if base_learner is not None else NaiveBayes()
        self.n = train_size
        self.alpha = alpha
        self.rho = rho
        self.trained = False
        self.d_train_X, self.d_train_y = [], []
        self.d_buffer_X, self.d_buffer_y = [], []
        self.reset()

    def reset(self):
        super().reset()

    def add_element(self, X, y):
        """Feed one (X, y) sample; may trigger drift detection/refit."""
        if self.in_concept_change:
            self.reset()

        X, y = np.asarray(X), np.asarray(y)

        # Phase 1: accumulate the initial training set of n samples,
        # then fit the base learner once.
        if (not self.trained) and len(self.d_train_X) < self.n:
            self.d_train_X.append(X)
            self.d_train_y.append(y)
            if len(self.d_train_X) == self.n:
                self.l.partial_fit(np.asarray(self.d_train_X),
                                   np.asarray(self.d_train_y))
                self.trained = True
            return

        # Phase 2: top the training window up to w samples.
        if len(self.d_train_X) < self.w:
            self.d_train_X.append(X)
            self.d_train_y.append(y)
            return

        # Phase 3: fill the buffer window; compare once it holds w samples.
        self.d_buffer_X.append(X)
        self.d_buffer_y.append(y)
        if len(self.d_buffer_X) < self.w:
            return

        self.d_train_X, self.d_train_y = self.ldd_dis(
            np.asarray(self.d_train_X), np.asarray(self.d_train_y),
            np.asarray(self.d_buffer_X), np.asarray(self.d_buffer_y))
        # Refit a fresh learner on the resampled training window.
        self.l = NaiveBayes()
        self.l.fit(self.d_train_X, self.d_train_y)
        self.d_train_X = self.d_train_X.tolist()
        self.d_train_y = self.d_train_y.tolist()
        self.d_buffer_X = []
        self.d_buffer_y = []
        return

    def predict(self, X):
        """Predict with the current base learner."""
        return self.l.predict(X)

    def ldd_dis(self, d1_X, d1_y, d2_X, d2_y):
        """Local-drift-degree comparison of windows d1 (old) and d2 (new).

        Returns the (possibly resampled) training data. Sets
        ``self.in_concept_change`` when drift is detected.
        """
        d = np.append(d1_X, d2_X, axis=0)
        d_y = np.append(d1_y, d2_y, axis=0)
        d1_dec, d1_sta, d1_inc = [], [], []
        d2_dec, d2_sta, d2_inc = [], [], []

        # kNN neighbourhood (rho fraction of the pooled data) per sample.
        kdtree = KDTree(d)
        n_neighbors = int(d.shape[0] * self.rho)
        d_knn = [set(kdtree.query(X=d[i:i + 1], k=n_neighbors,
                                  return_distance=False)[0])
                 for i in range(d.shape[0])]

        # Null distribution of the degree statistic under a random split.
        indexes = np.arange(d.shape[0])
        np.random.shuffle(indexes)
        _d1 = set(indexes[:d1_X.shape[0]])
        _d2 = set(indexes[d1_X.shape[0]:])
        deltas = []
        for i in range(d.shape[0]):
            x1 = len(d_knn[indexes[i]] & _d1)
            x2 = len(d_knn[indexes[i]] & _d2)
            if i < d1_X.shape[0]:
                deltas.append(x2 / x1 - 1)
            else:
                deltas.append(x1 / x2 - 1)
        delta_std = np.std(deltas, ddof=1)
        theta_dec = stats.norm.ppf(1 - self.alpha, 0, delta_std)
        theta_inc = stats.norm.ppf(self.alpha, 0, delta_std)

        # Classify each sample's local density change on the true split.
        _d1 = set(np.arange(d1_X.shape[0]))
        _d2 = set(np.arange(d1_X.shape[0], d.shape[0]))
        for i in range(d.shape[0]):
            x1 = len(d_knn[i] & _d1)
            x2 = len(d_knn[i] & _d2)
            if i < d1_X.shape[0]:
                delta = x2 / x1 - 1
                if delta < theta_dec:
                    d1_dec.append(i)
                elif delta > theta_inc:
                    d1_inc.append(i)
                else:
                    d1_sta.append(i)
            else:
                delta = x1 / x2 - 1
                if delta < theta_dec:
                    d2_dec.append(i)
                elif delta > theta_inc:
                    d2_inc.append(i)
                else:
                    d2_sta.append(i)

        # No decreased-density old samples and no increased-density new
        # samples -> no drift; keep the old training window.
        if len(d1_dec) == 0 and len(d2_inc) == 0:
            return d1_X, d1_y

        self.in_concept_change = True

        # Mixing ratio k: smallest per-category old/new size ratio.
        aux = []
        if len(d2_dec) != 0:
            aux.append(len(d1_inc) / len(d2_dec))
        if len(d2_sta) != 0:
            aux.append(len(d1_sta) / len(d2_sta))
        if len(d2_inc) != 0:
            aux.append(len(d1_dec) / len(d2_inc))
        k = min(aux)

        d2_dec += d1_inc[:int(k * len(d2_dec))]
        d2_sta += d1_sta[:int(k * len(d2_sta))]
        d2_inc += d1_dec[:int(k * len(d2_inc))]

        # Scale the merged pool back down to window size w.
        aux_indexes = d2_inc + d2_sta + d2_dec
        r = self.w / len(aux_indexes)
        d2_dec = d2_dec[:int(len(d2_dec) * r)]
        d2_sta = d2_sta[:int(len(d2_sta) * r)]
        # BUG FIX: original sliced d1_inc here instead of d2_inc,
        # breaking the symmetric resampling of the increased-density set.
        d2_inc = d2_inc[:int(len(d2_inc) * r)]
        aux_indexes = d2_inc + d2_sta + d2_dec
        return d[aux_indexes], d_y[aux_indexes]
def main():
    """Benchmark drift detectors (K-SWMEBWIN, SWMEBWIN, ADWIN, KSWIN, EDDM,
    DDM) over every stream in ``streams``, logging detections to
    ``drifts.txt`` and printing per-stream and overall confusion stats.

    Fixes vs. the original:
    - DDM overall false negatives accumulated ``tn`` instead of ``fn``.
    - Elapsed times written to drifts.txt recomputed ``time.time() - start``
      after printing; now the captured ``end`` is reported consistently.
    - File handles are managed with ``with`` blocks.
    """
    overall_kswin_tp = overall_kswin_tn = overall_kswin_fp = overall_kswin_fn = 0
    overall_adwin_tp = overall_adwin_tn = overall_adwin_fp = overall_adwin_fn = 0
    overall_k_swmebwin_tp = overall_k_swmebwin_tn = overall_k_swmebwin_fp = overall_k_swmebwin_fn = 0
    overall_swmebwin_tp = overall_swmebwin_tn = overall_swmebwin_fp = overall_swmebwin_fn = 0
    overall_eddm_tp = overall_eddm_tn = overall_eddm_fp = overall_eddm_fn = 0
    overall_ddm_tp = overall_ddm_tn = overall_ddm_fp = overall_ddm_fn = 0

    for stream in streams:
        print(stream.name)
        with open('drifts.txt', 'a+') as f:
            f.write(f'**{stream.name}**\n\n')

        stream.prepare_for_use()
        stream.next_sample()

        adwin = []
        kswin = []
        ddm = DDM(min_num_instances=30)
        eddm = EDDM()
        data = []
        labels = []
        predictions = []
        kswin_drifts = []
        adwin_drifts = []
        k_swmebwin_drifts = []
        swmebwin_drifts = []
        eddm_drifts = []
        ddm_drifts = []
        swmebwin = SWMEBWIN(classes=stream.target_values, w_size=80, epsilon=0.05)
        # gamma maybe 1.0 / stream.current_sample_x.shape[1]
        k_swmebwin = Kernel_SWMEBWIN(classes=stream.target_values, w_size=80, epsilon=0.05)

        RANGE = 1000000
        DIM = 50
        # - 2 because first drift is at 2000 not 1000 and last drift is not detectable
        n_rand_dims = DIM - stream.current_sample_x.size
        multiply = n_rand_dims // stream.current_sample_x.size

        # Pre-train the error-rate learner on one padded sample.
        # NOTE(review): the original (whitespace-mangled) source appeared to
        # reset current_sample_x inside this loop; rebuilt to match the
        # padding pattern of the main loop below — confirm against history.
        current_sample_x = np.array([[]])
        for _m in range(multiply):
            current_sample_x = np.concatenate(
                (current_sample_x, stream.current_sample_x), axis=1)
        bayes = NaiveBayes()
        bayes.partial_fit(np.array(current_sample_x),
                          list(stream.current_sample_y.ravel()))

        # One univariate detector per dimension for ADWIN and KSWIN.
        for j in range(DIM):
            adwin.append(ADWIN(delta=0.002))
            kswin.append(KSWIN(w_size=300, stat_size=30, alpha=0.0001))

        """Add dims"""
        for i in range(RANGE):
            current_sample_x = np.array([[]])
            for _m in range(multiply):
                current_sample_x = np.concatenate(
                    (current_sample_x, stream.current_sample_x), axis=1)
            data.append(current_sample_x.ravel())
            labels.append(stream.current_sample_y.ravel()[0])
            # 0 = correct prediction, 1 = error (input for DDM/EDDM).
            predictions.append(0 if bayes.predict(current_sample_x) == labels[i] else 1)
            bayes.partial_fit(current_sample_x, list(stream.current_sample_y.ravel()))
            stream.next_sample()

        # Kernel SWMEBWIN
        start = time.time()
        for i in range(RANGE):
            k_swmebwin.add_element(value=data[i], label=labels[i])
            if k_swmebwin.change_detected is True:
                k_swmebwin_drifts.append(i)
        end = time.time() - start

        f1, tp, fp, tn, fn = confusion_matrix_stats(k_swmebwin_drifts, RANGE)
        overall_k_swmebwin_tp += tp
        overall_k_swmebwin_tn += tn
        overall_k_swmebwin_fp += fp
        overall_k_swmebwin_fn += fn
        print(f'F1-Score: {f1}')
        print(f'{tp} true positives, {fp} false positives')
        print(f'{tn} true negatives, {fn} false negatives')
        with open('drifts.txt', 'a+') as f:
            f.write(f'K-SWMEB detected {len(k_swmebwin_drifts)} drifts in {end} {k_swmebwin_drifts}\n\n')
        print(f'K-SW-MEBWIN took {end} sec and detected {len(k_swmebwin_drifts)} drifts\n')

        # SWMEBWIN
        start = time.time()
        for i in range(RANGE):
            swmebwin.add_element(value=data[i], label=labels[i])
            if swmebwin.change_detected is True:
                swmebwin_drifts.append(i)
        end = time.time() - start

        f1, tp, fp, tn, fn = confusion_matrix_stats(swmebwin_drifts, RANGE)
        overall_swmebwin_tp += tp
        overall_swmebwin_tn += tn
        overall_swmebwin_fp += fp
        overall_swmebwin_fn += fn
        print(f'F1-Score: {f1}')
        print(f'{tp} true positives, {fp} false positives')
        print(f'{tn} true negatives, {fn} false negatives')
        with open('drifts.txt', 'a+') as f:
            f.write(f'SWMEB detected {len(swmebwin_drifts)} drifts in {end} {swmebwin_drifts}\n\n')
        print(f'SW-MEBWIN took {end} sec and detected {len(swmebwin_drifts)} drifts\n')

        # ADWIN: drift if any per-dimension detector fires.
        start = time.time()
        for i in range(RANGE):
            adwin_detected = False
            for j in range(data[i].size):
                adwin[j].add_element(data[i][j])
                if adwin[j].detected_change():
                    adwin_detected = True
            if adwin_detected is True:
                adwin_drifts.append(i)
        end = time.time() - start

        f1, tp, fp, tn, fn = confusion_matrix_stats(adwin_drifts, RANGE)
        overall_adwin_tp += tp
        overall_adwin_tn += tn
        overall_adwin_fp += fp
        overall_adwin_fn += fn
        print(f'F1-Score: {f1}')
        print(f'{tp} true positives, {fp} false positives')
        print(f'{tn} true negatives, {fn} false negatives')
        with open('drifts.txt', 'a+') as f:
            f.write(f'ADWIN detected {len(adwin_drifts)} drifts in {end} at {adwin_drifts}\n\n')
        print(f'ADWIN took {end} sec and detected {len(adwin_drifts)} drifts\n')

        # KSWIN: drift if any per-dimension detector fires.
        start = time.time()
        for i in range(RANGE):
            kswin_detected = False
            for j in range(data[i].size):
                kswin[j].add_element(data[i][j])
                if kswin[j].detected_change():
                    kswin_detected = True
            if kswin_detected is True:
                kswin_drifts.append(i)
        end = time.time() - start

        f1, tp, fp, tn, fn = confusion_matrix_stats(kswin_drifts, RANGE)
        overall_kswin_tp += tp
        overall_kswin_tn += tn
        overall_kswin_fp += fp
        overall_kswin_fn += fn
        print(f'F1-Score: {f1}')
        print(f'{tp} true positives, {fp} false positives')
        print(f'{tn} true negatives, {fn} false negatives')
        with open('drifts.txt', 'a+') as f:
            f.write(f'KSWIN detected {len(kswin_drifts)} drifts in {end} at {kswin_drifts}\n\n')
        print(f'KSWIN took {end} sec and detected {len(kswin_drifts)} drifts\n')

        # EDDM: fed with the 0/1 error stream of the NaiveBayes learner.
        start = time.time()
        for i in range(RANGE):
            eddm.add_element(predictions[i])
            if eddm.detected_change():
                eddm_drifts.append(i)
        end = time.time() - start

        f1, tp, fp, tn, fn = confusion_matrix_stats(eddm_drifts, RANGE)
        overall_eddm_tp += tp
        overall_eddm_tn += tn
        overall_eddm_fp += fp
        overall_eddm_fn += fn
        print(f'F1-Score: {f1}')
        print(f'{tp} true positives, {fp} false positives')
        print(f'{tn} true negatives, {fn} false negatives')
        with open('drifts.txt', 'a+') as f:
            f.write(f'EDDM detected {len(eddm_drifts)} drifts in {end} at {eddm_drifts}\n\n')
        print(f'EDDM took {end} sec and detected {len(eddm_drifts)} drifts\n')

        # DDM: fed with the 0/1 error stream of the NaiveBayes learner.
        start = time.time()
        for i in range(RANGE):
            ddm.add_element(predictions[i])
            if ddm.detected_change():
                ddm_drifts.append(i)
        end = time.time() - start

        f1, tp, fp, tn, fn = confusion_matrix_stats(ddm_drifts, RANGE)
        overall_ddm_tp += tp
        overall_ddm_tn += tn
        overall_ddm_fp += fp
        # BUG FIX: original accumulated tn here, corrupting DDM's overall
        # false-negative count (and its overall F1).
        overall_ddm_fn += fn
        print(f'F1-Score: {f1}')
        print(f'{tp} true positives, {fp} false positives')
        print(f'{tn} true negatives, {fn} false negatives')
        with open('drifts.txt', 'a+') as f:
            f.write(f'DDM detected {len(ddm_drifts)} drifts in {end} at {ddm_drifts}\n\n')
        print(f'DDM took {end} sec and detected {len(ddm_drifts)} drifts\n')

    # OVERALL STATISTICS
    print(50 * '-')
    print('K-SWMEBWIN\n')
    print(f'Overall F1: {calc_f1(overall_k_swmebwin_tp, overall_k_swmebwin_fp, overall_k_swmebwin_tn, overall_k_swmebwin_fn)}')
    print(f'{overall_k_swmebwin_tp} true positives, {overall_k_swmebwin_fp} false positives')
    print(f'{overall_k_swmebwin_tn} true negatives, {overall_k_swmebwin_fn} false negatives')
    print(50 * '-')

    print(50 * '-')
    print('SWMEBWIN\n')
    print(f'Overall F1: {calc_f1(overall_swmebwin_tp, overall_swmebwin_fp, overall_swmebwin_tn, overall_swmebwin_fn)}')
    print(f'{overall_swmebwin_tp} true positives, {overall_swmebwin_fp} false positives')
    print(f'{overall_swmebwin_tn} true negatives, {overall_swmebwin_fn} false negatives')
    print(50 * '-')

    print(50 * '-')
    print('KSWIN\n')
    print(f'Overall F1: {calc_f1(overall_kswin_tp, overall_kswin_fp, overall_kswin_tn, overall_kswin_fn)}')
    print(f'{overall_kswin_tp} true positives, {overall_kswin_fp} false positives')
    print(f'{overall_kswin_tn} true negatives, {overall_kswin_fn} false negatives')
    print(50 * '-')

    print(50 * '-')
    print('ADWIN\n')
    print(f'Overall F1: {calc_f1(overall_adwin_tp, overall_adwin_fp, overall_adwin_tn, overall_adwin_fn)}')
    print(f'{overall_adwin_tp} true positives, {overall_adwin_fp} false positives')
    print(f'{overall_adwin_tn} true negatives, {overall_adwin_fn} false negatives')
    print(50 * '-')

    print(50 * '-')
    print('DDM\n')
    print(f'Overall F1: {calc_f1(overall_ddm_tp, overall_ddm_fp, overall_ddm_tn, overall_ddm_fn)}')
    print(f'{overall_ddm_tp} true positives, {overall_ddm_fp} false positives')
    print(f'{overall_ddm_tn} true negatives, {overall_ddm_fn} false negatives')
    print(50 * '-')

    print(50 * '-')
    print('EDDM\n')
    print(f'Overall F1: {calc_f1(overall_eddm_tp, overall_eddm_fp, overall_eddm_tn, overall_eddm_fn)}')
    print(f'{overall_eddm_tp} true positives, {overall_eddm_fp} false positives')
    print(f'{overall_eddm_tn} true negatives, {overall_eddm_fn} false negatives')
    print(50 * '-')