def test_bayesian_ruleset():
    """Fit a BayesianRuleset on the 'toy2' anomaly dataset from a fixed set of
    candidate conjunctive rules and print which rules the model selects."""
    from common.gen_samples import read_anomaly_dataset

    x, y = read_anomaly_dataset("toy2")
    y = np.asarray(y, dtype=np.int32)

    meta = get_feature_meta_default(x, y)
    # print(meta)

    # Precomputed candidate rules, found after 30 feedback iterations with AAD.
    # Each rule corresponds to the anomaly class (label = 1).
    compact_rules = [
        "(F1 > -0.431709) & (F1 <= 2.033541) & (F2 > 3.703597)",
        "(F1 > 4.752354) & (F1 <= 6.210754) & (F2 > 1.581015) & (F2 <= 3.592983)",
        "(F1 > 6.298735) & (F2 > -0.822048) & (F2 <= 3.740281)",
    ]

    rules = convert_strings_to_conjunctive_rules(compact_rules, meta)
    print("Candidate rules:")
    print(" %s" % "\n ".join(str(rule) for rule in rules))

    # sanity_check_bayesian_ruleset(x, y, rules, meta)

    ruleset = BayesianRuleset(
        meta=meta,
        opts=None,
        maxlen=get_max_len_in_rules(rules),
        max_iter=200,
        n_min_support_stop=20,
    )
    ruleset.fit(x, y, rules)

    print("Selected rules:")
    for idx in ruleset.predicted_rules:
        print(" rule %d: %s" % (idx, str(ruleset.rules[idx])))
def test_kl_data_drift_classifier():
    """Stream a dataset window-by-window, compare each window's per-tree
    KL-divergence against a reference distribution built from the first window,
    and log which Random Forest trees exceed the alpha-level threshold."""
    logger = logging.getLogger(__name__)

    args = get_command_args(debug=False)
    configure_logger(args)

    alpha = 0.05
    n_trees = 100
    dataset_config = dataset_configs[args.dataset]
    stream_window = dataset_config[2]

    X_full, y_full = read_anomaly_dataset(args.dataset)
    logger.debug("dataset: %s (%d, %d), stream_window: %d, alpha: %0.3f" %
                 (args.dataset, X_full.shape[0], X_full.shape[1],
                  stream_window, alpha))

    stream = DataStream(X_full, y_full, IdServer(initial=0))

    # first window of data serves as the training set
    batch = stream.read_next_from_stream(stream_window)
    x, y, ids = batch.x, batch.y, batch.ids
    logger.debug("First window loaded (%s): %d" % (args.dataset, x.shape[0]))

    # train the classifier on the first window
    rf = RF(n_estimators=n_trees)
    rf.fit(x, y)
    logger.debug("Random Forest classifier created with %d trees" % rf.n_estimators)

    # wrapper over the classifier which computes KL-divergences;
    # the wrapped clf is the scikit-learn Random Forest instance
    model = RandomForestAadWrapper(x=x, y=y, clf=rf)
    logger.debug("Wrapper model created with %d nodes" % len(model.w))

    # KL replacement threshold computed *without* a reference distribution p
    ref_kls, kl_q_alpha = model.get_KL_divergence_distribution(x, p=None, alpha=alpha)
    # initialize the reference distribution p from the first window
    p = model.get_node_sample_distributions(x)

    w_idx = 0
    while not stream.empty():
        w_idx += 1

        # next window of data, checked for drift against p
        batch = stream.read_next_from_stream(n=stream_window)
        x, y = batch.x, batch.y
        logger.debug("window %d loaded: %d" % (w_idx, x.shape[0]))

        # per-tree KL-divergence of the current window w.r.t. reference dist p
        window_kls, _ = model.get_KL_divergence_distribution(x, p=p)

        # trees whose divergence exceeds the alpha-level threshold
        drifting_trees = model.get_trees_to_replace(window_kls, kl_q_alpha)
        n_threshold = int(2 * alpha * n_trees)
        logger.debug("[%d] #trees_exceeding_kl_q_alpha: %d, threshold number of trees: %d\n%s" %
                     (w_idx, len(drifting_trees), n_threshold,
                      str(list(drifting_trees))))
def test_kl_data_drift():
    """Stream a dataset window-by-window, track per-tree KL-divergence drift of
    an isolation forest, replace drifting trees when enough of them exceed the
    alpha-level threshold, and optionally plot the per-window KL distributions.

    Fixes vs. previous version:
      - removed two unreachable ``if False:`` debug branches (dead code);
      - the green scatter of compared KLs now sizes its x-coordinate array by
        ``len(com_kls)`` instead of ``len(ref_kls)`` (identical when the two
        arrays have equal length, correct if they ever differ).
    """
    logger = logging.getLogger(__name__)

    args = get_command_args(debug=False, debug_args=["--debug",
                                                     "--plot",
                                                     "--log_file=temp/test_concept_drift.log"])
    configure_logger(args)

    np.random.seed(42)

    dataset_config = dataset_configs[args.dataset]
    stream_window = dataset_config[2]  # per-dataset window size
    alpha = 0.05

    X_full, y_full = read_anomaly_dataset(args.dataset)
    logger.debug("dataset: %s (%d, %d), stream_window: %d, alpha: %0.3f" %
                 (args.dataset, X_full.shape[0], X_full.shape[1],
                  stream_window, alpha))

    stream = DataStream(X_full, y_full, IdServer(initial=0))
    training_set = stream.read_next_from_stream(stream_window)
    x, y, ids = training_set.x, training_set.y, training_set.ids

    model = get_iforest_model(x)

    # per-window history, used for plotting at the end
    all_kl_q_alpha = list()
    all_reference_kls = list()
    all_compare_kls = list()
    trees_replaced = list()

    # compute KL replacement threshold *without* p
    ref_kls, kl_q_alpha = model.get_KL_divergence_distribution(x, p=None, alpha=alpha)
    # now initialize reference p
    p = model.get_node_sample_distributions(x)

    max_kl = np.max(ref_kls)  # running max, used for the plot's y-limit

    window = 0  # already read the first window
    while True:
        buffer = stream.read_next_from_stream(stream_window)
        if buffer is None:
            break
        window += 1
        x, y, ids = buffer.x, buffer.y, buffer.ids
        model.add_samples(X=x)

        all_kl_q_alpha.append(kl_q_alpha)
        all_reference_kls.append(ref_kls)

        # compare KL-divergence of current data dist against reference dist p
        comp_kls, _ = model.get_KL_divergence_distribution(x, p=p)
        all_compare_kls.append(comp_kls)
        max_kl = max(max_kl, np.max(comp_kls))

        # find which trees exceed alpha-level threshold
        replace_trees_by_kl = model.get_trees_to_replace(comp_kls, kl_q_alpha)

        n_trees = model.clf.n_estimators
        n_replace = 0 if replace_trees_by_kl is None else len(replace_trees_by_kl)
        n_threshold = int(2 * alpha * n_trees)
        # we will replace if 2*alpha number of trees exceed the alpha-threshold
        do_replace = n_trees > 0 and n_replace >= n_threshold
        logger.debug("window %d: n_replace: %d, threshold num: %d, do_replace: %s" %
                     (window, n_replace, n_threshold, str(do_replace)))
        if do_replace:
            trees_replaced.append(len(replace_trees_by_kl))
            model.update_model_from_stream_buffer(replace_trees=replace_trees_by_kl)
            # recompute KL replacement threshold *without* p
            ref_kls, kl_q_alpha = model.get_KL_divergence_distribution(x, p=None, alpha=alpha)
            max_kl = max(max_kl, np.max(ref_kls))
            # now recompute reference p
            p = model.get_node_sample_distributions(x)
        else:
            trees_replaced.append(0)

    if args.plot:
        xlim = [0, window + 1]
        ylim = [0, max_kl + 3]  # leave headroom for the "(n)" replaced-trees annotation
        dp = DataPlotter(pdfpath="./temp/test_concept_drift_%s.pdf" % args.dataset,
                         rows=1, cols=1)
        pl = dp.get_next_plot()
        plt.xlim(xlim)
        plt.ylim(ylim)
        plt.xlabel('window')
        plt.ylabel('KL-divergence')
        for i in range(window):
            # labels are set only on the first window so the legend gets
            # exactly one entry per series
            ref_label = com_label = threshold_label = replaced_label = None
            ref_kls = all_reference_kls[i]
            com_kls = all_compare_kls[i]
            mkl = max(np.max(ref_kls), np.max(com_kls))
            x_coord = i + 1
            replaced_y_coord = mkl + 2
            if i == 0:
                ref_label = "ref. KL dist"
                com_label = "KL-dist w.r.t ref. dist"
                threshold_label = "%0.2f-alpha KL" % alpha
                replaced_label = "(.) - number of trees replaced"
            # invisible marker (s=0) whose only purpose is the legend entry
            pl.scatter([x_coord], [replaced_y_coord], color="black", marker=".",
                       s=0, label=replaced_label)
            pl.scatter(np.ones(len(ref_kls), dtype=np.float32) * x_coord, ref_kls,
                       color="orange", marker="*", s=8, label=ref_label)
            pl.scatter([x_coord], [all_kl_q_alpha[i]], color="red", marker="+",
                       s=30, label=threshold_label)
            # offset by 0.1 so compared KLs don't overlap the reference KLs;
            # sized by len(com_kls) (was len(ref_kls)) to match the plotted data
            pl.scatter(np.ones(len(com_kls), dtype=np.float32) * x_coord + 0.1, com_kls,
                       color="green", marker="*", s=8, label=com_label)
            pl.text(x_coord - 0.2, replaced_y_coord, "(%d)" % trees_replaced[i],
                    fontsize=8, label=replaced_label)
        pl.legend(loc='upper left', prop={'size': 6})
        dp.close()