def plot_sidebar(hts, dash_xy, dash_wh, pl):
    """Draws a sidebar rectangle and marks the normalized heights as red dots."""
    pl.add_patch(plt.Rectangle(dash_xy, dash_wh[0], dash_wh[1],
                               facecolor='white', edgecolor='black', alpha=1))
    # x: horizontal center of the sidebar; y: heights scaled into the sidebar
    dash_pts = cbind(np.ones(len(hts), dtype=float) * (dash_wh[0] / 2) + dash_xy[0],
                     hts * dash_wh[1] + dash_xy[1])
    # print dash_pts
    pl.plot(dash_pts[:, 0], dash_pts[:, 1], 'ro', markersize=3, markerfacecolor='red')
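# Illustrative usage sketch (not part of the original module): draw the
# sidebar on a matplotlib Axes with made-up normalized heights. This assumes
# `pl` is a matplotlib Axes and that values in `hts` lie in [0, 1] so they
# scale into the sidebar's height.
def _demo_plot_sidebar():
    import numpy as np
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    ax.set_xlim(0., 1.)
    ax.set_ylim(0., 1.)
    hts = np.random.uniform(0., 1., 10)  # hypothetical normalized anomaly scores
    plot_sidebar(hts, dash_xy=(0.85, 0.05), dash_wh=(0.1, 0.9), pl=ax)
    plt.show()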
def aad_learn_ensemble_weights_with_budget(self, ensemble, opts):
    if opts.budget == 0:
        return None
    x = ensemble.scores
    y = ensemble.labels
    n, m = x.shape
    bt = get_budget_topK(n, opts)
    metrics = get_alad_metrics_structure(opts.budget, opts)
    ha = []   # indexes of queried instances labeled anomaly
    hn = []   # indexes of queried instances labeled nominal
    xis = []  # all queried indexes, in query order

    qstate = Query.get_initial_query_state(opts.qtype, opts=opts, qrank=bt.topK,
                                           a=1., b=1., budget=bt.budget)

    metrics.all_weights = np.zeros(shape=(opts.budget, m))

    w_unif_prior = self.get_uniform_weights(m)
    if self.w is None:
        self.w = w_unif_prior

    for i in range(bt.budget):
        starttime_iter = timer()

        # save the weights in each iteration for later analysis
        metrics.all_weights[i, :] = self.w
        metrics.queried = xis  # xis keeps growing with each feedback iteration

        order_anom_idxs, anom_score = self.order_by_score(x, self.w)

        if False and y is not None and metrics is not None:  # DEBUG: disabled metrics collection
            # gather AUC metrics
            metrics.train_aucs[0, i] = fn_auc(cbind(y, -anom_score))

            # gather Precision metrics
            prec = fn_precision(cbind(y, -anom_score), opts.precision_k)
            metrics.train_aprs[0, i] = prec[len(opts.precision_k) + 1]
            train_n_at_top = get_anomalies_at_top(-anom_score, y, opts.precision_k)
            for k in range(len(opts.precision_k)):
                metrics.train_precs[k][0, i] = prec[k]
                metrics.train_n_at_top[k][0, i] = train_n_at_top[k]

        xi_ = qstate.get_next_query(maxpos=n, ordered_indexes=order_anom_idxs,
                                    queried_items=xis,
                                    x=x, lbls=y, y=anom_score,
                                    w=self.w, hf=append(ha, hn),
                                    remaining_budget=opts.budget - i)
        # logger.debug("xi: %d" % (xi,))
        xi = xi_[0]
        xis.append(xi)
        metrics.test_indexes.append(qstate.test_indexes)

        if opts.single_inst_feedback:
            # Forget the previous feedback instances and
            # use only the current feedback for weight updates
            ha = []
            hn = []

        if y[xi] == 1:
            ha.append(xi)
        else:
            hn.append(xi)

        qstate.update_query_state(rewarded=(y[xi] == 1))

        if opts.batch:
            # Use the original (uniform) weights as prior
            # This is an experimental option ...
            self.w = w_unif_prior
            hf = order_anom_idxs[0:i]
            ha = hf[np.where(y[hf] == 1)[0]]
            hn = hf[np.where(y[hf] == 0)[0]]

        self.update_weights(x, y, ha=ha, hn=hn, opts=opts, w=self.w)

        if np.mod(i, 1) == 0:
            endtime_iter = timer()
            tdiff = difftime(endtime_iter, starttime_iter, units="secs")
            logger.debug("Completed [%s] fid %d rerun %d feedback %d in %f sec(s)" %
                         (opts.dataset, opts.fid, opts.runidx, i, tdiff))

    return metrics
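# Hypothetical sketch (not from this codebase) of what order_by_score is
# assumed to do in the loop above: compute a linear anomaly score w . x per
# instance and return the instance indexes sorted by decreasing score, so the
# most anomalous candidates are ranked first.
def _order_by_score_sketch(x, w):
    import numpy as np
    anom_score = x.dot(w)            # one linear ensemble score per instance
    order = np.argsort(-anom_score)  # indexes, most anomalous first
    return order, anom_score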
def main():
    if False:  # DEBUG
        args = prepare_forest_aad_debug_args()
    else:  # PRODUCTION
        args = get_command_args(debug=False)
    # print "log file: %s" % args.log_file
    configure_logger(args)

    opts = Opts(args)
    # print opts.str_opts()
    logger.debug(opts.str_opts())

    if not opts.streaming:
        raise ValueError("Only streaming supported")

    X_full, y_full = read_data(opts)
    # X_train = X_train[0:10, :]
    # labels = labels[0:10]

    logger.debug("loaded file: (%s) %s" % (str(X_full.shape), opts.datafile))
    logger.debug("results dir: %s" % opts.resultsdir)

    all_num_seen = None
    all_num_seen_baseline = None
    all_window = None
    all_window_baseline = None

    aucs = np.zeros(0, dtype=float)

    opts.fid = 1
    for runidx in opts.get_runidxs():
        tm_run = Timer()
        opts.set_multi_run_options(opts.fid, runidx)

        stream = DataStream(X_full, y_full)
        X_train, y_train = stream.read_next_from_stream(opts.stream_window)

        # logger.debug("X_train:\n%s\nlabels:\n%s" % (str(X_train), str(list(labels))))

        model = prepare_aad_model(X_train, y_train, opts)  # initial model training
        sad = StreamingAnomalyDetector(stream, model,
                                       unlabeled_x=X_train, unlabeled_y=y_train,
                                       max_buffer=opts.stream_window, opts=opts)
        sad.init_query_state(opts)

        if False:  # use for DEBUG only
            run_feedback(sad, 0, opts.budget, opts)
            print("This is experimental/demo code for streaming integration and will be application specific." +
                  " Exiting after reading max %d instances from stream and iterating for %d feedback..." %
                  (opts.stream_window, opts.budget))
            exit(0)

        all_scores = np.zeros(0)
        all_y = np.zeros(0, dtype=int)

        scores = sad.get_anomaly_scores(X_train)
        # auc = fn_auc(cbind(y_train, -scores))
        all_scores = np.append(all_scores, scores)
        all_y = np.append(all_y, y_train)

        iter = 0
        seen = np.zeros(0, dtype=int)
        seen_baseline = np.zeros(0, dtype=int)
        stream_window_tmp = np.zeros(0, dtype=int)
        stream_window_baseline = np.zeros(0, dtype=int)
        stop_iter = False
        while not stop_iter:
            iter += 1
            tm = Timer()

            seen_, seen_baseline_, queried_, queried_baseline_ = run_feedback(
                sad, opts.min_feedback_per_window, opts.max_feedback_per_window, opts)

            seen = append(seen, seen_)
            seen_baseline = append(seen_baseline, seen_baseline_)
            stream_window_tmp = append(stream_window_tmp, np.ones(len(seen_)) * iter)
            stream_window_baseline = append(stream_window_baseline,
                                            np.ones(len(seen_baseline_)) * iter)
            # queried = append(queried, queried_)
            # queried_baseline = append(queried_baseline, queried_baseline_)
            # logger.debug("seen:\n%s;\nbaseline:\n%s" % (str(list(seen)), str(list(seen_baseline))))

            x_eval, y_eval = sad.get_next_from_stream(sad.max_buffer)
            if x_eval is None or iter >= opts.max_windows:
                if iter >= opts.max_windows:
                    logger.debug("Exceeded %d iters; exiting stream read..." % opts.max_windows)
                stop_iter = True
            else:
                scores = sad.get_anomaly_scores(x_eval)  # compute scores before updating the model
                all_scores = np.append(all_scores, scores)
                all_y = np.append(all_y, y_eval)

                if opts.allow_stream_update:
                    sad.update_model_from_buffer()
                sad.move_buffer_to_unlabeled()

            logger.debug(tm.message("Stream window [%d]: algo [%d/%d]; baseline [%d/%d]: " %
                                    (iter, np.sum(seen), len(seen),
                                     np.sum(seen_baseline), len(seen_baseline))))

        auc = fn_auc(cbind(all_y, -all_scores))
        # logger.debug("AUC: %f" % auc)
        aucs = append(aucs, [auc])

        # queried_baseline = order(all_scores, decreasing=True)[0:opts.budget]
        num_seen_tmp = np.cumsum(seen)  # np.cumsum(all_y[queried])
        # logger.debug("\nnum_seen : %s" % (str(list(num_seen_tmp)),))

        num_seen_baseline = np.cumsum(seen_baseline)  # np.cumsum(all_y[queried_baseline])
        # logger.debug("Numseen in %d budget (overall):\n%s" % (opts.budget, str(list(num_seen_baseline))))

        stream_window_baseline = append(np.array([opts.fid, opts.runidx],
                                                 dtype=stream_window_baseline.dtype),
                                        stream_window_baseline)
        stream_window = np.ones(len(stream_window_baseline) + 2,
                                dtype=stream_window_tmp.dtype) * -1
        stream_window[0:2] = [opts.fid, opts.runidx]
        stream_window[2:(2 + len(stream_window_tmp))] = stream_window_tmp

        # queried = append(np.array([opts.fid, opts.runidx], dtype=queried.dtype), queried)
        # queried_baseline = append(np.array([opts.fid, opts.runidx], dtype=queried_baseline.dtype), queried_baseline)

        # num_seen_baseline has the uniformly maximum number of queries.
        # the number of queries in num_seen will vary under the query confidence mode
        num_seen = np.ones(len(num_seen_baseline) + 2, dtype=num_seen_tmp.dtype) * -1
        num_seen[0:2] = [opts.fid, opts.runidx]
        num_seen[2:(2 + len(num_seen_tmp))] = num_seen_tmp
        num_seen_baseline = append(np.array([opts.fid, opts.runidx],
                                            dtype=num_seen_baseline.dtype),
                                   num_seen_baseline)

        # all_queried = rbind(all_queried, matrix(queried, nrow=1))
        # all_queried_baseline = rbind(all_queried_baseline, matrix(queried_baseline, nrow=1))
        all_num_seen = rbind(all_num_seen, matrix(num_seen, nrow=1))
        all_num_seen_baseline = rbind(all_num_seen_baseline, matrix(num_seen_baseline, nrow=1))
        all_window = rbind(all_window, matrix(stream_window, nrow=1))
        all_window_baseline = rbind(all_window_baseline, matrix(stream_window_baseline, nrow=1))

        logger.debug(tm_run.message("Completed runidx: %d" % runidx))

    results = SequentialResults(num_seen=all_num_seen,
                                # true_queried_indexes=all_queried,
                                num_seen_baseline=all_num_seen_baseline,
                                # true_queried_indexes_baseline=all_queried_baseline,
                                stream_window=all_window,
                                stream_window_baseline=all_window_baseline,
                                aucs=aucs)
    write_sequential_results_to_csv(results, opts)
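# Minimal sketch, assuming DataStream simply serves consecutive windows of at
# most `n` rows from the full dataset and signals exhaustion with (None, None);
# the actual class used by main() may behave differently.
class _DataStreamSketch(object):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.pos = 0

    def read_next_from_stream(self, n):
        if self.pos >= self.X.shape[0]:
            return None, None  # stream exhausted
        X_w = self.X[self.pos:self.pos + n, :]
        y_w = self.y[self.pos:self.pos + n]
        self.pos += X_w.shape[0]
        return X_w, y_w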
def aad_ensemble(self, ensemble, opts):
    if opts.budget == 0:
        return None
    x = ensemble.scores
    y = ensemble.labels
    n, m = x.shape
    bt = get_budget_topK(n, opts)
    metrics = get_alad_metrics_structure(opts.budget, opts)
    ha = []
    hn = []
    xis = []

    w_unifprior = np.ones(m, dtype=float)
    w_unifprior = w_unifprior / np.sqrt(w_unifprior.dot(w_unifprior))
    # logger.debug("w_prior:")
    # logger.debug(w_unifprior)

    qstate = Query.get_initial_query_state(opts.qtype, opts=opts, qrank=bt.topK)

    metrics.all_weights = np.zeros(shape=(opts.budget, m))

    w_unif_prior = self.get_uniform_weights(m)
    if self.w is None:
        self.w = w_unif_prior

    for i in range(bt.budget):
        starttime_iter = timer()

        # save the weights in each iteration for later analysis
        metrics.all_weights[i, :] = self.w
        metrics.queried = xis  # xis keeps growing with each feedback iteration

        order_anom_idxs = self.order_by_score(x)

        if True:
            anom_score = self.get_score(x, self.w)
            # gather AUC metrics
            metrics.train_aucs[0, i] = fn_auc(cbind(y, -anom_score))
            # gather Precision metrics
            prec = fn_precision(cbind(y, -anom_score), opts.precision_k)
            metrics.train_aprs[0, i] = prec[len(opts.precision_k) + 1]
            train_n_at_top = get_anomalies_at_top(-anom_score, y, opts.precision_k)
            for k in range(len(opts.precision_k)):
                metrics.train_precs[k][0, i] = prec[k]
                metrics.train_n_at_top[k][0, i] = train_n_at_top[k]

        xi_ = qstate.get_next_query(maxpos=n, ordered_indexes=order_anom_idxs,
                                    queried_items=xis,
                                    x=x, lbls=y,
                                    w=self.w, hf=append(ha, hn),
                                    remaining_budget=opts.budget - i)
        xi = xi_[0]
        # logger.debug("xi: %d" % (xi,))
        xis.append(xi)

        if opts.single_inst_feedback:
            # Forget the previous feedback instances and
            # use only the current feedback for weight updates
            ha = []
            hn = []

        if y[xi] == 1:
            ha.append(xi)
        else:
            hn.append(xi)

        qstate.update_query_state(rewarded=(y[xi] == 1))

        if opts.batch:
            # Use the original (uniform) weights as prior
            self.w = w_unif_prior
            hf = np.arange(i)
            ha = hf[np.where(y[hf] == 1)[0]]
            hn = hf[np.where(y[hf] == 0)[0]]

        if opts.unifprior:
            w_prior = w_unif_prior
        else:
            w_prior = self.w

        tau_rel = opts.constrainttype == AAD_CONSTRAINT_TAU_INSTANCE
        if opts.detector_type == AAD_IFOREST:
            self.w = self.if_aad_weight_update(self.w, x, y, hf=append(ha, hn),
                                               w_prior=w_prior, opts=opts, tau_rel=tau_rel)
        elif opts.detector_type == ATGP_IFOREST:
            w_soln = weight_update_iter_grad(ensemble.scores, ensemble.labels,
                                             hf=append(ha, hn),
                                             Ca=opts.Ca, Cn=opts.Cn, Cx=opts.Cx,
                                             topK=bt.topK, max_iters=1000)
            self.w = w_soln.w
        else:
            raise ValueError("Invalid weight update for IForest: %d" % opts.detector_type)

        # logger.debug("w_new:")
        # logger.debug(w_new)

        if np.mod(i, 1) == 0:
            endtime_iter = timer()
            tdiff = difftime(endtime_iter, starttime_iter, units="secs")
            logger.debug("Completed [%s] fid %d rerun %d feedback %d in %f sec(s)" %
                         (opts.dataset, opts.fid, opts.runidx, i, tdiff))

    return metrics
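# Hedged sketch of get_uniform_weights as called above: judging by the local
# w_unifprior computation in aad_ensemble, it is assumed to return the uniform
# unit-norm weight vector over the m ensemble members (each entry 1/sqrt(m)).
def _get_uniform_weights_sketch(m):
    import numpy as np
    w = np.ones(m, dtype=float)
    return w / np.sqrt(w.dot(w))  # equivalent to np.ones(m) / np.sqrt(m)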