def get_closest_indexes(inst, test_set, num=1, dest_set=None):
    """Returns the indexes of the num instances in test_set closest to inst.

    Distances are squared euclidean. If dest_set is provided, the returned
    indexes are also added to it.
    """
    n = test_set.shape[0]
    dists = np.zeros(n)
    for i in np.arange(n):
        ts = test_set[i, :]
        if ts.shape[0] > 1:
            # dense matrix: ts is a 1-d array of features
            ts = matrix(ts, nrow=1)
            diff = inst - ts
            dist = np.sum(diff ** 2)
        else:
            # sparse matrix: ts is a 1 x d row; use the dot product
            diff = inst - ts
            tmp = diff * diff.T
            if tmp.shape[0] != 1:
                raise ValueError("dot product is %s" % str(tmp.shape))
            dist = tmp[0, 0]
        dists[i] = dist
    ordered = np.argsort(dists)[np.arange(num)]
    if False:
        logger.debug("last ts:\n%s" % str(ts))
        logger.debug("last diff:\n%s" % str(diff))
        logger.debug("ordered indexes: %s" % str(list(ordered)))
        logger.debug("dists: %s" % str(list(dists[ordered])))
        # logger.debug("dists: %s" % str(list(dists)))
        logger.debug("inst:\n%s" % str(inst))
        logger.debug("points:\n%s" % str(test_set[ordered, :]))
        ts = test_set[ordered[1], :]
        ts = matrix(ts, nrow=1)
        logger.debug("dist 2:\n%s" % str(np.sum((inst - ts) ** 2)))
    if dest_set is not None:
        for indx in ordered:
            dest_set.add(indx)
    return ordered
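

# A minimal usage sketch for get_closest_indexes() with a dense test set
# (illustrative only; matrix() is this repo's reshape helper):
def _closest_indexes_sketch():
    rng = np.random.RandomState(42)
    test_set = rng.randn(50, 2)            # 50 candidate points in 2-d
    inst = matrix(test_set[7, :], nrow=1)  # query with a known member
    dest = set()
    nearest = get_closest_indexes(inst, test_set, num=3, dest_set=dest)
    # nearest[0] == 7 since the query itself is among the candidates
    return nearest, dest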
def _transform_to_region_features_with_lookup(self, x, x_new):
    """ Transforms from original feature space to IF node space

    NOTE: This has been deprecated. Will be removed in future.

    Performs the conversion tree-by-tree. Even with batching by trees,
    this requires a lot of intermediate memory. Hence we do not use
    this method...

    :param x:
    :param x_new:
    :return:
    """
    starttime = timer()
    n = x_new.shape[0]
    for i, tree in enumerate(self.clf.estimators_):
        node_regions = self.all_node_regions[i]
        for j in range(n):
            tree_paths = self.get_decision_path(matrix(x[j, :], nrow=1), tree)
            k = len(tree_paths[0])
            for node_idx in tree_paths[0]:
                region_id = node_regions[node_idx]
                x_new[j, region_id] = self.get_region_score_for_instance_transform(region_id, k)
            if j >= 100000 and j % 20000 == 0:
                endtime = timer()
                tdiff = difftime(endtime, starttime, units="secs")
                logger.debug("processed %d/%d trees, %d/%d (%f) in %f sec(s)" %
                             (i, len(self.clf.estimators_), j + 1, n,
                              (j + 1) * 1. / n, tdiff))
def get_sgd_batch(x, y, i, batch_size, shuffled_idxs=None):
    """Returns the i-th minibatch of x (as a dense matrix) and its labels."""
    s = i * batch_size
    e = min(x.shape[0], (i + 1) * batch_size)
    if shuffled_idxs is None:
        idxs = np.arange(s, e)
    else:
        idxs = shuffled_idxs[np.arange(s, e)]
    return matrix(x[idxs, :], ncol=x.shape[1]), y[idxs]
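

# A minimal sketch of the loop that typically drives get_sgd_batch() (an
# illustration, not code from this repo): shuffle once per epoch, then pull
# consecutive batches by batch index. grad_fn is a hypothetical callback
# that returns the gradient of the loss on one minibatch.
def _sgd_epoch_sketch(x, y, w, grad_fn, batch_size=32, learning_rate=0.01):
    shuffled_idxs = np.random.permutation(x.shape[0])
    n_batches = int(np.ceil(x.shape[0] * 1.0 / batch_size))
    for i in range(n_batches):
        x_b, y_b = get_sgd_batch(x, y, i, batch_size, shuffled_idxs=shuffled_idxs)
        w = w - learning_rate * grad_fn(w, x_b, y_b)
    return w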
def move_unlabeled_to_labeled(self, xi, yi):
    unlabeled_idx = xi - self.get_num_labeled()
    self.labeled_x = rbind(self.labeled_x,
                           matrix(self.unlabeled_x[unlabeled_idx], nrow=1))
    if self.labeled_y is None:
        self.labeled_y = np.array([yi], dtype=int)
    else:
        self.labeled_y = np.append(self.labeled_y, [yi])
    # remove the instance from the unlabeled set with a boolean mask
    mask = np.ones(self.unlabeled_x.shape[0], dtype=bool)
    mask[unlabeled_idx] = False
    self.unlabeled_x = self.unlabeled_x[mask]
    self.unlabeled_y = self.unlabeled_y[mask]
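

# The boolean-mask removal above is the standard numpy idiom for deleting a
# row. A standalone equivalent (illustrative, numpy only):
def _drop_row_sketch(a, row):
    # equivalent to np.delete(a, row, axis=0); returns a copy without `row`
    mask = np.ones(a.shape[0], dtype=bool)
    mask[row] = False
    return a[mask]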
def plot_aad_2D(x, y, x_forest, xx, yy, forest, metrics,
                outputdir, dash_xy, dash_wh):
    # use this to plot the AAD feedback
    x_test = np.c_[xx.ravel(), yy.ravel()]
    x_if = forest.transform_to_region_features(x_test, dense=False)
    queried = np.array(metrics.queried)
    for i, q in enumerate(queried):
        pdfpath = "%s/iter_%02d.pdf" % (outputdir, i)
        dp = DataPlotter(pdfpath=pdfpath, rows=1, cols=1)
        pl = dp.get_next_plot()
        w = metrics.all_weights[i, :]
        Z = forest.get_score(x_if, w)
        Z = Z.reshape(xx.shape)
        pl.contourf(xx, yy, Z, 20, cmap=plt.cm.get_cmap('jet'))
        dp.plot_points(x, pl, labels=y, lbl_color_map={0: "grey", 1: "red"}, s=25)
        # overlay the instances queried so far
        dp.plot_points(matrix(x[queried[np.arange(i + 1)], :], nrow=i + 1),
                       pl, labels=y[queried[np.arange(i + 1)]], defaultcol="red",
                       lbl_color_map={0: "green", 1: "red"},
                       edgecolor=None, facecolors=True,
                       marker=matplotlib.markers.MarkerStyle('o', fillstyle=None), s=35)
        # plot the sidebar
        anom_scores = forest.get_score(x_forest, w)
        anom_order = np.argsort(-anom_scores)
        anom_idxs = np.where(y[anom_order] == 1)[0]
        dash = 1 - (anom_idxs * 1.0 / x.shape[0])
        plot_sidebar(dash, dash_xy, dash_wh, pl)
        dp.close()
def transform_to_region_features_sparse_bkp(self, x):
    """ Transforms from original feature space to IF node space

    The conversion to sparse vectors seems to take a lot of intermediate
    memory in python. This is why we are converting the vectors in smaller
    batches. The transformation is a one-time task, hence not a concern in
    most cases.

    :param x:
    :return:
    """
    # logger.debug("transforming to IF feature space...")
    n = x.shape[0]
    m = len(self.d)
    batch_size = 10000
    start_batch = 0
    end_batch = min(start_batch + batch_size, n)
    x_new = csr_matrix((0, m), dtype=float)
    while start_batch < end_batch:
        starttime = timer()
        x_tmp = matrix(x[start_batch:end_batch, :], ncol=x.shape[1])
        x_tmp_new = lil_matrix((end_batch - start_batch, m), dtype=x_new.dtype)
        for i, tree in enumerate(self.clf.estimators_):
            n_tmp = x_tmp.shape[0]
            node_regions = self.all_node_regions[i]
            tree_paths = self.get_decision_path(x_tmp, tree)
            for j in range(n_tmp):
                k = len(tree_paths[j])
                for node_idx in tree_paths[j]:
                    region_id = node_regions[node_idx]
                    x_tmp_new[j, region_id] = self.get_region_score_for_instance_transform(region_id, k)
        if n >= 100000:
            endtime = timer()
            tdiff = difftime(endtime, starttime, units="secs")
            logger.debug("processed %d/%d (%f); batch %d in %f sec(s)" %
                         (end_batch, n, end_batch * 1. / n, batch_size, tdiff))
        x_new = vstack([x_new, x_tmp_new.tocsr()])
        start_batch = end_batch
        end_batch = min(start_batch + batch_size, n)
    return x_new
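

# The batching pattern above (fill each chunk as LIL, convert to CSR, then
# vstack) is a common scipy.sparse idiom: LIL is cheap to update
# cell-by-cell, while CSR is cheap to stack and efficient afterwards. A
# standalone sketch of the same pattern (csr_matrix, lil_matrix, vstack and
# np assumed imported as elsewhere in this module; fill_fn is a hypothetical
# callback that yields (column, value) pairs for row j):
def _batched_sparse_fill_sketch(n, m, fill_fn, batch_size=10000):
    out = csr_matrix((0, m), dtype=float)
    start = 0
    while start < n:
        end = min(start + batch_size, n)
        chunk = lil_matrix((end - start, m), dtype=float)
        for j in range(start, end):
            for col, val in fill_fn(j):
                chunk[j - start, col] = val
        out = vstack([out, chunk.tocsr()])
        start = end
    return out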
def plot_iforest_baseline_contours_2D(x, y, x_iforest, xx, yy, budget, if_model,
                                      pdfpath_if_contours, dash_xy, dash_wh):
    # use this to plot baseline query points.
    w = np.ones(len(if_model.d), dtype=float)
    w = w / w.dot(w)  # normalized uniform weights
    baseline_scores = if_model.get_score(x_iforest, w)
    queried = np.argsort(-baseline_scores)
    n_found = np.cumsum(y[queried[np.arange(budget)]])
    print(n_found)
    dp = DataPlotter(pdfpath=pdfpath_if_contours, rows=1, cols=1)
    pl = dp.get_next_plot()
    x_test = np.c_[xx.ravel(), yy.ravel()]
    x_if = if_model.transform_to_region_features(x_test, dense=False)
    y_if = if_model.get_score(x_if, w)
    Z = y_if.reshape(xx.shape)
    pl.contourf(xx, yy, Z, 20)
    dp.plot_points(x, pl, labels=y, lbl_color_map={0: "grey", 1: "red"}, s=25)
    # overlay the instances queried within the budget
    dp.plot_points(matrix(x[queried[np.arange(budget)], :], nrow=budget),
                   pl, labels=y[queried[np.arange(budget)]], defaultcol="red",
                   lbl_color_map={0: "green", 1: "red"}, edgecolor="black",
                   marker=matplotlib.markers.MarkerStyle('o', fillstyle=None), s=35)
    # plot the sidebar
    anom_idxs = np.where(y[queried] == 1)[0]
    dash = 1 - (anom_idxs * 1.0 / x.shape[0])
    plot_sidebar(dash, dash_xy, dash_wh, pl)
    dp.close()
def plot_forest_contours_2D(x, y, xx, yy, budget, forest,
                            pdfpath_contours, dash_xy, dash_wh):
    # Original detector contours
    baseline_scores = 0.5 - forest.decision_function(x)
    queried = np.argsort(-baseline_scores)
    # logger.debug("baseline scores:%s\n%s" % (str(baseline_scores.shape), str(list(baseline_scores))))
    n_found = np.cumsum(y[queried[np.arange(budget)]])
    print(n_found)
    Z_if = 0.5 - forest.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z_if = Z_if.reshape(xx.shape)
    dp = DataPlotter(pdfpath=pdfpath_contours, rows=1, cols=1)
    pl = dp.get_next_plot()
    pl.contourf(xx, yy, Z_if, 20, cmap=plt.cm.get_cmap('jet'))
    dp.plot_points(x, pl, labels=y, lbl_color_map={0: "grey", 1: "red"})
    # overlay the instances queried within the budget
    dp.plot_points(matrix(x[queried[np.arange(budget)], :], nrow=budget),
                   pl, labels=y[queried[np.arange(budget)]], defaultcol="red",
                   lbl_color_map={0: "green", 1: "red"}, edgecolor="black",
                   marker=matplotlib.markers.MarkerStyle('o', fillstyle=None), s=35)
    # plot the sidebar
    anom_idxs = np.where(y[queried] == 1)[0]
    dash = 1 - (anom_idxs * 1.0 / x.shape[0])
    plot_sidebar(dash, dash_xy, dash_wh, pl)
    dp.close()
def get_tau_ranked_instance(self, x, w, tau_rank):
    """Returns the instance at rank tau_rank when sorted by decreasing score."""
    s = self.get_score(x, w)
    ps = order(s, decreasing=True)[tau_rank]
    return matrix(x[ps, :], nrow=1)
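

# A numpy-only equivalent of get_tau_ranked_instance() (a sketch; the repo's
# order(s, decreasing=True) helper is assumed to behave like np.argsort(-s)):
def _tau_ranked_sketch(x, scores, tau_rank):
    ps = np.argsort(-scores)[tau_rank]  # index of the tau_rank-th highest score
    return x[ps, :].reshape((1, -1))    # as a 1 x d row, like matrix(..., nrow=1)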
def main():
    if False:  # DEBUG
        args = prepare_forest_aad_debug_args()
    else:  # PRODUCTION
        args = get_command_args(debug=False)
    # print("log file: %s" % args.log_file)
    configure_logger(args)

    opts = Opts(args)
    # print(opts.str_opts())
    logger.debug(opts.str_opts())

    if not opts.streaming:
        raise ValueError("Only streaming supported")

    X_full, y_full = read_data(opts)
    # X_train = X_train[0:10, :]
    # labels = labels[0:10]
    logger.debug("loaded file: (%s) %s" % (str(X_full.shape), opts.datafile))
    logger.debug("results dir: %s" % opts.resultsdir)

    all_num_seen = None
    all_num_seen_baseline = None
    all_window = None
    all_window_baseline = None

    aucs = np.zeros(0, dtype=float)

    opts.fid = 1
    for runidx in opts.get_runidxs():
        tm_run = Timer()
        opts.set_multi_run_options(opts.fid, runidx)

        stream = DataStream(X_full, y_full)
        X_train, y_train = stream.read_next_from_stream(opts.stream_window)
        # logger.debug("X_train:\n%s\nlabels:\n%s" % (str(X_train), str(list(labels))))

        model = prepare_aad_model(X_train, y_train, opts)  # initial model training
        sad = StreamingAnomalyDetector(stream, model,
                                       unlabeled_x=X_train, unlabeled_y=y_train,
                                       max_buffer=opts.stream_window, opts=opts)
        sad.init_query_state(opts)

        if False:
            # use for DEBUG only
            run_feedback(sad, 0, opts.budget, opts)
            print("This is experimental/demo code for streaming integration and will be application specific. " +
                  "Exiting after reading max %d instances from stream and iterating for %d feedback..." %
                  (opts.stream_window, opts.budget))
            exit(0)

        all_scores = np.zeros(0)
        all_y = np.zeros(0, dtype=int)

        scores = sad.get_anomaly_scores(X_train)
        # auc = fn_auc(cbind(y_train, -scores))
        all_scores = np.append(all_scores, scores)
        all_y = np.append(all_y, y_train)

        iter = 0
        seen = np.zeros(0, dtype=int)
        seen_baseline = np.zeros(0, dtype=int)
        stream_window_tmp = np.zeros(0, dtype=int)
        stream_window_baseline = np.zeros(0, dtype=int)
        stop_iter = False
        while not stop_iter:
            iter += 1
            tm = Timer()

            seen_, seen_baseline_, queried_, queried_baseline_ = run_feedback(
                sad, opts.min_feedback_per_window, opts.max_feedback_per_window, opts)
            seen = append(seen, seen_)
            seen_baseline = append(seen_baseline, seen_baseline_)
            stream_window_tmp = append(stream_window_tmp, np.ones(len(seen_)) * iter)
            stream_window_baseline = append(stream_window_baseline,
                                            np.ones(len(seen_baseline_)) * iter)
            # queried = append(queried, queried_)
            # queried_baseline = append(queried_baseline, queried_baseline_)
            # logger.debug("seen:\n%s;\nbaseline:\n%s" % (str(list(seen)), str(list(seen_baseline))))

            x_eval, y_eval = sad.get_next_from_stream(sad.max_buffer)
            if x_eval is None or iter >= opts.max_windows:
                if iter >= opts.max_windows:
                    logger.debug("Exceeded %d iters; exiting stream read..." % opts.max_windows)
                stop_iter = True
            else:
                scores = sad.get_anomaly_scores(x_eval)  # compute scores before updating the model
                all_scores = np.append(all_scores, scores)
                all_y = np.append(all_y, y_eval)
                if opts.allow_stream_update:
                    sad.update_model_from_buffer()
                sad.move_buffer_to_unlabeled()

            logger.debug(tm.message("Stream window [%d]: algo [%d/%d]; baseline [%d/%d]: " %
                                    (iter, np.sum(seen), len(seen),
                                     np.sum(seen_baseline), len(seen_baseline))))

        auc = fn_auc(cbind(all_y, -all_scores))
        # logger.debug("AUC: %f" % auc)
        aucs = append(aucs, [auc])

        # queried_baseline = order(all_scores, decreasing=True)[0:opts.budget]
        num_seen_tmp = np.cumsum(seen)  # np.cumsum(all_y[queried])
        # logger.debug("\nnum_seen: %s" % (str(list(num_seen_tmp)),))

        num_seen_baseline = np.cumsum(seen_baseline)  # np.cumsum(all_y[queried_baseline])
        # logger.debug("Numseen in %d budget (overall):\n%s" % (opts.budget, str(list(num_seen_baseline))))

        stream_window_baseline = append(np.array([opts.fid, opts.runidx],
                                                 dtype=stream_window_baseline.dtype),
                                        stream_window_baseline)
        stream_window = np.ones(len(stream_window_baseline) + 2,
                                dtype=stream_window_tmp.dtype) * -1
        stream_window[0:2] = [opts.fid, opts.runidx]
        stream_window[2:(2 + len(stream_window_tmp))] = stream_window_tmp

        # queried = append(np.array([opts.fid, opts.runidx], dtype=queried.dtype), queried)
        # queried_baseline = append(np.array([opts.fid, opts.runidx], dtype=queried_baseline.dtype), queried_baseline)

        # num_seen_baseline has the uniformly maximum number of queries.
        # the number of queries in num_seen will vary under the query confidence mode
        num_seen = np.ones(len(num_seen_baseline) + 2, dtype=num_seen_tmp.dtype) * -1
        num_seen[0:2] = [opts.fid, opts.runidx]
        num_seen[2:(2 + len(num_seen_tmp))] = num_seen_tmp
        num_seen_baseline = append(np.array([opts.fid, opts.runidx],
                                            dtype=num_seen_baseline.dtype),
                                   num_seen_baseline)

        # all_queried = rbind(all_queried, matrix(queried, nrow=1))
        # all_queried_baseline = rbind(all_queried_baseline, matrix(queried_baseline, nrow=1))

        all_num_seen = rbind(all_num_seen, matrix(num_seen, nrow=1))
        all_num_seen_baseline = rbind(all_num_seen_baseline,
                                      matrix(num_seen_baseline, nrow=1))
        all_window = rbind(all_window, matrix(stream_window, nrow=1))
        all_window_baseline = rbind(all_window_baseline,
                                    matrix(stream_window_baseline, nrow=1))

        logger.debug(tm_run.message("Completed runidx: %d" % runidx))

    results = SequentialResults(num_seen=all_num_seen,
                                # true_queried_indexes=all_queried,
                                num_seen_baseline=all_num_seen_baseline,
                                # true_queried_indexes_baseline=all_queried_baseline,
                                stream_window=all_window,
                                stream_window_baseline=all_window_baseline,
                                aucs=aucs)
    write_sequential_results_to_csv(results, opts)
def get_gp_predictions_ext(x, y, ranked_indexes, orig_train=None, orig_test=None,
                           queried_indexes=None, test_set=None,
                           n_train=100, n_test=20, n_closest=9):
    s = 0.005  # noise variance

    n_train = min(n_train, x.shape[0])
    train_indexes_all = SetList(ranked_indexes[np.arange(n_train)])
    if False:
        logger.debug("all train indexes:\n%s" % str(list(train_indexes_all)))

    # If a separate test set has *not* been provided, then it is the
    # leave-one-out case where we compute variance for each test instance
    # by first training on other instances and then computing the mean
    # and variance for the left-out test instance.
    leave_one_out = test_set is None

    test_indexes_all = np.array(train_indexes_all)
    if queried_indexes is not None:
        # test instances can only be unlabeled instances
        test_indexes_all = np.array(SetList(train_indexes_all) - SetList(queried_indexes))

    L = None
    if leave_one_out:
        pred_indexes = np.arange(n_test)
        test_indexes_all = test_indexes_all[np.arange(n_test)]
        if False:
            logger.debug("all test indexes:%d\n%s" % (n_test, str(list(test_indexes_all))))
        y_pred = None  # np.zeros(len(pred_indexes))
        v_pred = np.ones(len(pred_indexes)) * 0.5
    else:
        # all indexes from test_set that are closest to any unlabeled instances
        closest_indexes = set()
        for i in range(n_test):
            test_index = test_indexes_all[i]
            if orig_train is not None and orig_test is not None:
                get_closest_indexes(matrix(orig_train[test_index, :], nrow=1),
                                    orig_test, num=n_closest, dest_set=closest_indexes)
            else:
                get_closest_indexes(x[test_index, :], test_set,
                                    num=n_closest, dest_set=closest_indexes)
        pred_indexes = np.array(list(closest_indexes))
        if False:
            logger.debug("pred indexes:\n%s" % str(list(pred_indexes)))
        y_pred = None  # np.zeros(test_set.shape[0])
        v_pred = np.ones(test_set.shape[0]) * 0.5

    n_pred = len(pred_indexes)
    logger.debug("Leave-one-out: %s, n_pred: %d" % (str(leave_one_out), n_pred))
    tm = Timer()
    for cnt, i in enumerate(pred_indexes):
        # pick one test instance
        if leave_one_out:
            # this is the leave-one-out case
            test_index = test_indexes_all[i]
            x_test = x[test_index, :]
            # exclude the test instance from the training
            train_indexes = np.array(train_indexes_all - SetList([test_index]))
        else:
            x_test = test_set[i, :]
            train_indexes = train_indexes_all
        x_train = x[train_indexes, :]
        # y_train = y[train_indexes] + s*np.random.randn(len(train_indexes))
        if leave_one_out or cnt == 0:
            # the kernel matrix needs to be recomputed in each iteration
            # for the leave-one-out case, else it should be computed only once
            K = kernel(x_train, x_train)
            L = np.linalg.cholesky(K + s * np.eye(K.shape[0]))
            logger.debug("K:\n%s" % str(K))

        # compute the mean at our test points
        Lk = np.linalg.solve(L, kernel(x_train, x_test))
        # y_pred[i] = np.dot(Lk.T, np.linalg.solve(L, y_train))

        # compute the variance at our test points
        K_ = kernel(x_test, x_test)
        if (cnt + 1) % 200 == 0:
            logger.debug("Test Kernel (%d, %d)" % (K_.shape[0], K_.shape[1]))
        s2 = np.diag(K_) - np.sum(Lk ** 2, axis=0)
        v_pred[i] = np.sqrt(s2)
    tm.end()
    logger.debug(tm.message("Time for GP computation:"))
    if False:
        if y_pred is not None:
            logger.debug("predicted means:\n%s" % str(list(y_pred)))
        logger.debug("predicted variances:\n%s" % str(list(v_pred)))
    return y_pred, v_pred, train_indexes_all, test_indexes_all[np.arange(n_test)]
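

# The loop above is the standard Cholesky-based GP posterior variance: with
# K = k(X, X), k_* = k(X, x_*), and noise s, the predictive variance is
#   var(x_*) = k(x_*, x_*) - k_*^T (K + s I)^{-1} k_*,
# computed stably as diag(K_) - sum(Lk**2), where L L^T = K + s I and
# Lk = L^{-1} k_*. A self-contained numpy sketch follows; this repo's kernel()
# helper is assumed to be RBF-like, and the rbf() below is illustrative only:
def _gp_variance_sketch(x_train, x_test, length_scale=20.0, noise=0.005):
    def rbf(a, b):
        # squared euclidean distances between all rows of a and b
        d2 = (np.sum(a ** 2, axis=1)[:, None]
              + np.sum(b ** 2, axis=1)[None, :]
              - 2. * a.dot(b.T))
        return np.exp(-d2 / (2. * length_scale ** 2))
    K = rbf(x_train, x_train)
    L = np.linalg.cholesky(K + noise * np.eye(K.shape[0]))
    Lk = np.linalg.solve(L, rbf(x_train, x_test))
    s2 = np.diag(rbf(x_test, x_test)) - np.sum(Lk ** 2, axis=0)
    return np.sqrt(np.maximum(s2, 0.))  # predictive std dev per test row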
def get_gp_predictions(x, y, ordered_indexes, queried_indexes=None,
                       n_train=100, n_test=20, length_scale=20,
                       orig_x=None, eval_set=None, orig_eval_set=None, n_closest=9):
    s = 0.005  # noise variance

    top_ranked_indexes = ordered_indexes[np.arange(max(n_train, len(queried_indexes)) + n_test)]
    train, test = get_gp_train_test(top_ranked_indexes, queried_indexes, n_train, n_test)
    n_train = len(train)  # this value might be different from the input
    n_test = len(test)
    # logger.debug("train indexes:\n%s\ntest indexes:\n%s" % (str(list(train)), str(list(test))))

    y_pred = None  # np.zeros(len(pred_indexes))
    v_pred = np.ones(len(test)) * 0.5

    x_train = x[train, :]
    # y_train = y[train_indexes] + s*np.random.randn(len(train_indexes))
    K = kernel(x_train, x_train, length_scale=length_scale)
    L = np.linalg.cholesky(K + s * np.eye(K.shape[0]))
    # logger.debug("K:\n%s" % str(K))

    tm = Timer()
    for i, idx in enumerate(test):
        x_test = x[idx, :]

        # compute the mean at our test points
        Lk = np.linalg.solve(L, kernel(x_train, x_test, length_scale=length_scale))
        # y_pred[i] = np.dot(Lk.T, np.linalg.solve(L, y_train))

        # compute the variance at our test points
        K_ = kernel(x_test, x_test, length_scale=length_scale)
        if (i + 1) % 200 == 0:
            logger.debug("Test Kernel (%d, %d)" % (K_.shape[0], K_.shape[1]))
        s2 = np.diag(K_) - np.sum(Lk ** 2, axis=0)
        v_pred[i] = np.sqrt(s2)
    tm.end()
    logger.debug(tm.message("Time for GP computation on test set:"))
    if False:
        if y_pred is not None:
            logger.debug("predicted means:\n%s" % str(list(y_pred)))
        logger.debug("predicted variances:\n%s" % str(list(v_pred)))

    v_eval = None
    if eval_set is not None:
        tm = Timer()
        # all indexes from eval_set that are closest to any unlabeled instances
        closest_indexes = set()
        for i in range(n_test):
            test_index = test[i]
            if orig_x is not None and orig_eval_set is not None:
                get_closest_indexes(matrix(orig_x[test_index, :], nrow=1),
                                    orig_eval_set, num=n_closest, dest_set=closest_indexes)
            else:
                get_closest_indexes(x[test_index, :], eval_set,
                                    num=n_closest, dest_set=closest_indexes)
        v_eval = np.ones(eval_set.shape[0], dtype=float) * 0.5
        for i, idx in enumerate(closest_indexes):
            x_test = eval_set[idx, :]

            # compute the mean at our test points
            Lk = np.linalg.solve(L, kernel(x_train, x_test, length_scale=length_scale))
            # y_pred[i] = np.dot(Lk.T, np.linalg.solve(L, y_train))

            # compute the variance at our test points
            K_ = kernel(x_test, x_test, length_scale=length_scale)
            if (i + 1) % 200 == 0:
                logger.debug("Test Kernel (%d, %d)" % (K_.shape[0], K_.shape[1]))
            s2 = np.diag(K_) - np.sum(Lk ** 2, axis=0)
            v_eval[idx] = np.sqrt(s2)
        logger.debug(tm.message("Time for GP computation on eval set:"))

    return y_pred, v_pred, train, test, v_eval
def plot_aad_gp(x, y, x_forest, xx, yy, forest, metrics, outputdir, dash_xy, dash_wh):
    # use this to plot the AAD feedback
    x_test = np.c_[xx.ravel(), yy.ravel()]
    x_test_forest = forest.transform_to_region_features(x_test, dense=False)
    queried = np.array(metrics.queried)
    for i, q in enumerate(queried):
        pdfpath = "%s/gp_iter_%02d.pdf" % (outputdir, i)
        dp = DataPlotter(pdfpath=pdfpath, rows=1, cols=1)
        pl = dp.get_next_plot()
        w = metrics.all_weights[i, :]
        s_train = forest.get_score(x_forest, w)
        ranked_indexes = np.argsort(-s_train)
        # s_test = forest.get_score(x_test_forest, w)
        gp_eval_set = x_test_forest
        gp_score, gp_var, train_indexes, test_indexes, v_eval = \
            get_gp_predictions(x=x_forest, y=s_train, orig_x=x,
                               ordered_indexes=ranked_indexes,
                               queried_indexes=queried,
                               n_train=100, n_test=30, length_scale=40,
                               eval_set=gp_eval_set, orig_eval_set=x_test,
                               n_closest=9)
        logger.debug("gp_var:\n%s\ntest_indexes:\n%s" %
                     (str(list(gp_var)), str(list(test_indexes))))
        if gp_eval_set is not None:
            Z = v_eval.reshape(xx.shape)
            levels = np.linspace(0., 1., 20)
            CS = pl.contourf(xx, yy, Z, levels, cmap=plt.cm.get_cmap('jet'))
            cbar = plt.colorbar(CS)
            cbar.ax.set_ylabel('score variance')
        dp.plot_points(x, pl, labels=y, lbl_color_map={0: "grey", 1: "red"}, s=25)
        dp.plot_points(x[train_indexes, :], pl, marker='o', defaultcol='blue',
                       s=35, edgecolor='blue', facecolors='none')
        dp.plot_points(x[test_indexes, :], pl, marker='o', defaultcol='magenta',
                       s=60, edgecolor='magenta', facecolors='none')
        # overlay the instances queried so far
        dp.plot_points(matrix(x[queried[np.arange(i + 1)], :], nrow=i + 1),
                       pl, labels=y[queried[np.arange(i + 1)]], defaultcol="red",
                       lbl_color_map={0: "green", 1: "red"}, edgecolor="black",
                       marker=matplotlib.markers.MarkerStyle('o', fillstyle=None), s=35)
        # plot the sidebar
        anom_scores = forest.get_score(x_forest, w)
        anom_order = np.argsort(-anom_scores)
        anom_idxs = np.where(y[anom_order] == 1)[0]
        dash = 1 - (anom_idxs * 1.0 / x.shape[0])
        plot_sidebar(dash, dash_xy, dash_wh, pl)
        dp.close()
def plot_aad_score_var(x, y, x_forest, xx, yy, forest, metrics, outputdir, dash_xy, dash_wh):
    # use this to plot the AAD feedback
    x_test = np.c_[xx.ravel(), yy.ravel()]
    x_test_forest = forest.transform_to_region_features(x_test, dense=False)
    queried = np.array(metrics.queried)
    for i, q in enumerate(queried):
        pdfpath = "%s/score_iter_%02d.pdf" % (outputdir, i)
        dp = DataPlotter(pdfpath=pdfpath, rows=1, cols=1)
        pl = dp.get_next_plot()
        w = metrics.all_weights[i, :]
        s_train = forest.get_score(x_forest, w)
        ranked_indexes = np.argsort(-s_train)
        # s_test = forest.get_score(x_test_forest, w)
        test_indexes = metrics.test_indexes[i]
        score_eval_set = x_test_forest
        score_mean, score_var, test_indexes, v_eval, _ = \
            get_score_variances(x=x_forest, w=w,
                                n_test=len(test_indexes) if test_indexes is not None else 10,
                                ordered_indexes=ranked_indexes,
                                queried_indexes=queried,
                                test_indexes=test_indexes,
                                eval_set=score_eval_set, n_closest=9)
        qpos = np.argmax(score_var)
        q = test_indexes[qpos]
        logger.debug("score_var:\n%s\ntest_indexes:\n%s" %
                     (str(list(score_var)), str(list(test_indexes))))
        logger.debug("qpos: %d, query instance: %d, var: %f, queried:%s" %
                     (qpos, q, score_var[qpos], str(list(queried[np.arange(i)]))))
        if score_eval_set is not None:
            Z = v_eval.reshape(xx.shape)
            levels = np.linspace(np.min(v_eval), np.max(v_eval), 20)
            CS = pl.contourf(xx, yy, Z, levels, cmap=plt.cm.get_cmap('jet'))
            cbar = plt.colorbar(CS)
            cbar.ax.set_ylabel('score variance')
        dp.plot_points(x, pl, labels=y, lbl_color_map={0: "grey", 1: "red"}, s=25)
        dp.plot_points(x[test_indexes, :], pl, marker='o', defaultcol='magenta',
                       s=60, edgecolor='magenta', facecolors='none')
        dp.plot_points(matrix(x[queried[np.arange(i + 1)], :], nrow=i + 1),
                       pl, labels=y[queried[np.arange(i + 1)]], defaultcol="red",
                       lbl_color_map={0: "green", 1: "red"},
                       edgecolor=None, facecolors=True,
                       marker=matplotlib.markers.MarkerStyle('o', fillstyle=None), s=35)
        # plot the sidebar
        anom_scores = forest.get_score(x_forest, w)
        anom_order = np.argsort(-anom_scores)
        anom_idxs = np.where(y[anom_order] == 1)[0]
        dash = 1 - (anom_idxs * 1.0 / x.shape[0])
        plot_sidebar(dash, dash_xy, dash_wh, pl)
        dp.close()