def plot_decision_tree_descriptions(x, y, name):
    """Plot the decision-tree regions selected to describe the instances labeled 1.

    A ``DecisionTreeAadWrapper`` is built from ``(x, y)``; ``describe_instances``
    then selects regions for the rows where ``y == 1``, and those regions are
    drawn through ``plot_dataset`` under the plot name
    ``"<name>_decision_tree_descriptions"``.

    Args:
        x: instance matrix handed to the wrapper, the describer and the plotter.
        y: label array; entries equal to 1 mark the instances to describe.
        name: prefix used to build the plot name.

    NOTE(review): ``opts`` is not a parameter of this function; it is resolved
    from an enclosing (presumably module-level) scope at call time -- confirm
    it is always defined before this function is called.
    """
    positive_idxs = np.where(y == 1)[0]

    # Compact descriptions from the Decision Tree classifier; these just
    # correspond to the rules extracted from the tree structure.
    tree_model = DecisionTreeAadWrapper(x, y)
    ridxs_counts, _region_extents = describe_instances(
        x, positive_idxs, model=tree_model, opts=opts
    )

    summary = str(list(ridxs_counts))
    logger.debug(
        "selected decision tree region indexes and corresponding instance counts (among %d):\n%s"
        % (len(positive_idxs), summary)
    )

    # Each entry is a (region index, instance count) pair; only the index is plotted.
    selected_regions = [region for region, _count in ridxs_counts]
    plot_name = "%s_decision_tree_descriptions" % name
    plot_dataset(
        x,
        y,
        opts,
        test_points=None,
        name=plot_name,
        model=tree_model,
        region_indexes=selected_regions,
        legend=False,
    )
def train_classifier(x, y, opts, test_points, name, explain=False, interpretable=False):
    """Fit a random forest on ``(x, y)`` and plot its class-1 probability surface.

    The classifier's probability for class 1 is evaluated on a fixed 50x50 grid
    over [-4, 8] x [-4, 8] and drawn as filled contours in
    ``<opts.resultsdir>/<name>_classifier.pdf``, together with the training
    points, a sidebar showing where the instances labeled 1 fall in the score
    ranking, and ``test_points``.

    Args:
        x: training instances. The contour grid has two columns, so the plot
            presumably assumes two features per row -- TODO confirm.
        y: numpy label array aligned with ``x``; 1 marks the class drawn in red
            and ranked in the sidebar, 0 is drawn in grey.
        opts: options object; ``opts.resultsdir`` is read here and ``opts`` is
            forwarded to ``describe_instances`` / ``plot_dataset``.
        test_points: extra points highlighted in green on the plot.
        name: prefix for the output file and plot names.
        explain: when true, also extract region descriptions for the instances
            labeled 1 from the fitted forest and plot them.
        interpretable: forwarded to ``describe_instances`` when ``explain`` is set.
    """
    classifier = RFClassifier.fit(x, y, n_estimators=100, max_depth=None)

    # Score a regular grid so the probability surface can be contoured.
    xx, yy = np.meshgrid(np.linspace(-4, 8, 50), np.linspace(-4, 8, 50))
    x_test = np.c_[xx.ravel(), yy.ravel()]
    probs_grid = classifier.predict_prob_for_class(x_test, 1)
    Z = probs_grid.reshape(xx.shape)

    pdfpath = "%s/%s.pdf" % (opts.resultsdir, "%s_classifier" % name)
    logger.debug("Plotting classifier contours to %s", pdfpath)
    dp = DataPlotter(pdfpath=pdfpath, rows=1, cols=1)
    try:
        pl = dp.get_next_plot()
        # plt.get_cmap is used because matplotlib.cm.get_cmap no longer exists
        # in recent Matplotlib releases.
        pl.contourf(xx, yy, Z, 20, cmap=plt.get_cmap('jet'))

        dp.plot_points(x, pl, labels=y, lbl_color_map={0: "grey", 1: "red"}, s=25)

        # sidebar coordinates and dimensions for showing rank locations of true anomalies
        dash_xy = (-4.0, -2.0)  # bottom-left (x,y) coordinates
        dash_wh = (0.4, 8)  # width, height

        # Rank all instances by descending class-1 probability and convert the
        # rank of every instance labeled 1 into a fraction in (0, 1], where 1
        # corresponds to the top of the ranking.
        anom_scores = classifier.predict_prob_for_class(x, 1)
        anom_order = np.argsort(-anom_scores)
        anom_idxs = np.where(y[anom_order] == 1)[0]
        dash = 1 - (anom_idxs * 1.0 / x.shape[0])
        plot_sidebar(dash, dash_xy, dash_wh, pl)

        # Test points are drawn twice (cross, then circle) with the same styling
        # arguments as before.
        dp.plot_points(test_points, pl, marker='x', defaultcol='green', s=50, linewidths=2.)
        dp.plot_points(test_points, pl, marker='o', edgecolor='green', defaultcol='green',
                       s=60, linewidths=2.)
    finally:
        # Always close the plotter, even when a plotting call fails.
        dp.close()

    if explain:
        # generate compact descriptions from the Random Forest classifier
        rfre = RandomForestAadWrapper(x, y, classifier.clf)
        ha = np.where(y == 1)[0]
        ridxs_counts, _region_extents = describe_instances(
            x, ha, model=rfre, opts=opts, interpretable=interpretable
        )
        logger.debug(
            "selected random forest region indexes and corresponding instance counts (among %d):\n%s",
            len(ha), str(list(ridxs_counts)),
        )
        region_indexes = [region for region, _count in ridxs_counts]
        plot_dataset(x, y, opts, test_points=None,
                     name="%s_random_forest_descriptions" % name,
                     model=rfre, region_indexes=region_indexes, legend=False)
def train_anomaly_detector(x, y, opts, test_points, name, explain=False):
    """Fit an AAD model on ``x``, tune its weights with the labels, and plot its scores.

    The model from ``get_aad_model`` is fitted and initialised with uniform
    weights, then ``update_weights`` is applied ``opts.n_pretrain`` times using
    the full label vector. The weights with the highest AUC seen (ties go to
    the latest iteration) are restored before plotting. The score surface on a
    fixed 50x50 grid over [-4, 8] x [-4, 8] is written to
    ``<opts.resultsdir>/<name>_anomaly.pdf`` along with the training points, a
    sidebar of anomaly rank positions, and ``test_points``.

    Side effect: ``opts.tau`` is overwritten with the fraction of instances
    labeled 1 in ``y``.

    Args:
        x: training instances. The contour grid has two columns, so the plot
            presumably assumes two features per row -- TODO confirm.
        y: numpy label array aligned with ``x``; 1 marks anomalies, 0 nominals.
        opts: options object; ``randseed``, ``norm_unit``, ``n_pretrain`` and
            ``resultsdir`` are read, ``tau`` is written.
        test_points: extra points highlighted in green on the plot.
        name: prefix for the output file and plot names.
        explain: when true, also extract region descriptions for the anomalies
            from the trained model and plot them.
    """
    rng = np.random.RandomState(opts.randseed)

    # fit the model
    model = get_aad_model(x, opts, rng)
    model.fit(x)
    model.init_weights(INIT_UNIF)

    # train model with labeled examples
    x_transformed = model.transform_to_ensemble_features(
        x, dense=False, norm_unit=opts.norm_unit)
    ha = np.where(y == 1)[0]
    hn = np.where(y == 0)[0]

    # we know the true anomaly fraction from the fully labeled data
    opts.tau = len(ha) * 1.0 / len(y)

    auc = get_auc(model, x=x, y=y, x_transformed=x_transformed)
    logger.debug("AUC[0]: %f", auc)

    best_i = 0
    best_auc = auc
    # Snapshot the initial weights: keeping a bare reference would let later
    # in-place updates overwrite the candidate we may need to restore.
    best_w = np.copy(model.w)
    for i in range(opts.n_pretrain):
        model.update_weights(x_transformed, y, ha, hn, opts)
        auc = get_auc(model, x=x, y=y, x_transformed=x_transformed)
        logger.debug("AUC[%d]: %f", i + 1, auc)
        # '<=' deliberately prefers the most recent weights on ties.
        if best_auc <= auc:
            best_auc = auc
            best_w = np.copy(model.w)
            best_i = i + 1
    logger.debug("best_i: %d, best_auc: %f", best_i, best_auc)
    model.w = best_w

    pdfpath = "%s/%s.pdf" % (opts.resultsdir, "%s_anomaly" % name)
    logger.debug("Plotting aad contours to %s", pdfpath)
    dp = DataPlotter(pdfpath=pdfpath, rows=1, cols=1)
    try:
        pl = dp.get_next_plot()

        # Score a regular grid so the anomaly-score surface can be contoured.
        xx, yy = np.meshgrid(np.linspace(-4, 8, 50), np.linspace(-4, 8, 50))
        x_test = np.c_[xx.ravel(), yy.ravel()]
        x_test_transformed = model.transform_to_ensemble_features(
            x_test, dense=False, norm_unit=opts.norm_unit)
        Z = model.get_score(x_test_transformed)
        Z = Z.reshape(xx.shape)
        # plt.get_cmap is used because matplotlib.cm.get_cmap no longer exists
        # in recent Matplotlib releases.
        pl.contourf(xx, yy, Z, 20, cmap=plt.get_cmap('jet'))

        dp.plot_points(x, pl, labels=y, lbl_color_map={0: "grey", 1: "red"}, s=25)

        # sidebar coordinates and dimensions for showing rank locations of true anomalies
        dash_xy = (-4.0, -2.0)  # bottom-left (x,y) coordinates
        dash_wh = (0.4, 8)  # width, height

        # Rank all instances by descending score and convert the rank of every
        # anomaly into a fraction in (0, 1], where 1 is the top of the ranking.
        anom_scores = model.get_score(x_transformed)
        anom_order = np.argsort(-anom_scores)
        anom_idxs = np.where(y[anom_order] == 1)[0]
        dash = 1 - (anom_idxs * 1.0 / x.shape[0])
        plot_sidebar(dash, dash_xy, dash_wh, pl)

        # Test points are drawn twice (cross, then circle) with the same styling
        # arguments as before.
        dp.plot_points(test_points, pl, marker='x', defaultcol='green', s=50, linewidths=2.)
        dp.plot_points(test_points, pl, marker='o', edgecolor='green', defaultcol='green',
                       s=60, linewidths=2.)
    finally:
        # Always close the plotter, even when a plotting call fails.
        dp.close()

    if explain:
        ridxs_counts, _region_extents = describe_instances(
            x, np.array(ha), model=model, opts=opts
        )
        logger.debug(
            "selected region indexes and corresponding instance counts (among %d):\n%s",
            len(ha), str(list(ridxs_counts)),
        )
        region_indexes = [region for region, _count in ridxs_counts]
        plot_dataset(x, y, opts, test_points=None,
                     name="%s_anomaly_descriptions" % name,
                     model=model, region_indexes=region_indexes, legend=False)