Code Example #1
File: data_plotter.py  Project: sanowi/pyaad
import numpy as np
import matplotlib.pyplot as plt


def plot_sidebar(hts, dash_xy, dash_wh, pl):
    # Draw a white sidebar rectangle on the axes `pl` and mark the
    # normalized heights `hts` (values in [0, 1]) as red dots along it.
    pl.add_patch(
        plt.Rectangle(dash_xy,
                      dash_wh[0],
                      dash_wh[1],
                      facecolor='white',
                      edgecolor='black',
                      alpha=1))
    # cbind is an R-style project helper, not NumPy; see the sketch below.
    dash_pts = cbind(
        np.ones(len(hts), dtype=float) * (dash_wh[0] / 2) + dash_xy[0],
        hts * dash_wh[1] + dash_xy[1])
    pl.plot(dash_pts[:, 0],
            dash_pts[:, 1],
            'ro',
            markersize=3,
            markerfacecolor='red')
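
The examples on this page lean on a handful of R-style helpers (cbind, rbind, matrix, append) that are defined in the project itself rather than in NumPy. Below is a minimal sketch of the behavior the examples appear to assume; the project's own implementations may be more general (e.g., handling 2-D inputs).

import numpy as np

def cbind(a, b):
    # Column-bind two equal-length 1-D arrays into an (n, 2) matrix.
    return np.column_stack((a, b))

def rbind(m1, m2):
    # Row-bind two matrices; passing None for m1 starts a new matrix.
    return m2 if m1 is None else np.vstack((m1, m2))

def matrix(v, nrow=1):
    # Reshape a 1-D sequence into a matrix with the given number of rows.
    return np.reshape(np.asarray(v), (nrow, -1))

def append(a, b):
    # Concatenate two sequences into a single 1-D array.
    return np.concatenate((np.asarray(a), np.asarray(b)))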
Code Example #2
    def aad_learn_ensemble_weights_with_budget(self, ensemble, opts):

        if opts.budget == 0:
            return None

        x = ensemble.scores
        y = ensemble.labels

        n, m = x.shape
        bt = get_budget_topK(n, opts)

        metrics = get_alad_metrics_structure(opts.budget, opts)
        ha = []
        hn = []
        xis = []

        qstate = Query.get_initial_query_state(opts.qtype,
                                               opts=opts,
                                               qrank=bt.topK,
                                               a=1.,
                                               b=1.,
                                               budget=bt.budget)

        metrics.all_weights = np.zeros(shape=(opts.budget, m))

        w_unif_prior = self.get_uniform_weights(m)
        if self.w is None:
            self.w = w_unif_prior

        for i in range(bt.budget):

            starttime_iter = timer()

            # save the weights in each iteration for later analysis
            metrics.all_weights[i, :] = self.w
            metrics.queried = xis  # xis keeps growing with each feedback iteration

            order_anom_idxs, anom_score = self.order_by_score(x, self.w)

            # Metrics collection is switched off here via the leading False
            # (compare the enabled block in Code Example #4).
            if False and y is not None and metrics is not None:
                # gather AUC metrics
                metrics.train_aucs[0, i] = fn_auc(cbind(y, -anom_score))

                # gather Precision metrics
                prec = fn_precision(cbind(y, -anom_score), opts.precision_k)
                metrics.train_aprs[0, i] = prec[len(opts.precision_k) + 1]
                train_n_at_top = get_anomalies_at_top(-anom_score, y,
                                                      opts.precision_k)
                for k in range(len(opts.precision_k)):
                    metrics.train_precs[k][0, i] = prec[k]
                    metrics.train_n_at_top[k][0, i] = train_n_at_top[k]

            xi_ = qstate.get_next_query(maxpos=n,
                                        ordered_indexes=order_anom_idxs,
                                        queried_items=xis,
                                        x=x,
                                        lbls=y,
                                        y=anom_score,
                                        w=self.w,
                                        hf=append(ha, hn),
                                        remaining_budget=opts.budget - i)
            # logger.debug("xi: %d" % (xi,))
            xi = xi_[0]
            xis.append(xi)
            metrics.test_indexes.append(qstate.test_indexes)

            if opts.single_inst_feedback:
                # Forget the previous feedback instances and
                # use only the current feedback for weight updates
                ha = []
                hn = []

            if y[xi] == 1:
                ha.append(xi)
            else:
                hn.append(xi)

            qstate.update_query_state(rewarded=(y[xi] == 1))

            if opts.batch:
                # Use the original (uniform) weights as prior.
                # This is an experimental option ...
                self.w = w_unif_prior  # reset weights to the uniform prior
                hf = order_anom_idxs[0:i]
                # Convert to lists so ha.append()/hn.append() keep working
                # on later iterations.
                ha = hf[np.where(y[hf] == 1)[0]].tolist()
                hn = hf[np.where(y[hf] == 0)[0]].tolist()

            self.update_weights(x, y, ha=ha, hn=hn, opts=opts, w=self.w)

            if np.mod(i, 1) == 0:
                endtime_iter = timer()
                tdiff = difftime(endtime_iter, starttime_iter, units="secs")
                logger.debug(
                    "Completed [%s] fid %d rerun %d feedback %d in %f sec(s)" %
                    (opts.dataset, opts.fid, opts.runidx, i, tdiff))

        return metrics
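
The loop above ranks all instances by the current weighted score before asking the query state for the next instance to label. order_by_score is a project method; below is a minimal, self-contained sketch under the usual AAD assumption of a linear score (score = x . w), most anomalous first. The toy data here is illustrative only.

import numpy as np

def order_by_score(x, w):
    # Weighted anomaly score per instance; highest (most anomalous) first.
    anom_score = x.dot(w)
    order_anom_idxs = np.argsort(-anom_score)
    return order_anom_idxs, anom_score

# Tiny self-check: 4 instances x 3 ensemble members.
x = np.array([[0.9, 0.8, 0.7],
              [0.1, 0.2, 0.1],
              [0.5, 0.4, 0.6],
              [0.2, 0.9, 0.3]])
w = np.ones(3) / np.sqrt(3.0)  # unit-norm uniform weights
idxs, scores = order_by_score(x, w)
print(idxs)  # [0 2 3 1]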
Code Example #3
def main():

    if False:
        # DEBUG
        args = prepare_forest_aad_debug_args()
    else:
        # PRODUCTION
        args = get_command_args(debug=False)
    # print "log file: %s" % args.log_file
    configure_logger(args)

    opts = Opts(args)
    # print opts.str_opts()
    logger.debug(opts.str_opts())

    if not opts.streaming:
        raise ValueError("Only streaming supported")

    X_full, y_full = read_data(opts)
    # X_train = X_train[0:10, :]
    # labels = labels[0:10]

    logger.debug("loaded file: (%s) %s" % (str(X_full.shape), opts.datafile))
    logger.debug("results dir: %s" % opts.resultsdir)

    all_num_seen = None
    all_num_seen_baseline = None
    all_window = None
    all_window_baseline = None

    aucs = np.zeros(0, dtype=float)

    opts.fid = 1
    for runidx in opts.get_runidxs():
        tm_run = Timer()
        opts.set_multi_run_options(opts.fid, runidx)

        stream = DataStream(X_full, y_full)
        X_train, y_train = stream.read_next_from_stream(opts.stream_window)

        # logger.debug("X_train:\n%s\nlabels:\n%s" % (str(X_train), str(list(labels))))

        model = prepare_aad_model(X_train, y_train,
                                  opts)  # initial model training
        sad = StreamingAnomalyDetector(stream,
                                       model,
                                       unlabeled_x=X_train,
                                       unlabeled_y=y_train,
                                       max_buffer=opts.stream_window,
                                       opts=opts)
        sad.init_query_state(opts)

        if False:
            # use for DEBUG only
            run_feedback(sad, 0, opts.budget, opts)
            print "This is experimental/demo code for streaming integration and will be application specific." + \
                  " Exiting after reading max %d instances from stream and iterating for %d feedback..." % \
                    (opts.stream_window, opts.budget)
            exit(0)

        all_scores = np.zeros(0)
        all_y = np.zeros(0, dtype=int)

        scores = sad.get_anomaly_scores(X_train)
        # auc = fn_auc(cbind(y_train, -scores))
        all_scores = np.append(all_scores, scores)
        all_y = np.append(all_y, y_train)
        iter = 0
        seen = np.zeros(0, dtype=int)
        seen_baseline = np.zeros(0, dtype=int)
        stream_window_tmp = np.zeros(0, dtype=int)
        stream_window_baseline = np.zeros(0, dtype=int)
        stop_iter = False
        while not stop_iter:
            iter += 1

            tm = Timer()
            seen_, seen_baseline_, queried_, queried_baseline_ = run_feedback(
                sad, opts.min_feedback_per_window,
                opts.max_feedback_per_window, opts)
            seen = append(seen, seen_)
            seen_baseline = append(seen_baseline, seen_baseline_)
            stream_window_tmp = append(stream_window_tmp,
                                       np.ones(len(seen_)) * iter)
            stream_window_baseline = append(
                stream_window_baseline,
                np.ones(len(seen_baseline_)) * iter)
            # queried = append(queried, queried_)
            # queried_baseline = append(queried_baseline, queried_baseline_)
            # logger.debug("seen:\n%s;\nbaseline:\n%s" % (str(list(seen)), str(list(seen_baseline))))

            x_eval, y_eval = sad.get_next_from_stream(sad.max_buffer)
            if x_eval is None or iter >= opts.max_windows:
                if iter >= opts.max_windows:
                    logger.debug("Exceeded %d iters; exiting stream read..." %
                                 opts.max_windows)
                stop_iter = True
            else:
                scores = sad.get_anomaly_scores(
                    x_eval)  # compute scores before updating the model

                all_scores = np.append(all_scores, scores)
                all_y = np.append(all_y, y_eval)

                if opts.allow_stream_update:
                    sad.update_model_from_buffer()

                sad.move_buffer_to_unlabeled()

            logger.debug(
                tm.message(
                    "Stream window [%d]: algo [%d/%d]; baseline [%d/%d]: " %
                    (iter, np.sum(seen), len(seen), np.sum(seen_baseline),
                     len(seen_baseline))))

        auc = fn_auc(cbind(all_y, -all_scores))
        # logger.debug("AUC: %f" % auc)
        aucs = append(aucs, [auc])

        # queried_baseline = order(all_scores, decreasing=True)[0:opts.budget]
        num_seen_tmp = np.cumsum(seen)  # np.cumsum(all_y[queried])
        # logger.debug("\nnum_seen    : %s" % (str(list(num_seen_tmp)),))

        num_seen_baseline = np.cumsum(
            seen_baseline)  # np.cumsum(all_y[queried_baseline])
        # logger.debug("Numseen in %d budget (overall):\n%s" % (opts.budget, str(list(num_seen_baseline))))

        stream_window_baseline = append(
            np.array([opts.fid, opts.runidx],
                     dtype=stream_window_baseline.dtype),
            stream_window_baseline)
        stream_window = np.ones(len(stream_window_baseline) + 2,
                                dtype=stream_window_tmp.dtype) * -1
        stream_window[0:2] = [opts.fid, opts.runidx]
        stream_window[2:(2 + len(stream_window_tmp))] = stream_window_tmp

        # queried = append(np.array([opts.fid, opts.runidx], dtype=queried.dtype), queried)
        # queried_baseline = append(np.array([opts.fid, opts.runidx], dtype=queried_baseline.dtype), queried_baseline)

        # num_seen_baseline has the uniformly maximum number of queries.
        # the number of queries in num_seen will vary under the query confidence mode
        num_seen = np.ones(len(num_seen_baseline) + 2,
                           dtype=num_seen_tmp.dtype) * -1
        num_seen[0:2] = [opts.fid, opts.runidx]
        num_seen[2:(2 + len(num_seen_tmp))] = num_seen_tmp

        num_seen_baseline = append(
            np.array([opts.fid, opts.runidx], dtype=num_seen_baseline.dtype),
            num_seen_baseline)

        # all_queried = rbind(all_queried, matrix(queried, nrow=1))
        # all_queried_baseline = rbind(all_queried_baseline, matrix(queried_baseline, nrow=1))

        all_num_seen = rbind(all_num_seen, matrix(num_seen, nrow=1))
        all_num_seen_baseline = rbind(all_num_seen_baseline,
                                      matrix(num_seen_baseline, nrow=1))
        all_window = rbind(all_window, matrix(stream_window, nrow=1))
        all_window_baseline = rbind(all_window_baseline,
                                    matrix(stream_window_baseline, nrow=1))

        logger.debug(tm_run.message("Completed runidx: %d" % runidx))

    results = SequentialResults(
        num_seen=all_num_seen,
        # true_queried_indexes=all_queried,
        num_seen_baseline=all_num_seen_baseline,
        # true_queried_indexes_baseline=all_queried_baseline,
        stream_window=all_window,
        stream_window_baseline=all_window_baseline,
        aucs=aucs)
    write_sequential_results_to_csv(results, opts)
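
main() drives the run through a DataStream that serves the data in fixed-size windows. The real project class is not shown on this page; the following is a minimal sketch of the interface the example relies on, assuming read_next_from_stream returns (None, None) once the stream is exhausted and that the actual implementation adds buffering and bookkeeping on top.

import numpy as np

class DataStream(object):
    # Minimal sketch: serve (X, y) in fixed-size windows until exhausted.
    def __init__(self, X, y):
        self.X, self.y = X, y
        self.pos = 0

    def read_next_from_stream(self, n):
        if self.pos >= self.X.shape[0]:
            return None, None
        end = min(self.pos + n, self.X.shape[0])
        X_w = self.X[self.pos:end, :]
        y_w = self.y[self.pos:end]
        self.pos = end
        return X_w, y_w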
Code Example #4
    def aad_ensemble(self, ensemble, opts):

        if opts.budget == 0:
            return None

        x = ensemble.scores
        y = ensemble.labels

        n, m = x.shape
        bt = get_budget_topK(n, opts)

        metrics = get_alad_metrics_structure(opts.budget, opts)
        ha = []
        hn = []
        xis = []

        # Unit-norm uniform weights; equivalent to get_uniform_weights(m) below.
        w_unifprior = np.ones(m, dtype=float)
        w_unifprior = w_unifprior / np.sqrt(w_unifprior.dot(w_unifprior))
        # logger.debug("w_prior:")
        # logger.debug(w_unifprior)

        qstate = Query.get_initial_query_state(opts.qtype,
                                               opts=opts,
                                               qrank=bt.topK)

        metrics.all_weights = np.zeros(shape=(opts.budget, m))

        w_unif_prior = self.get_uniform_weights(m)
        if self.w is None:
            self.w = w_unif_prior

        for i in range(bt.budget):

            starttime_iter = timer()

            # save the weights in each iteration for later analysis
            metrics.all_weights[i, :] = self.w
            metrics.queried = xis  # xis keeps growing with each feedback iteration

            order_anom_idxs = self.order_by_score(x)

            # Metrics collection enabled here (the analogous block in
            # Code Example #2 is switched off).
            if True:
                anom_score = self.get_score(x, self.w)
                # gather AUC metrics
                metrics.train_aucs[0, i] = fn_auc(cbind(y, -anom_score))

                # gather Precision metrics
                prec = fn_precision(cbind(y, -anom_score), opts.precision_k)
                metrics.train_aprs[0, i] = prec[len(opts.precision_k) + 1]
                train_n_at_top = get_anomalies_at_top(-anom_score, y,
                                                      opts.precision_k)
                for k in range(len(opts.precision_k)):
                    metrics.train_precs[k][0, i] = prec[k]
                    metrics.train_n_at_top[k][0, i] = train_n_at_top[k]

            xi_ = qstate.get_next_query(maxpos=n,
                                        ordered_indexes=order_anom_idxs,
                                        queried_items=xis,
                                        x=x,
                                        lbls=y,
                                        w=self.w,
                                        hf=append(ha, hn),
                                        remaining_budget=opts.budget - i)
            xi = xi_[0]
            # logger.debug("xi: %d" % (xi,))
            xis.append(xi)

            if opts.single_inst_feedback:
                # Forget the previous feedback instances and
                # use only the current feedback for weight updates
                ha = []
                hn = []

            if y[xi] == 1:
                ha.append(xi)
            else:
                hn.append(xi)

            qstate.update_query_state(rewarded=(y[xi] == 1))

            if opts.batch:
                # Use the original (uniform) weights as prior
                self.w = w_unif_prior
                hf = np.arange(i)
                # Convert to lists so ha.append()/hn.append() keep working
                # on later iterations.
                ha = hf[np.where(y[hf] == 1)[0]].tolist()
                hn = hf[np.where(y[hf] == 0)[0]].tolist()

            if opts.unifprior:
                w_prior = w_unif_prior
            else:
                w_prior = self.w

            tau_rel = opts.constrainttype == AAD_CONSTRAINT_TAU_INSTANCE
            if opts.detector_type == AAD_IFOREST:
                self.w = self.if_aad_weight_update(self.w,
                                                   x,
                                                   y,
                                                   hf=append(ha, hn),
                                                   w_prior=w_prior,
                                                   opts=opts,
                                                   tau_rel=tau_rel)
            elif opts.detector_type == ATGP_IFOREST:
                w_soln = weight_update_iter_grad(ensemble.scores,
                                                 ensemble.labels,
                                                 hf=append(ha, hn),
                                                 Ca=opts.Ca,
                                                 Cn=opts.Cn,
                                                 Cx=opts.Cx,
                                                 topK=bt.topK,
                                                 max_iters=1000)
                self.w = w_soln.w
            else:
                raise ValueError("Invalid weight update for IForest: %d" %
                                 opts.detector_type)
            # logger.debug("w_new:")
            # logger.debug(w_new)

            if np.mod(i, 1) == 0:
                endtime_iter = timer()
                tdiff = difftime(endtime_iter, starttime_iter, units="secs")
                logger.debug(
                    "Completed [%s] fid %d rerun %d feedback %d in %f sec(s)" %
                    (opts.dataset, opts.fid, opts.runidx, i, tdiff))

        return metrics
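
Both routines fall back to get_uniform_weights(m) when self.w has not been initialized. Given the w_unifprior computation at the top of this example, it is presumably the unit-norm uniform vector; here is a minimal sketch, written as a free function for brevity.

import numpy as np

def get_uniform_weights(m):
    # Uniform weights over m ensemble members, scaled to unit L2 norm
    # (the same computation as w_unifprior in Code Example #4).
    w = np.ones(m, dtype=float)
    return w / np.sqrt(w.dot(w))

print(get_uniform_weights(4))  # [0.5 0.5 0.5 0.5]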