# Example no. 1
# 0
def anomaly_visual_experiment_online(dataset, method, generate_synthetic_datasets_again, predict_anomaly_again, predict_using_threshold):
    """Run the online (multi-stage) visual anomaly-detection experiment on a 2-D toy dataset.

    The inliers are either generated (and saved) or loaded from
    ``./datasets/<dataset>/``; 45 uniformly drawn outliers in [-6, 6]^2 are
    appended, the data is split into 5 online stages, and after every stage
    two figures are shown: the predicted label map and the anomaly-score map
    over a 150 x 150 grid on [-7, 7]^2, each overlaid with the points seen so
    far.

    Adapted from
    https://scikit-learn.org/stable/auto_examples/plot_anomaly_comparison.html#sphx-glr-auto-examples-plot-anomaly-comparison-py

    Args:
        dataset: Dataset name. When generating, one of "two_moons",
            "one_blob", "two_blobs" or "two_different_blobs"; it is also used
            as a sub-directory name for the saved dataset and results.
        method: Detector name. Only "iMondrian_forest" can be (re)computed
            here; any name can be used to load previously saved results.
        generate_synthetic_datasets_again: If true, generate the inliers and
            save them; otherwise load the saved inliers.
        predict_anomaly_again: If true, fit the forest stage by stage and save
            the results; otherwise load the saved results.
        predict_using_threshold: If true, label points by thresholding the
            anomaly score at 0.5; otherwise use the forest's k-means based
            prediction.

    Raises:
        ValueError: If a dataset must be generated but ``dataset`` is not a
            known name, or if predictions must be computed but ``method`` is
            not "iMondrian_forest".
    """
    # Fail fast with a clear message. Without these checks an unsupported
    # argument only surfaced later as an unbound-local error, after part of
    # the work (and possibly a file write) had already been done.
    synthetic_datasets = ("two_moons", "one_blob", "two_blobs", "two_different_blobs")
    if generate_synthetic_datasets_again and dataset not in synthetic_datasets:
        raise ValueError("Unknown dataset {!r}; expected one of: {}.".format(dataset, ", ".join(synthetic_datasets)))
    if predict_anomaly_again and method != "iMondrian_forest":
        raise ValueError("Method {!r} cannot be computed by the online experiment; only 'iMondrian_forest' is supported.".format(method))

    # settings:
    n_samples = 300
    outliers_fraction = 0.15
    n_outliers = int(outliers_fraction * n_samples)
    n_online_stages = 5
    grid_limit = 7    # the grid spans [-grid_limit, grid_limit] on both axes
    grid_size = 150   # number of grid samples per axis
    grid_axis = np.linspace(-grid_limit, grid_limit, grid_size)
    xx, yy = np.meshgrid(grid_axis, grid_axis)
    rng = np.random.RandomState(42)
    # Index 0 is used for label -1 and index 1 for label +1, i.e. (label + 1) // 2.
    colors = ('#BBFF33', '#ff7f00')  # --> https://htmlcolorcodes.com/
    markers = ('^', 'o')

    def show_points_on_grid(points, labels, alpha):
        """Overlay the labelled points on the current grid image and show the figure."""
        class_index = (np.asarray(labels) + 1) // 2
        # Map data coordinates in [-grid_limit, grid_limit] to image pixel coordinates.
        pixels_per_unit = grid_size / (2 * grid_limit)
        cols = (points[:, 0] + grid_limit) * pixels_per_unit
        rows = (points[:, 1] + grid_limit) * pixels_per_unit
        # One scatter call per class instead of one call per point.
        for k in range(len(colors)):
            mask = class_index == k
            plt.scatter(cols[mask], rows[mask], marker=markers[k], c=colors[k], alpha=alpha, edgecolors="k")
        plt.xlim(0, grid_size)
        plt.ylim(0, grid_size)
        plt.xticks(())
        plt.yticks(())
        plt.show()

    # dataset:
    path_dataset = './datasets/' + dataset + "/"
    if generate_synthetic_datasets_again:
        if dataset == "two_moons":
            X = 4. * (make_moons(n_samples=n_samples - n_outliers, noise=.05, random_state=0)[0] - np.array([0.5, 0.25]))
        elif dataset == "one_blob":
            X = make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5)[0]
        elif dataset == "two_blobs":
            X = make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5])[0]
        else:  # "two_different_blobs" (validated above)
            X = make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3])[0]
        save_variable(variable=X, name_of_variable="X", path_to_save=path_dataset)
    else:
        X = load_variable(name_of_variable="X", path=path_dataset)
    # The number of inliers is whatever the generator / saved file provides.
    n_inliers = X.shape[0]

    # Add outliers (label +1 for the inliers, -1 for the appended outliers):
    X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], axis=0)
    y = np.concatenate([np.full(n_inliers, 1), np.full(n_outliers, -1)])

    # split dataset for online stages (the per-stage labels are not needed afterwards):
    X_stages, _ = stratified_split_of_data(X, y, n_online_stages=n_online_stages)

    # where the results are saved / loaded:
    if method == "iMondrian_forest":
        if predict_using_threshold:
            path_save = "./saved_files/online/" + method + "/threshold/" + dataset + "/"
        else:
            path_save = "./saved_files/online/" + method + "/Kmeans/" + dataset + "/"
    else:
        path_save = "./saved_files/online/" + method + "/" + dataset + "/"

    # anomaly detection:
    if predict_anomaly_again:
        settings, data, param, cache, train_ids_current_minibatch = MondrianForest.prepare_training_data(X=X_stages[0], num_trees=100)
        mf = MondrianForest(settings, data)
        mf.fit(data, train_ids_current_minibatch, settings, param, cache, subsampling_size=None)
        y_pred_of_stages = [None] * n_online_stages
        scores_of_stages = [None] * n_online_stages
        Z_of_stages = [None] * n_online_stages
        X_in_stages = [None] * n_online_stages
        X_all_in_this_stage = np.empty((0, X.shape[1]))
        grid_points = np.c_[xx.ravel(), yy.ravel()]  # identical for every stage
        for stage in range(n_online_stages):
            print("Online stage " + str(stage) + "...")
            if stage != 0:
                # NOTE(review): X_train is always the first stage, not everything seen so far;
                # confirm this is what prepare_new_training_data expects.
                data, train_ids_current_minibatch = MondrianForest.prepare_new_training_data(X_train=X_stages[0], X_new=X_stages[stage])
                mf.partial_fit(data=data, train_ids_current_minibatch=train_ids_current_minibatch, settings=settings, param=param, cache=cache)
            # np.vstack returns a new array, so the snapshot stored for each stage stays independent.
            X_all_in_this_stage = np.vstack((X_all_in_this_stage, X_stages[stage]))
            # labels of all the points seen so far:
            scores_training, _ = mf.get_anomaly_scores(test_data=X_all_in_this_stage, settings=settings, subsampling_size=None)
            if predict_using_threshold:
                y_pred_of_stages[stage] = mf.predict_using_threshold(anomaly_scores=scores_training, threshold=0.5)
            else:
                y_pred_of_stages[stage], which_cluster_is_anomaly, kmeans = mf.predict_using_kmeans(anomaly_scores=scores_training)
            # scores and labels over the whole grid:
            scores, _ = mf.get_anomaly_scores(test_data=grid_points, settings=settings, subsampling_size=None)
            if predict_using_threshold:
                Z = mf.predict_using_threshold(anomaly_scores=scores, threshold=0.5)
            else:
                Z = mf.predict_outOfSample_using_kmeans(anomaly_scores=scores, which_cluster_is_anomaly=which_cluster_is_anomaly, kmeans=kmeans)
            scores_of_stages[stage] = scores.reshape(xx.shape)
            Z_of_stages[stage] = Z.reshape(xx.shape)
            X_in_stages[stage] = X_all_in_this_stage
        save_variable(variable=y_pred_of_stages, name_of_variable="y_pred_of_stages", path_to_save=path_save)
        save_variable(variable=Z_of_stages, name_of_variable="Z_of_stages", path_to_save=path_save)
        save_variable(variable=scores_of_stages, name_of_variable="scores_of_stages", path_to_save=path_save)
        save_variable(variable=X_in_stages, name_of_variable="X_in_stages", path_to_save=path_save)
    else:
        y_pred_of_stages = load_variable(name_of_variable="y_pred_of_stages", path=path_save)
        Z_of_stages = load_variable(name_of_variable="Z_of_stages", path=path_save)
        scores_of_stages = load_variable(name_of_variable="scores_of_stages", path=path_save)
        X_in_stages = load_variable(name_of_variable="X_in_stages", path=path_save)

    # The colour range of the first stage is reused for every stage so the score maps are comparable.
    score_min = np.min(scores_of_stages[0])
    score_max = np.max(scores_of_stages[0])
    for stage in range(n_online_stages):
        y_pred = y_pred_of_stages[stage]
        X_stage = X_in_stages[stage]
        # ------ plot the predicted anomaly for the space:
        plt.imshow(Z_of_stages[stage] * -1, cmap='gray', alpha=0.2)
        show_points_on_grid(X_stage, y_pred, alpha=1)
        # ------ plot the anomaly score for the space:
        plt.imshow(scores_of_stages[stage], cmap='gray')
        plt.clim(score_min, score_max)
        plt.colorbar()
        show_points_on_grid(X_stage, y_pred, alpha=0.5)
# Example no. 2
# 0
def anomaly_visual_experiment_bach(dataset, method, generate_synthetic_datasets_again, predict_anomaly_again, predict_using_threshold):
    """Run the batch visual anomaly-detection experiment on a 2-D toy dataset.

    The inliers are either generated (and saved) or loaded from
    ``./datasets/<dataset>/``; 45 uniformly drawn outliers in [-6, 6]^2 are
    appended, the detector is fitted on all the data at once, and two figures
    are shown: the predicted label map and the anomaly-score map over a
    150 x 150 grid on [-7, 7]^2, each overlaid with the data points.

    Adapted from
    https://scikit-learn.org/stable/auto_examples/plot_anomaly_comparison.html#sphx-glr-auto-examples-plot-anomaly-comparison-py

    Args:
        dataset: Dataset name. When generating, one of "two_moons",
            "one_blob", "two_blobs" or "two_different_blobs"; it is also used
            as a sub-directory name for the saved dataset and results.
        method: Detector name. "iso_forest" and "iMondrian_forest" can be
            (re)computed here; any name can be used to load previously saved
            results.
        generate_synthetic_datasets_again: If true, generate the inliers and
            save them; otherwise load the saved inliers.
        predict_anomaly_again: If true, fit the detector and save the results;
            otherwise load the saved results.
        predict_using_threshold: Only used for "iMondrian_forest". If true,
            label points by thresholding the anomaly score at 0.5; otherwise
            use the forest's k-means based prediction.

    Raises:
        ValueError: If a dataset must be generated but ``dataset`` is not a
            known name, or if predictions must be computed but ``method`` is
            neither "iso_forest" nor "iMondrian_forest".
    """
    # Fail fast with a clear message. Without these checks an unsupported
    # argument only surfaced later as an unbound-local error, after part of
    # the work (and possibly a file write) had already been done.
    synthetic_datasets = ("two_moons", "one_blob", "two_blobs", "two_different_blobs")
    supported_methods = ("iso_forest", "iMondrian_forest")
    if generate_synthetic_datasets_again and dataset not in synthetic_datasets:
        raise ValueError("Unknown dataset {!r}; expected one of: {}.".format(dataset, ", ".join(synthetic_datasets)))
    if predict_anomaly_again and method not in supported_methods:
        raise ValueError("Method {!r} cannot be computed by the batch experiment; expected one of: {}.".format(method, ", ".join(supported_methods)))

    # settings:
    n_samples = 300
    outliers_fraction = 0.15
    n_outliers = int(outliers_fraction * n_samples)
    n_inliers = n_samples - n_outliers
    grid_limit = 7    # the grid spans [-grid_limit, grid_limit] on both axes
    grid_size = 150   # number of grid samples per axis
    grid_axis = np.linspace(-grid_limit, grid_limit, grid_size)
    xx, yy = np.meshgrid(grid_axis, grid_axis)
    rng = np.random.RandomState(42)
    # Index 0 is used for label -1 and index 1 for label +1, i.e. (label + 1) // 2.
    colors = ('#BBFF33', '#ff7f00')  # --> https://htmlcolorcodes.com/
    markers = ('^', 'o')

    def show_points_on_grid(points, labels, alpha):
        """Overlay the labelled points on the current grid image and show the figure."""
        class_index = (np.asarray(labels) + 1) // 2
        # Map data coordinates in [-grid_limit, grid_limit] to image pixel coordinates.
        pixels_per_unit = grid_size / (2 * grid_limit)
        cols = (points[:, 0] + grid_limit) * pixels_per_unit
        rows = (points[:, 1] + grid_limit) * pixels_per_unit
        # One scatter call per class instead of one call per point.
        for k in range(len(colors)):
            mask = class_index == k
            plt.scatter(cols[mask], rows[mask], marker=markers[k], c=colors[k], alpha=alpha, edgecolors="k")
        plt.xlim(0, grid_size)
        plt.ylim(0, grid_size)
        plt.xticks(())
        plt.yticks(())
        plt.show()

    # dataset:
    path_dataset = './datasets/' + dataset + "/"
    if generate_synthetic_datasets_again:
        if dataset == "two_moons":
            X = 4. * (make_moons(n_samples=n_inliers, noise=.05, random_state=0)[0] - np.array([0.5, 0.25]))
        elif dataset == "one_blob":
            X = make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5)[0]
        elif dataset == "two_blobs":
            X = make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5])[0]
        else:  # "two_different_blobs" (validated above)
            X = make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3])[0]
        save_variable(variable=X, name_of_variable="X", path_to_save=path_dataset)
    else:
        X = load_variable(name_of_variable="X", path=path_dataset)

    # Add outliers:
    X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], axis=0)

    # where the results are saved / loaded:
    if method == "iMondrian_forest":
        if predict_using_threshold:
            path_save = "./saved_files/batch/" + method + "/threshold/" + dataset + "/"
        else:
            path_save = "./saved_files/batch/" + method + "/Kmeans/" + dataset + "/"
    else:
        path_save = "./saved_files/batch/" + method + "/" + dataset + "/"

    # anomaly detection algorithm:
    if predict_anomaly_again:
        grid_points = np.c_[xx.ravel(), yy.ravel()]
        if method == "iso_forest":
            # NOTE(review): the `behaviour` argument was deprecated and later removed from
            # scikit-learn (0.22 / 0.24, to be confirmed); the score correction below relies
            # on the old behaviour, so confirm both against the installed version.
            clf = IsolationForest(contamination=outliers_fraction, random_state=42, behaviour='old')
            clf.fit(X)
            y_pred = clf.predict(X)
            print(y_pred)
            Z = clf.predict(grid_points)
            Z = Z.reshape(xx.shape)
            scores = clf.decision_function(X=grid_points)
            scores = scores.reshape(xx.shape)
            scores = scores - 0.5  # --> adding back the self.offset_ which is -0.5
            scores = scores * -1  # --> multiplying by -1 again
        else:  # "iMondrian_forest" (validated above)
            settings, data, param, cache, train_ids_current_minibatch = MondrianForest.prepare_training_data(X=X, num_trees=100)
            mf = MondrianForest(settings, data)
            mf.fit(data, train_ids_current_minibatch, settings, param, cache, subsampling_size=None)
            # labels of the training points:
            scores_training, _ = mf.get_anomaly_scores(test_data=X, settings=settings, subsampling_size=None)
            if predict_using_threshold:
                y_pred = mf.predict_using_threshold(anomaly_scores=scores_training, threshold=0.5)
            else:
                y_pred, which_cluster_is_anomaly, kmeans = mf.predict_using_kmeans(anomaly_scores=scores_training)
            # scores and labels over the whole grid:
            scores, _ = mf.get_anomaly_scores(test_data=grid_points, settings=settings, subsampling_size=None)
            if predict_using_threshold:
                Z = mf.predict_using_threshold(anomaly_scores=scores, threshold=0.5)
            else:
                Z = mf.predict_outOfSample_using_kmeans(anomaly_scores=scores, which_cluster_is_anomaly=which_cluster_is_anomaly, kmeans=kmeans)
            Z = Z.reshape(xx.shape)
            scores = scores.reshape(xx.shape)
        save_variable(variable=y_pred, name_of_variable="y_pred", path_to_save=path_save)
        save_variable(variable=Z, name_of_variable="Z", path_to_save=path_save)
        save_variable(variable=scores, name_of_variable="scores", path_to_save=path_save)
    else:
        y_pred = load_variable(name_of_variable="y_pred", path=path_save)
        Z = load_variable(name_of_variable="Z", path=path_save)
        scores = load_variable(name_of_variable="scores", path=path_save)

    # ------ plot the predicted anomaly for the space:
    plt.imshow(Z * -1, cmap='gray', alpha=0.2)
    show_points_on_grid(X, y_pred, alpha=1)
    # ------ plot the anomaly score for the space:
    plt.imshow(scores, cmap='gray')
    plt.colorbar()
    show_points_on_grid(X, y_pred, alpha=0.5)