Example #1
0
# Methods for which a fit/score branch exists in _fit_and_score below.
_SUPPORTED_ANOMALY_METHODS = (
    "iso_forest",
    "one_class_SVM",
    "LOF",
    "covariance_estimator",
    "iMondrian_forest",
)


def _fit_and_score(anomaly_method, X_train, X_test, rng, n_neighbors):
    """Fit one detector on a fold and score its train and test samples.

    Returns a tuple ``(scores_train, scores_test, train_time, test_time)``
    where the two times are elapsed seconds. Raises ``ValueError`` when
    ``anomaly_method`` has no branch here.
    """
    if anomaly_method == "LOF":
        clf = LOF(n_neighbors=n_neighbors, contamination=0.1)
        start = time.perf_counter()
        clf.fit(X=X_train)
        scores_train = clf.negative_outlier_factor_
        train_time = time.perf_counter() - start
        # A second model with novelty=True is needed to score unseen samples.
        # NOTE(review): unlike the other methods, the test timing here
        # includes this second fit; kept as in the original protocol.
        clf = LOF(n_neighbors=n_neighbors, novelty=True, contamination=0.1)
        start = time.perf_counter()
        clf.fit(X=X_train)
        scores_test = clf.decision_function(X=X_test)
        test_time = time.perf_counter() - start
        return scores_train, scores_test, train_time, test_time

    if anomaly_method == "iMondrian_forest":
        # Data preparation and model construction are not part of the timing.
        settings, data, param, cache, train_ids_current_minibatch = MondrianForest.prepare_training_data(X=X_train, num_trees=100)
        clf = MondrianForest(settings, data)
        subsampling_size = 256
        start = time.perf_counter()
        clf.fit(data, train_ids_current_minibatch, settings, param, cache, subsampling_size=subsampling_size)
        _, scores_train = clf.get_anomaly_scores(test_data=X_train, settings=settings, subsampling_size=None)
        train_time = time.perf_counter() - start
        start = time.perf_counter()
        _, scores_test = clf.get_anomaly_scores(test_data=X_test, settings=settings, subsampling_size=None)
        test_time = time.perf_counter() - start
        return scores_train, scores_test, train_time, test_time

    # The remaining estimators share the same fit / decision_function flow.
    if anomaly_method == "iso_forest":
        clf = IsolationForest(random_state=rng)
    elif anomaly_method == "one_class_SVM":
        clf = OneClassSVM(gamma='auto')
    elif anomaly_method == "covariance_estimator":
        clf = EllipticEnvelope(random_state=rng)
    else:
        raise ValueError("unsupported anomaly_method: {!r}".format(anomaly_method))

    start = time.perf_counter()
    clf.fit(X=X_train)
    scores_train = clf.decision_function(X=X_train)
    train_time = time.perf_counter() - start
    start = time.perf_counter()
    scores_test = clf.decision_function(X=X_test)
    test_time = time.perf_counter() - start
    return scores_train, scores_test, train_time, test_time


def anomaly_detection_AUC_experiment_batch(anomaly_method, dataset, X_train_in_folds, X_test_in_folds, y_train_in_folds, y_test_in_folds):
    """Run a per-fold AUC / timing experiment for one anomaly detector.

    For every fold the selected detector is fitted on the training split,
    both splits are scored, and the ROC AUC (with ``pos_label=1``) is
    computed for each. Per-fold AUCs and timings, their mean and standard
    deviation, and the fraction of ``-1`` labels in the first fold are
    printed and handed to ``save_np_array_to_txt``.

    Args:
        anomaly_method: One of "iso_forest", "one_class_SVM", "LOF",
            "covariance_estimator" or "iMondrian_forest".
        dataset: Dataset name; used as a component of the output path.
        X_train_in_folds: Sequence with the training samples of each fold.
        X_test_in_folds: Sequence with the test samples of each fold.
        y_train_in_folds: Sequence with the training labels of each fold.
        y_test_in_folds: Sequence with the test labels of each fold.

    Raises:
        ValueError: If ``anomaly_method`` is not supported or no folds are
            given. (Previously these cases surfaced as ``NameError``.)
    """
    if anomaly_method not in _SUPPORTED_ANOMALY_METHODS:
        raise ValueError(
            "unsupported anomaly_method: {!r}; expected one of {}".format(
                anomaly_method, ", ".join(_SUPPORTED_ANOMALY_METHODS)))
    n_folds = len(X_train_in_folds)
    if n_folds == 0:
        raise ValueError("at least one fold is required")

    # One generator shared by all folds, as in the original experiment.
    rng = np.random.RandomState(42)
    # Bound before the loop because the output path below also reads it.
    n_neighbors = 10

    auc_test_array = np.zeros((n_folds,))
    auc_train_array = np.zeros((n_folds,))
    time_of_algorithm_test = np.zeros((n_folds,))
    time_of_algorithm_train = np.zeros((n_folds,))

    # The anomaly ratio is computed from the first fold's train + test labels.
    y = np.asarray(list(y_train_in_folds[0]) + list(y_test_in_folds[0]))
    percentage_of_anomalies = np.count_nonzero(y == -1) / len(y)
    print("percentage of the anomalies = " + str(percentage_of_anomalies))

    for fold_index in range(n_folds):
        X_train = X_train_in_folds[fold_index]
        X_test = X_test_in_folds[fold_index]
        y_train = y_train_in_folds[fold_index]
        y_test = y_test_in_folds[fold_index]

        scores_train, scores_test, train_time, test_time = _fit_and_score(
            anomaly_method, X_train, X_test, rng, n_neighbors)
        time_of_algorithm_train[fold_index] = train_time
        time_of_algorithm_test[fold_index] = test_time

        fpr_test, tpr_test, _ = metrics.roc_curve(y_test, scores_test, pos_label=1)
        fpr_train, tpr_train, _ = metrics.roc_curve(y_train, scores_train, pos_label=1)

        auc_test = metrics.auc(fpr_test, tpr_test)
        print("Fold: " + str(fold_index) + " ---> AUC for test: " + str(auc_test))
        auc_test_array[fold_index] = auc_test
        auc_train = metrics.auc(fpr_train, tpr_train)
        print("Fold: " + str(fold_index) + " ---> AUC for train: " + str(auc_train))
        auc_train_array[fold_index] = auc_train

    auc_test_mean = auc_test_array.mean()
    auc_test_std = auc_test_array.std()
    auc_train_mean = auc_train_array.mean()
    auc_train_std = auc_train_array.std()
    time_of_algorithm_train_mean = time_of_algorithm_train.mean()
    time_of_algorithm_train_std = time_of_algorithm_train.std()
    time_of_algorithm_test_mean = time_of_algorithm_test.mean()
    time_of_algorithm_test_std = time_of_algorithm_test.std()
    print("Average AUC for test data: " + str(auc_test_mean) + " +- " + str(auc_test_std))
    print("Average time for test data: " + str(time_of_algorithm_test_mean) + " +- " + str(time_of_algorithm_test_std))
    print("Average AUC for train data: " + str(auc_train_mean) + " +- " + str(auc_train_std))
    print("Average time for train data: " + str(time_of_algorithm_train_mean) + " +- " + str(time_of_algorithm_train_std))

    # Neighbour-based methods get an extra path level for the neighbour count.
    # "CAD" has no scoring branch in this function; the check is retained so
    # the directory naming stays unchanged if such a branch is added.
    if anomaly_method == "LOF" or anomaly_method == "CAD":
        path = './output/batch/' + dataset + "/" + anomaly_method + "/neigh=" + str(n_neighbors) + "/"
    else:
        path = './output/batch/' + dataset + "/" + anomaly_method + "/"

    results_to_save = (
        ("auc_test_array", auc_test_array),
        ("auc_test_mean", auc_test_mean),
        ("auc_test_std", auc_test_std),
        ("auc_train_array", auc_train_array),
        ("auc_train_mean", auc_train_mean),
        ("auc_train_std", auc_train_std),
        ("time_of_algorithm_test", time_of_algorithm_test),
        ("time_of_algorithm_test_mean", time_of_algorithm_test_mean),
        ("time_of_algorithm_test_std", time_of_algorithm_test_std),
        ("time_of_algorithm_train", time_of_algorithm_train),
        ("time_of_algorithm_train_mean", time_of_algorithm_train_mean),
        ("time_of_algorithm_train_std", time_of_algorithm_train_std),
        ("percentage_of_anomalies", percentage_of_anomalies),
    )
    for name_of_variable, variable in results_to_save:
        save_np_array_to_txt(variable=variable, name_of_variable=name_of_variable, path_to_save=path)