Beispiel #1
0
def knn(X_train, y_train=None, X_test=None, y_test=None):
    """Fit a kNN outlier detector on ``X_train`` and plot its predictions.

    Args:
        X_train: Training samples the detector is fitted on.
        y_train: Accepted for interface compatibility; not used here.
        X_test: Samples to label with the fitted detector. It is passed
            straight to ``predict`` / ``decision_function``.
            NOTE(review): the default of ``None`` is forwarded unchanged;
            confirm callers always supply a test set.
        y_test: Accepted for interface compatibility; not used here.

    Returns:
        A tuple ``(labels, scores)`` for the training data: the detector's
        ``labels_`` (binary, 0: inliers, 1: outliers) and its
        ``decision_scores_`` (raw outlier scores).
    """
    detector_name = 'KNN'
    detector = KNN()
    detector.fit(X_train)

    # Training-set results are read from the fitted detector's attributes.
    train_labels = detector.labels_
    train_scores = detector.decision_scores_

    # Test-set results are computed on demand.
    test_labels = detector.predict(X_test)
    # The test scores are computed but not consumed by anything below.
    test_scores = detector.decision_function(X_test)

    # Plot train and test samples together with their predicted labels.
    visualize(
        detector_name,
        X_train,
        X_test,
        train_labels,
        test_labels,
        show_figure=True,
        save_figure=False,
    )

    return train_labels, train_scores
Beispiel #2
0
    # Fit the detector on the training features only (no labels are passed).
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results, using the raw scores (not the binary
    # labels) against the ground truth of each split
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # example of the feature importance
    # NOTE(review): clf is constructed above this excerpt; confirm that its
    # class exposes feature_importances_ once fitted.
    feature_importance = clf.feature_importances_
    print("Feature importance", feature_importance)

    # visualize the results: ground truth and predicted labels for both the
    # training and the test split; the figure is shown but not saved
    visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
              y_test_pred,
              show_figure=True,
              save_figure=False)
Beispiel #3
0
    # Transform the training data with the kpca object defined above this
    # excerpt, then fit a 5-neighbour KNN detector on the rescaled result.
    x_train_pca = kpca.fit_transform(x_train)
    clf = KNN(n_neighbors=5, contamination=contam)
    # Rescale the transformed features with standardizer() before fitting.
    x_train_pca = standardizer(x_train_pca)
    clf.fit(x_train_pca)
    y_pred_pca = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_scores_  # raw outlier scores
    evaluation(y, y_scores, 'PCA+KNN')
    # Store the scores under the same key used for the evaluation above.
    all_scores['PCA+KNN'] = y_scores

    # NOTE(review): evaluation() is called here with the whole all_scores
    # container and `algorithms`, but with a single score array and a name
    # above; confirm it supports both call shapes.
    evaluation(y, all_scores, algorithms)

    # There is no separate test split in this excerpt: the same features,
    # ground truth and predictions are passed for both the train and the
    # test arguments.
    visualize(clf_name,
              x_train_pca,
              y,
              x_train_pca,
              y,
              y_pred_pca,
              y_pred_pca,
              show_figure=True,
              save_figure=False)

    # Collect the first two components of every 10th sample as [xs, ys]
    # pairs, split by ground-truth label: label 1 goes to visual_ano
    # (presumably anomalies), everything else to visual_nor.
    visual_ano = [[], []]
    visual_nor = [[], []]
    for i in range(0, len(x_train_pca), 10):
        if y[i] == 1:
            visual_ano[0].append(x_train_pca[i][0])
            visual_ano[1].append(x_train_pca[i][1])
        else:
            visual_nor[0].append(x_train_pca[i][0])
            visual_nor[1].append(x_train_pca[i][1])
    # Plot the label-1 samples in red.
    plt.scatter(visual_ano[0], visual_ano[1], c='r')
Beispiel #4
0
def pyod_anomaly_detection(type, contamination):
    """Train, evaluate and (where supported) plot one PyOD detector.

    Args:
        type: Which detector to run: ``'MAD'``, ``'ABOD'`` or
            ``'AutoEncoder'``. The value is also forwarded to ``data()``.
            For any other value the data is still loaded but nothing is
            trained. The name shadows the builtin; it is kept so callers
            passing it by keyword keep working.
        contamination: Forwarded to ``data()`` and to the ``AutoEncoder``
            constructor.

    Returns:
        None. Metrics are printed through ``evaluate_print`` and figures
        are shown through ``visualize``.
    """
    X_train, y_train, X_test, y_test = data(type=type,
                                            contamination=contamination)
    splits = (X_train, y_train, X_test, y_test)

    if type == 'MAD':
        y_train_pred, y_test_pred = _fit_and_evaluate(
            'MAD', MAD(threshold=3.5), *splits)
        # Making dimensions = 2 for visualising purposes only, by repeating
        # the same data in each dimension.
        visualize('MAD',
                  np.hstack((X_train, X_train)),
                  y_train,
                  np.hstack((X_test, X_test)),
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'ABOD':
        y_train_pred, y_test_pred = _fit_and_evaluate(
            'ABOD', ABOD(), *splits)
        visualize('ABOD',
                  X_train,
                  y_train,
                  X_test,
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'AutoEncoder':
        # This detector is only evaluated, not plotted.
        _fit_and_evaluate(
            'AutoEncoder',
            AutoEncoder(epochs=30, contamination=contamination),
            *splits)


def _fit_and_evaluate(clf_name, clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on ``X_train``, print train/test metrics, return labels.

    Args:
        clf_name: Label passed to ``evaluate_print``.
        clf: Unfitted detector providing ``fit``, ``predict``,
            ``decision_function``, ``labels_`` and ``decision_scores_``.
        X_train: Training features the detector is fitted on.
        y_train: Ground truth for the training features.
        X_test: Test features to score with the fitted detector.
        y_test: Ground truth for the test features.

    Returns:
        A tuple ``(y_train_pred, y_test_pred)`` of binary labels
        (0: inliers, 1: outliers) for the training and test data.
    """
    clf.fit(X_train)

    # Training-set labels and raw outlier scores come from fitted attributes.
    y_train_pred = clf.labels_
    y_train_scores = clf.decision_scores_

    # Test-set labels and raw outlier scores are computed explicitly.
    y_test_pred = clf.predict(X_test)
    y_test_scores = clf.decision_function(X_test)

    # Evaluation uses the raw scores, not the binary labels.
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    return y_train_pred, y_test_pred
Beispiel #5
0
                            n_iter=2000).fit_transform(x_train)
# Embed the test features into 2-D with t-SNE. fit_transform() is called on
# x_test directly, so this embedding is fitted on the test data alone.
X_test_reduced_tsne = TSNE(n_components=2,
                           random_state=2020,
                           init='pca',
                           n_iter=2000).fit_transform(x_test)

# In[14]:

import warnings
# Suppress all warnings from here on.
warnings.filterwarnings("ignore")
from pyod.utils.example import visualize
# Plot ground truth and predicted labels on the 2-D t-SNE embeddings; the
# figure is shown but not saved.
visualize('KNN',
          X_train_reduced_tsne,
          y_train,
          X_test_reduced_tsne,
          y_test,
          y_train_pred,
          y_test_pred,
          show_figure=True,
          save_figure=False)

# ### Evaluate accuracy with ROC AUC and precision @ rank n

# In[21]:

from sklearn.metrics import roc_auc_score
from pyod.utils.utility import precision_n_scores

# In[22]:

# ROC AUC on the training data, rounded to 4 decimals.
# NOTE(review): y_train_pred is passed as the score argument. If it holds
# binary predicted labels rather than continuous outlier scores, the AUC is
# computed from hard predictions - confirm this is intended.
train_roc = np.round(roc_auc_score(y_train, y_train_pred), decimals=4)
Beispiel #6
0
    # Generate train/test splits with a single feature (n_features=1) and a
    # fixed random_state so repeated runs use the same seed.
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=1,
                      contamination=contamination,
                      random_state=42)

    # train MAD detector
    clf_name = 'MAD'
    clf = MAD(threshold=3.5)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results, using the raw scores against the
    # ground truth of each split
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    # The data has a single feature, so the column is duplicated with
    # np.hstack to give visualize() two dimensions (for plotting only).
    visualize(clf_name, np.hstack((X_train, X_train)), y_train, np.hstack((X_test, X_test)), y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)
Beispiel #7
0
# Load the pre-processed "pageb" data set; the first CSV column is the index.
read = r"D:\研一下学期\数据挖掘\作业4\pageb\meta_data\pageb.preproc.csv"
data = pd.read_csv(read, header=0, index_col=0)
# Features: every column except the dropped, ground-truth and
# "original.label" columns.
train_x = data.drop(drop + ground_truth + ["original.label"], axis=1).values
# Ground truth: flatten the ground-truth column(s) and map each raw value
# through the `transfor` lookup.
train_y = np.array(
    [transfor[x] for x in list(_flatten(data[ground_truth].values.tolist()))])
# Fit the detector configured above this excerpt and evaluate its raw
# outlier scores against the ground truth.
clf.fit(train_x)
label = clf.labels_
predict = clf.decision_scores_
evaluate_print(clf_name, train_y, predict)
# Project the features onto two principal components for plotting only.
pca = decomposition.PCA(n_components=2)
X = pca.fit_transform(train_x)
# There is no separate test split, so the training data fills both the
# train and the test arguments. Both prediction arguments must carry the
# detector's labels; passing the ground truth as the test prediction would
# present ground truth as if it were a prediction.
visualize(clf_name,
          X,
          train_y,
          X,
          train_y,
          label,
          label,
          show_figure=True,
          save_figure=True)

# Switch to the MCD detector. The name must match the detector, because
# clf_name is what labels this detector's reported results.
clf = MCD()
clf_name = "MCD"
# Load the pre-processed "abalone" data set; the first CSV column is the
# index.
read = r"D:\研一下学期\数据挖掘\作业4\abalone\meta_data\abalone.preproc.csv"
data = pd.read_csv(read, header=0, index_col=0)
# Features: every column except the dropped, ground-truth and
# "original.label" columns.
train_x = data.drop(drop + ground_truth + ["original.label"], axis=1).values
# Ground truth: flatten the ground-truth column(s) and map each raw value
# through the `transfor` lookup.
train_y = np.array(
    [transfor[x] for x in list(_flatten(data[ground_truth].values.tolist()))])
clf.fit(train_x)
label = clf.labels_  # binary labels produced by the fitted detector
predict = clf.decision_scores_  # raw outlier scores of the training data