def knn(X_train, y_train=None, X_test=None, y_test=None):
    """Fit a kNN outlier detector on ``X_train`` and plot its predictions.

    Args:
        X_train: Training samples handed to ``KNN.fit``.
        y_train: Not used by this function; kept so existing callers that
            pass labels keep working.
        X_test: Test samples labelled with the fitted detector. Required in
            practice even though it defaults to ``None``, because the test
            predictions are always computed and forwarded to ``visualize``.
        y_test: Not used by this function; kept for call-site compatibility.

    Returns:
        Tuple ``(y_train_pred, y_train_scores)``: the detector's ``labels_``
        (0: inliers, 1: outliers) and its raw ``decision_scores_`` for the
        training data.

    Raises:
        ValueError: If ``X_test`` is ``None``.
    """
    if X_test is None:
        # Fail before the fit rather than deep inside predict().
        raise ValueError(
            "knn() requires X_test: test predictions are always computed "
            "and visualized"
        )

    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)

    # NOTE(review): this call supplies five positional arguments and no
    # ground-truth labels; presumably `visualize` is a project-local helper
    # with that signature -- confirm.
    visualize(clf_name, X_train, X_test, y_train_pred, y_test_pred,
              show_figure=True, save_figure=False)
    return y_train_pred, y_train_scores
# Fit the detector on the training split.
clf.fit(X_train)

# Outputs the fitted detector exposes for the training data.
y_train_scores = clf.decision_scores_  # raw outlier scores
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)

# Score and label the held-out split.
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# Report evaluation results for both splits.
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# Show the feature importances exposed by the fitted detector.
feature_importance = clf.feature_importances_
print("Feature importance", feature_importance)

# Plot ground truth against predictions for both splits.
visualize(
    clf_name,
    X_train,
    y_train,
    X_test,
    y_test,
    y_train_pred,
    y_test_pred,
    show_figure=True,
    save_figure=False,
)
# Transform the training data with `kpca`, then standardize the result
# before fitting a 5-neighbour KNN detector on it.
x_train_pca = kpca.fit_transform(x_train)
clf = KNN(n_neighbors=5, contamination=contam)
x_train_pca = standardizer(x_train_pca)
clf.fit(x_train_pca)

y_pred_pca = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_scores = clf.decision_scores_  # raw outlier scores

# Evaluate this pipeline on its own, then record it and evaluate all
# collected score sets together.
evaluation(y, y_scores, 'PCA+KNN')
all_scores['PCA+KNN'] = y_scores
evaluation(y, all_scores, algorithms)

# The same transformed data is supplied for both the train and test slots.
visualize(
    clf_name,
    x_train_pca, y,
    x_train_pca, y,
    y_pred_pca, y_pred_pca,
    show_figure=True,
    save_figure=False,
)

# Collect the first two components of every 10th sample, split by whether
# its label equals 1; each container is [first components, second components].
visual_ano = [[], []]
visual_nor = [[], []]
for i in range(0, len(x_train_pca), 10):
    bucket = visual_ano if y[i] == 1 else visual_nor
    bucket[0].append(x_train_pca[i][0])
    bucket[1].append(x_train_pca[i][1])

# Samples labelled 1 are drawn in red.
plt.scatter(visual_ano[0], visual_ano[1], c='r')
def _fit_and_evaluate(clf, clf_name, X_train, y_train, X_test, y_test):
    """Fit ``clf``, print train/test evaluation, and return both label arrays."""
    clf.fit(X_train)

    # prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    return y_train_pred, y_test_pred


def pyod_anomaly_detection(type, contamination):
    """Run one detector on the data returned by ``data`` and report results.

    Args:
        type: Detector selector; ``'MAD'``, ``'ABOD'`` and ``'AutoEncoder'``
            are handled. Any other value loads the data and returns without
            fitting anything. (The name shadows the ``type`` builtin but is
            part of the call interface, so it is kept.)
        contamination: Forwarded to ``data`` and, for ``'AutoEncoder'``, to
            the detector constructor.

    Returns:
        None. Evaluation results are printed; ``'MAD'`` and ``'ABOD'`` also
        call ``visualize``.
    """
    X_train, y_train, X_test, y_test = data(type=type, contamination=contamination)

    # Build only the requested detector.
    if type == 'MAD':
        clf_name = 'MAD'
        clf = MAD(threshold=3.5)
    elif type == 'ABOD':
        clf_name = 'ABOD'
        clf = ABOD()
    elif type == 'AutoEncoder':
        clf_name = 'AutoEncoder'
        clf = AutoEncoder(epochs=30, contamination=contamination)
    else:
        return

    y_train_pred, y_test_pred = _fit_and_evaluate(
        clf, clf_name, X_train, y_train, X_test, y_test)

    if type == 'MAD':
        # Making dimensions = 2 for visualising purposes only, by repeating
        # the same data in each dimension.
        visualize(clf_name, np.hstack((X_train, X_train)), y_train,
                  np.hstack((X_test, X_test)), y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)
    elif type == 'ABOD':
        visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)
    # 'AutoEncoder': the visible code only prints the evaluation, no plot.
n_iter=2000).fit_transform(x_train)
# NOTE(review): the line above is the tail of a call whose opening lies
# outside this view; it is left untouched.

# Embed the test data in two dimensions with t-SNE. fit_transform is called
# on x_test itself, separately from the call on x_train above.
X_test_reduced_tsne = TSNE(n_components=2, random_state=2020, init='pca', n_iter=2000).fit_transform(x_test)


# In[14]:


import warnings
warnings.filterwarnings("ignore")  # suppress all warnings from here on
from pyod.utils.example import visualize
# Plot the t-SNE embeddings with ground-truth and predicted labels.
visualize('KNN', X_train_reduced_tsne, y_train, X_test_reduced_tsne, y_test, y_train_pred,
          y_test_pred, show_figure=True, save_figure=False)


# ### Evaluate accuracy with ROC and PRN


# In[21]:


from sklearn.metrics import roc_auc_score
from pyod.utils.utility import precision_n_scores


# In[22]:


# NOTE(review): the ROC AUC is computed from y_train_pred; if that holds
# binary labels rather than raw outlier scores, the AUC is coarser than it
# could be -- confirm which array was intended.
train_roc = np.round(roc_auc_score(y_train, y_train_pred), decimals=4)
# Generate a single-feature data set with a fixed random_state.
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=1,
    contamination=contamination,
    random_state=42,
)

# Fit the MAD detector.
clf_name = 'MAD'
clf = MAD(threshold=3.5)
clf.fit(X_train)

# Outputs the fitted detector exposes for the training data.
y_train_scores = clf.decision_scores_  # raw outlier scores
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)

# Label and score the test data.
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# Report evaluation results for both splits.
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# The data has one feature, so each matrix is stacked with itself to give
# the plot two dimensions; this is for visualisation only.
visualize(
    clf_name,
    np.hstack((X_train, X_train)),
    y_train,
    np.hstack((X_test, X_test)),
    y_test,
    y_train_pred,
    y_test_pred,
    show_figure=True,
    save_figure=False,
)
# ---- pageb data set: evaluated with the `clf` / `clf_name` set up earlier ----
read = r"D:\研一下学期\数据挖掘\作业4\pageb\meta_data\pageb.preproc.csv"
data = pd.read_csv(read, header=0, index_col=0)

# Features are every column except the dropped ones, the ground-truth
# column(s) and "original.label".
excluded_columns = drop + ground_truth + ["original.label"]
train_x = data.drop(excluded_columns, axis=1).values

# Flatten the ground-truth cells and map each one through `transfor`.
flat_truth = _flatten(data[ground_truth].values.tolist())
train_y = np.array([transfor[x] for x in flat_truth])

clf.fit(train_x)
label = clf.labels_
predict = clf.decision_scores_  # raw outlier scores, despite the name
evaluate_print(clf_name, train_y, predict)

# Reduce the features to two principal components for plotting.
pca = decomposition.PCA(n_components=2)
X = pca.fit_transform(train_x)
# NOTE(review): ground-truth `train_y` is passed in the final positional
# slot, where a prediction array might be expected -- confirm.
visualize(clf_name, X, train_y, X, train_y, label, train_y,
          show_figure=True, save_figure=True)

# ---- abalone data set ----
# NOTE(review): the detector is MCD but it is labelled "PCA" -- confirm
# which of the two was intended.
clf = MCD()
clf_name = "PCA"
read = r"D:\研一下学期\数据挖掘\作业4\abalone\meta_data\abalone.preproc.csv"
data = pd.read_csv(read, header=0, index_col=0)

excluded_columns = drop + ground_truth + ["original.label"]
train_x = data.drop(excluded_columns, axis=1).values

flat_truth = _flatten(data[ground_truth].values.tolist())
train_y = np.array([transfor[x] for x in flat_truth])

clf.fit(train_x)
label = clf.labels_
predict = clf.decision_scores_