def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    """Fit the chosen PyOD detector, print train/test evaluations, and append
    the test ROC-AUC and precision@n to the running lists.

    Parameters
    ----------
    method : str
        One of 'KNN', 'CBLOF', 'PCA'; anything else falls back to IForest.
    total_roc, total_prn : list
        Accumulators mutated in place with the rounded test-set scores.
    x_train, x_test, y_train, y_test : array-like
        Feature matrices and ground-truth labels for the two splits.
    """
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()

    # Train the detector on x_train (unsupervised: labels are not used).
    clf.fit(x_train)

    # Labels and raw scores on the training data
    # (0: inlier, 1: outlier; larger score = more anomalous).
    y_train_pred = clf.labels_
    y_train_scores = clf.decision_scores_
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # Score the held-out test data with the trained detector.
    y_test_pred = clf.predict(x_test)
    y_test_scores = clf.decision_function(x_test)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    # BUG FIX: the original line ended with a trailing comma, which made
    # `roc` a 1-tuple instead of a float, so total_roc collected tuples.
    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
    total_roc.append(roc)
    total_prn.append(prn)
def model_test(model_type, y_train, y_test, X_train, X_test, model_file, save_flag):
    """Train the requested PyOD detector, optionally pickle it, and print
    ranking metrics plus confusion matrices on the train and test splits.

    Parameters
    ----------
    model_type : str
        One of 'KNN', 'XGBOD', 'SOD', 'VAE'.
    y_train, y_test : array-like
        Ground-truth labels (0: inlier, 1: outlier).
    X_train, X_test : array-like
        Feature matrices.
    model_file : str
        Path used when pickling the trained model.
    save_flag : str
        '1' to persist the trained model to model_file.

    Returns
    -------
    str
        The model_file path (written only when save_flag == '1').

    Raises
    ------
    ValueError
        If model_type is not one of the supported detectors.
    """
    if model_type == 'KNN':
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)
    elif model_type == 'XGBOD':
        clf_name = 'XGBOD'
        # Set scale_pos_weight ~ sum(negative instances) / sum(positive instances).
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        # XGBOD is semi-supervised and therefore needs the labels to fit.
        clf.fit(X_train, y_train)
    elif model_type == 'SOD':
        # Note that SOD is meant to work in high dimensions d > 2.
        # But here we are using 2D for visualization purpose
        # thus, higher precision is expected in higher dimensions.
        clf_name = 'SOD'
        clf = SOD()
        clf.fit(X_train)
    elif model_type == 'VAE':
        # Train VAE detector (Beta-VAE).
        clf_name = 'VAE'
        contamination = 0.01
        clf = VAE(epochs=30, contamination=contamination, gamma=0.8,
                  capacity=0.2)
        clf.fit(X_train)
    else:
        # BUG FIX: an unrecognized model_type previously fell through all the
        # independent `if` blocks, leaving `clf` unbound and raising a
        # confusing NameError further down. Fail fast instead.
        raise ValueError('unsupported model_type: {}'.format(model_type))

    # Save model if specified.
    if save_flag == '1':
        with open(model_file, "wb") as fh:
            pickle.dump(clf, fh)

    # Prediction labels and outlier scores of the training data.
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Prediction on the test data.
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Evaluate and print the results.
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # todo: Input data has to be 2-d for visualization.
    # visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
    #           y_test_pred, show_figure=True, save_figure=False)
    return model_file
def main():
    """Load data via pre_data(), mean-impute missing values, then run three
    IForest fits and report accuracy/precision/recall plus PyOD rankings on
    the train and test splits."""
    dataset, label = pre_data()

    from numpy import nan as NA
    from sklearn.impute import SimpleImputer

    # Replace NaNs by per-feature means before fitting the detector.
    imputer = SimpleImputer(missing_values=NA, strategy="mean")
    dataset = imputer.fit_transform(dataset)

    x_train, x_test, y_train, y_label = train_test_split(
        dataset, label, test_size=0.3, random_state=44)

    # Repeat a few times to observe run-to-run variance of the detector.
    for i in range(3):
        clf_name = 'IForest'
        clf = IForest()
        clf.fit(x_train)

        # Prediction labels and raw scores on the training data.
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        from sklearn.metrics import accuracy_score
        from sklearn.metrics import precision_score
        from sklearn.metrics import recall_score

        print(accuracy_score(y_train, y_train_pred))
        print(precision_score(y_train, y_train_pred))
        print(recall_score(y_train, y_train_pred))

        # Prediction on the test data.
        y_test_pred = clf.predict(x_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(x_test)  # outlier scores

        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)

        print(accuracy_score(y_label, y_test_pred))
        # BUG FIX: the original re-printed the *training* precision/recall
        # here; the test-set metrics were clearly intended.
        print(precision_score(y_label, y_test_pred))
        print(recall_score(y_label, y_test_pred))

        print("\nOn Test Data:")
        evaluate_print(clf_name, y_label, y_test_scores)
def train(doc_list, dataset_name, clf_name):
    """Fit one detector per CSV in doc_list[:10] and report the average
    ROC-AUC and precision@n across those files.

    Parameters
    ----------
    doc_list : list of str
        Paths to benchmark CSV files; only the first 10 are used.
    dataset_name : str
        Name used in the summary print-out.
    clf_name : str
        One of "PCA", "MCD", "LOF", "KNN", "LODA".

    Returns
    -------
    tuple of (float, float)
        (mean ROC-AUC, mean precision@n) over the processed files.

    Raises
    ------
    ValueError
        If clf_name is not one of the supported detectors.
    """
    model_roc = []
    model_prc = []

    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    else:
        # BUG FIX: an unrecognized clf_name previously left `clf` unbound
        # and crashed with a NameError inside the loop. Fail fast instead.
        raise ValueError("unsupported clf_name: " + clf_name)

    for i in range(10):
        data = pd.read_csv(doc_list[i], header=0, index_col=0)
        # `drop`, `ground_truth` and `transfor` are module-level globals
        # defined elsewhere in this file.
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])

        clf.fit(train_x)
        predict = clf.decision_scores_

        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)

        # NOTE(review): with range(10) this condition can never fire
        # ((i + 1) <= 10 < 200). Either the loop bound or the modulus looks
        # wrong — confirm the intended reporting cadence.
        if ((i + 1) % 200 == 0):
            print("第" + str(i + 1) + "个文件结果:")
            evaluate_print(clf_name, train_y, predict)

        model_roc.append(roc)
        model_prc.append(prc)

    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("模型" + clf_name + "在数据集" + dataset_name + "的平均roc_auc为" +
          str(round(model_roc_avg, 4)) + ",平均prc为" +
          str(round(model_prc_avg, 4)) + "。")
    return model_roc_avg, model_prc_avg
def pyod_anomaly_detection(type, contamination):
    """Run one PyOD detector chosen by `type` ('MAD', 'ABOD', 'AutoEncoder',
    otherwise an LSCP ensemble), printing evaluations on train and test data
    and visualizing where the detector supports it.

    NOTE(review): this function was reconstructed from a corrupted source in
    which the AutoEncoder branch was split mid-comment and the LSCP section
    had lost its branch header; the `else` placement of the LSCP ensemble is
    an assumption — confirm against the original file.

    Parameters
    ----------
    type : str
        Detector selector (shadows the builtin `type`; kept for
        backward-compatibility with existing callers).
    contamination : float
        Expected proportion of outliers, forwarded to data() and detectors.
    """
    X_train, y_train, X_test, y_test = data(type=type,
                                            contamination=contamination)

    if type == 'MAD':
        # Train MAD detector (univariate, median-absolute-deviation based).
        clf_name = 'MAD'
        clf = MAD(threshold=3.5)
        clf.fit(X_train)

        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        # visualize() needs 2-D input: duplicate the single feature column.
        visualize(clf_name, np.hstack((X_train, X_train)), y_train,
                  np.hstack((X_test, X_test)), y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)

    elif type == 'ABOD':
        # Train ABOD (angle-based outlier detection).
        clf_name = 'ABOD'
        clf = ABOD()
        clf.fit(X_train)

        y_train_pred = clf.labels_
        y_train_scores = clf.decision_scores_

        y_test_pred = clf.predict(X_test)
        y_test_scores = clf.decision_function(X_test)

        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)

    elif type == 'AutoEncoder':
        # Train AutoEncoder detector.
        clf_name = 'AutoEncoder'
        clf = AutoEncoder(epochs=30, contamination=contamination)
        clf.fit(X_train)

        y_train_pred = clf.labels_
        y_train_scores = clf.decision_scores_

        y_test_pred = clf.predict(X_test)
        y_test_scores = clf.decision_function(X_test)

        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

    else:
        # Fallback: LSCP ensemble over three base detectors.
        detectors = [KNN(), LOF(), OCSVM()]
        clf_name = 'LSCP'
        clf = LSCP(base_estimators=detectors)
        clf.fit(X_train)

        y_train_pred = clf.labels_
        y_train_scores = clf.decision_scores_

        y_test_pred = clf.predict(X_test)
        y_test_scores = clf.decision_function(X_test)

        print("\nOn Training Data:")
        evaluate_print('Average', y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print('Average', y_test, y_test_scores)

        visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)
# Fit an ensemble of n_clf kNN detectors with varying neighborhood sizes
# (taken from k_list) and combine their normalized test scores four ways.
# Relies on module-level names defined earlier: n_clf, k_list, X_train_norm,
# X_test_norm, train_scores, test_scores, y_test.
print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

for i in range(n_clf):
    k = k_list[i]

    # 'largest' uses the distance to the k-th neighbor as the outlier score.
    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)

    # Column i holds the i-th detector's scores.
    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)

# Decision scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)

# Combination by average
y_by_average = average(test_scores_norm)
evaluate_print('Combination by Average', y_test, y_by_average)

# Combination by max
y_by_maximization = maximization(test_scores_norm)
evaluate_print('Combination by Maximization', y_test, y_by_maximization)

# Combination by aom (average-of-maximum over 5 detector buckets)
y_by_aom = aom(test_scores_norm, n_buckets=5)
evaluate_print('Combination by AOM', y_test, y_by_aom)

# Combination by moa (maximum-of-average over 5 detector buckets)
y_by_moa = moa(test_scores_norm, n_buckets=5)
evaluate_print('Combination by MOA', y_test, y_by_moa)
columns=["Dataset", "Dimensions", "PCA", "MCD", "LOF", "KNN", "LODA"]) result_roc result_prc #对全集csv文件进行训练并可视化结果 clf = PCA() clf_name = "PCA" read = r"D:\研一下学期\数据挖掘\作业4\pageb\meta_data\pageb.preproc.csv" data = pd.read_csv(read, header=0, index_col=0) train_x = data.drop(drop + ground_truth + ["original.label"], axis=1).values train_y = np.array( [transfor[x] for x in list(_flatten(data[ground_truth].values.tolist()))]) clf.fit(train_x) label = clf.labels_ predict = clf.decision_scores_ evaluate_print(clf_name, train_y, predict) pca = decomposition.PCA(n_components=2) X = pca.fit_transform(train_x) visualize(clf_name, X, train_y, X, train_y, label, train_y, show_figure=True, save_figure=True) clf = MCD() clf_name = "PCA" read = r"D:\研一下学期\数据挖掘\作业4\abalone\meta_data\abalone.preproc.csv"
X_test, n_estimators, # rp_flags[starts[i]:starts[i + 1]], jl_transformers, approx_flags[starts[i]:starts[i + 1]], verbose=True) for i in range(n_jobs)) print('Orig decision_function time:', time.time() - start) print() # unfold and generate the label matrix predicted_scores_orig = np.zeros([X_test.shape[0], n_estimators]) for i in range(n_jobs): predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray( all_results_scores[i]).T ########################################################################## predicted_scores = standardizer(predicted_scores) predicted_scores_orig = standardizer(predicted_scores_orig) evaluate_print('orig', y_test, average(predicted_scores_orig)) evaluate_print('new', y_test, average(predicted_scores)) evaluate_print('orig max', y_test, maximization(predicted_scores_orig)) evaluate_print('new max', y_test, maximization(predicted_scores)) evaluate_print('orig aom', y_test, aom(predicted_scores_orig)) evaluate_print('new aom', y_test, aom(predicted_scores)) evaluate_print('orig moa', y_test, moa(predicted_scores_orig)) evaluate_print('new moa', y_test, moa(predicted_scores))
n_train = 20000 # number of training points n_test = 2000 # number of testing points n_features = 300 # number of features # Generate sample data X_train, y_train, X_test, y_test = \ generate_data(n_train=n_train, n_test=n_test, n_features=n_features, contamination=contamination, random_state=42) # train AutoEncoder detector clf_name = 'AutoEncoder' clf = AutoEncoder(epochs=30, contamination=contamination) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores)
def fun(dir_path):
    """Run a battery of PyOD detectors over every benchmark CSV under
    ./<dir_path>/benchmarks and dump the per-file precision@n / ROC lists
    as JSON text files next to the working directory.

    A detector that raises on a given file records -1 for both metrics so
    the output rows stay aligned with the classifier order.
    """
    file_list = []
    total_roc = []
    total_prn = []
    count = 0

    # Collect every benchmark file path recursively.
    for home, dirs, files in os.walk("./" + dir_path + "/benchmarks"):
        for filename in files:
            fullname = os.path.join(home, filename)
            # BUG FIX: original source had a stray "cb" suffix on this line,
            # which was a hard syntax error.
            file_list.append(fullname)

    for file_csv in file_list:
        df = pd.read_csv(file_csv)
        # Drop identifier columns that carry no signal for detection.
        data = df.drop(columns=['point.id', 'motherset', 'origin'])

        class_mapping = {"anomaly": 1, "nominal": 0}
        data['ground.truth'] = data['ground.truth'].map(class_mapping)

        y = data['ground.truth']
        x = data.drop('ground.truth', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=28)

        random_state = np.random.RandomState(42)
        outliers_fraction = 0.05

        # Define seven outlier detection tools to be compared.
        classifiers = {
            'Angle-based Outlier Detector (ABOD)':
                ABOD(contamination=outliers_fraction),
            'Cluster-based Local Outlier Factor (CBLOF)':
                CBLOF(contamination=outliers_fraction,
                      check_estimator=False, random_state=random_state),
            'Feature Bagging':
                FeatureBagging(LOF(n_neighbors=35),
                               contamination=outliers_fraction,
                               check_estimator=False,
                               random_state=random_state),
            'Histogram-base Outlier Detection (HBOS)':
                HBOS(contamination=outliers_fraction),
            'Isolation Forest':
                IForest(contamination=outliers_fraction,
                        random_state=random_state),
            'K Nearest Neighbors (KNN)':
                KNN(contamination=outliers_fraction),
            'Average KNN':
                KNN(method='mean', contamination=outliers_fraction)
        }

        p_prn = []
        p_roc = []
        for clf_name, clf in classifiers.items():
            try:
                clf.fit(X_train)

                y_train_pred = clf.labels_  # 0: inliers, 1: outliers
                y_train_scores = clf.decision_scores_  # raw outlier scores

                y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
                y_test_scores = clf.decision_function(X_test)

                print(str(count) + "is analysing")
                print("\nOn Training Data:")
                evaluate_print(clf_name, y_train, y_train_scores)
                print("\nOn Test Data:")
                evaluate_print(clf_name, y_test, y_test_scores)

                # BUG FIX: a trailing comma previously made `roc` a 1-tuple,
                # worked around downstream with roc[0]; compute plain floats.
                roc = np.round(roc_auc_score(y_train, y_train_scores),
                               decimals=4)
                prn = np.round(precision_n_scores(y_test, y_test_scores),
                               decimals=4)
                p_prn.append(prn)
                p_roc.append(roc)
            except Exception:
                # Narrowed from a bare `except:`; a failing detector is
                # deliberately recorded as -1 rather than aborting the run.
                p_prn.append(-1)
                p_roc.append(-1)

        total_prn.append(p_prn)
        total_roc.append(p_roc)
        count += 1

    # BUG FIX: output files are now closed deterministically via `with`.
    with open(dir_path + "_prn_list.txt", "w", encoding='UTF-8') as fh:
        fh.write(json.dumps(total_prn))
    with open(dir_path + "_roc_list.txt", "w", encoding='UTF-8') as fh:
        fh.write(json.dumps(total_roc))
verbose=True) for i in range(n_jobs)) print('Orig decision_function time:', time.time() - start) print() # unfold and generate the label matrix predicted_scores_orig = np.zeros([X.shape[0], n_estimators]) for i in range(n_jobs): predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray( all_results_scores[i]).T ########################################################################## predicted_scores = standardizer(predicted_scores) predicted_scores_orig = standardizer(predicted_scores_orig) evaluate_print('orig', y_test, np.mean(predicted_scores_orig, axis=1)) evaluate_print('new', y_test, np.mean(predicted_scores, axis=1)) #%% ########################################################################## start = time.time() for i in range(n_estimators): print(i) trained_estimators[i].predict(X) print('Orig decision_function time:', time.time() - start) print() ########################################################################## start = time.time()
# Hand-rolled XGBOD-style stacking: augment the feature space with the
# decision scores of each unsupervised detector in estimator_list, then
# compare an XGBClassifier on the augmented features against one trained on
# the raw features. Relies on module-level names: estimator_list,
# normalization_list, X_train, X_test, X_train_norm, X_test_norm,
# y_train, y_test.
X_train_add = np.zeros([X_train.shape[0], len(estimator_list)])
X_test_add = np.zeros([X_test.shape[0], len(estimator_list)])

# fit the model
for index, estimator in enumerate(estimator_list):
    if normalization_list[index]:
        # This estimator expects standardized inputs.
        estimator.fit(X_train_norm)
        X_train_add[:, index] = estimator.decision_scores_
        X_test_add[:, index] = estimator.decision_function(X_test_norm)
    else:
        estimator.fit(X_train)
        X_train_add[:, index] = estimator.decision_scores_
        X_test_add[:, index] = estimator.decision_function(X_test)

# prepare the new feature space: original features + detector scores
X_train_new = np.concatenate((X_train, X_train_add), axis=1)
X_test_new = np.concatenate((X_test, X_test_add), axis=1)

clf = XGBClassifier()
clf.fit(X_train_new, y_train)
y_test_scores = clf.predict_proba(X_test_new)  # outlier scores
evaluate_print('XGBOD', y_test, y_test_scores[:, 1])

# Baseline: same classifier on the raw features only.
clf = XGBClassifier()
clf.fit(X_train, y_train)
y_test_scores_orig = clf.predict_proba(X_test)  # outlier scores
evaluate_print('old', y_test, y_test_scores_orig[:, 1])
"""Load pre-exported train/test arrays, score the test split with an
IForest, print its ranking metrics, and pickle the trained model."""
from pyod.models.iforest import IForest
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
import numpy as np
import pickle

# Load the previously exported splits from plain-text files.
X_train = np.loadtxt('X_train.txt', dtype=float)
y_train = np.loadtxt('y_train.txt', dtype=float)
X_test = np.loadtxt('X_test.txt', dtype=float)
y_test = np.loadtxt('y_test.txt', dtype=float)

clf = IForest()
clf.fit(X_train)

y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores
print(y_test_pred)

print("\nOn Test Data:")
# NOTE(review): y_test is truncated to the number of scored rows —
# presumably the exported files can disagree in length; confirm upstream.
evaluate_print('IForest', y_test[:len(y_test_scores)], y_test_scores)

# BUG FIX: pickle.dump(clf, open(...)) leaked the file handle; use a
# context manager so the file is flushed and closed deterministically.
with open("IForest.p", "wb") as fh:
    pickle.dump(clf, fh)
print('{data_file} does not exist. Use generated data'.format( data_file=mat_file)) X, y = generate_data(train_only=True) # load data except IOError: print('{data_file} does not exist. Use generated data'.format( data_file=mat_file)) X, y = generate_data(train_only=True) # load data else: X = mat['X'] y = mat['y'].ravel() for t in range(ite): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) # standardizing data for processing X_train_norm, X_test_norm = standardizer(X_train, X_test) # initialize 20 base detectors for combination clf = PCA() clf.fit(X_train_norm) train_scores = clf.decision_scores_ test_scores = clf.decision_function(X_test_norm) print() evaluate_print('PCA Train', y_train, train_scores) evaluate_print('PCA Test', y_test, test_scores)
def test_evaluate_print(self):
    """Smoke-test evaluate_print: any deterministic score vector will do,
    so reuse the generated training labels scaled down."""
    generated = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination)
    y_train = generated[1]
    evaluate_print('dummy', y_train, y_train * 0.1)
"""Load pre-exported train/test arrays, evaluate an IForest on the test
split, and pickle the trained model."""
from pyod.models.iforest import IForest
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
import numpy as np
import pickle

# Load the previously exported splits from plain-text files.
X_train = np.loadtxt('X_train.txt', dtype=float)
y_train = np.loadtxt('y_train.txt', dtype=float)
X_test = np.loadtxt('X_test.txt', dtype=float)
y_test = np.loadtxt('y_test.txt', dtype=float)

clf = IForest()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
# y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
# y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores
print(y_test_pred)

# evaluate and print the results
print("\nOn Test Data:")
evaluate_print('IForest', y_test, y_test_scores)

# BUG FIX: pickle.dump(clf, open(...)) leaked the file handle; use a
# context manager so the file is flushed and closed deterministically.
with open("IForest.p", "wb") as fh:
    pickle.dump(clf, fh)
train_test_split(X, y, test_size=0.4, random_state=42) contamination = y.sum() / len(y) base_estimators = get_estimators_small(contamination) model = SUOD(base_estimators=base_estimators, n_jobs=6, bps_flag=True, contamination=contamination, approx_flag_global=True) model.fit(X_train) # fit all models with X model.approximate(X_train) # conduct model approximation if it is enabled predicted_labels = model.predict(X_test) # predict labels predicted_scores = model.decision_function(X_test) # predict scores predicted_probs = model.predict_proba(X_test) # predict scores ########################################################################### # compared with other approaches evaluate_print('majority vote', y_test, majority_vote(predicted_labels)) evaluate_print('average', y_test, average(predicted_scores)) evaluate_print('maximization', y_test, maximization(predicted_scores)) clf = LOF() clf.fit(X_train) evaluate_print('LOF', y_test, clf.decision_function(X_test)) clf = IForest() clf.fit(X_train) evaluate_print('IForest', y_test, clf.decision_function(X_test))
# model prediction all_results_scores = Parallel( n_jobs=n_jobs, max_nbytes=None, verbose=True)(delayed(_parallel_decision_function)( n_estimators_list[i], trained_estimators[starts[i]:starts[i + 1]], None, X_test, n_estimators, jl_transformers, approx_flags[starts[i]:starts[i + 1]], verbose=True) for i in range(n_jobs)) print('Orig decision_function time:', time.time() - start) print() # unfold and generate the label matrix predicted_scores_orig = np.zeros([X_test.shape[0], n_estimators]) for i in range(n_jobs): predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray( all_results_scores[i]).T ########################################################################## predicted_scores = standardizer(predicted_scores) predicted_scores_orig = standardizer(predicted_scores_orig) evaluate_print('orig', y_test, average(predicted_scores_orig)) evaluate_print('new', y_test, average(predicted_scores)) evaluate_print('orig moa', y_test, moa(predicted_scores_orig)) evaluate_print('new moa', y_test, moa(predicted_scores))
# Build an ensemble of n_clf kNN detectors (one column of scores each) and
# combine the normalized test scores four different ways. Relies on
# module-level names: n_clf, k_list, X_train, X_test, X_train_norm,
# X_test_norm, y_test.
train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

for i in range(n_clf):
    k = k_list[i]

    # 'largest' scores by distance to the k-th nearest neighbor.
    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)

    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)

# decision scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)

# combination by average
y_by_average = average(test_scores_norm)
evaluate_print('Combination by Average', y_test, y_by_average)

# combination by max
y_by_maximization = maximization(test_scores_norm)
evaluate_print('Combination by Maximization', y_test, y_by_maximization)

# combination by aom (average-of-maximum over 5 detector buckets)
y_by_aom = aom(test_scores_norm, n_buckets=5)
evaluate_print('Combination by AOM', y_test, y_by_aom)

# combination by moa (maximum-of-average over 5 detector buckets)
y_by_moa = moa(test_scores_norm, n_buckets=5)
evaluate_print('Combination by MOA', y_test, y_by_moa)
"""Load pre-exported train/test arrays, evaluate an AutoEncoder detector on
the test split, and pickle the trained model."""
from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
import numpy as np
import pickle

if __name__ == '__main__':
    # Load the previously exported splits from plain-text files.
    X_train = np.loadtxt('X_train.txt', dtype=float)
    y_train = np.loadtxt('y_train.txt', dtype=float)
    X_test = np.loadtxt('X_test.txt', dtype=float)
    y_test = np.loadtxt('y_test.txt', dtype=float)

    clf = AutoEncoder(epochs=30, contamination=0.2)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    # y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    # y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores
    print(y_test_pred)

    # evaluate and print the results
    print("\nOn Test Data:")
    evaluate_print('AutoEncoder', y_test, y_test_scores)

    # BUG FIX: pickle.dump(clf, open(...)) leaked the file handle; use a
    # context manager so the file is flushed and closed deterministically.
    with open("autoencoder.p", "wb") as fh:
        pickle.dump(clf, fh)
# train IForest detector clf_name = 'IForest' clf = IForest() clf.fit(X_train) # get the prediction label and decision_scores_ on the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred, y_test_pred, show_figure=True, save_figure=False)
# SUOD ensemble demo: fit, approximate, persist with joblib, reload, and
# compare three combination rules on the reloaded model's predictions.
# Relies on module-level names: X, y, get_estimators_small, SUOD, dump, load.
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.4, random_state=42)

# Contamination estimated directly from the label prevalence.
contamination = y.sum() / len(y)

base_estimators = get_estimators_small(contamination)

model = SUOD(base_estimators=base_estimators, n_jobs=6, bps_flag=True,
             contamination=contamination, approx_flag_global=True)

model.fit(X_train)  # fit all models with X
model.approximate(X_train)  # conduct model approximation if it is enabled

# save the model
dump(model, 'model.joblib')
# load the model
model = load('model.joblib')

predicted_labels = model.predict(X_test)  # predict labels
predicted_scores = model.decision_function(X_test)  # predict scores
predicted_probs = model.predict_proba(X_test)  # predict scores

###########################################################################
# model evaluation with the loaded model
evaluate_print('majority vote', y_test, majority_vote(predicted_labels))
evaluate_print('average', y_test, average(predicted_scores))
evaluate_print('maximization', y_test, maximization(predicted_scores))