def detectWithAutoencoder(self):
    '''
    Apply the Autoencoder Detection Method.

    Fits a PyOD AutoEncoder on the observations in ``self.y`` and returns
    the (longitude, latitude) pairs of the anomalous observations, ordered
    by decreasing anomaly score (most anomalous first).
    '''
    # Find Model Hyperparameters
    hpMap = self.config['AnomalyDetector']['AutoencoderHyperparameters']

    # Create and Fit the Autoencoder Model
    AE = AutoEncoder(hidden_neurons=[1 for _ in range(hpMap['depth'])])
    AE.fit(self.y.reshape(-1, 1))

    # Get & Plot Anomaly Scores for the Observations
    anomalyScores = AE.decision_scores_
    self.plotAnomalyScores(anomalyScores)

    # Collect the *indices* of the anomalous observations.
    # (Bug fixes vs. the previous version: it collected observation values
    # self.y[i] and used them as row indices into self.M, which is wrong
    # whenever y-values are not valid indices; the lon/lat dict comprehension
    # referenced a stale loop variable `i`; keying the dict on the raw score
    # silently dropped duplicate scores; and sorting ascending contradicted
    # the stated "decreasing anomaly score" intent.)
    anomalyIdxList = [i for i in range(self.y.shape[0])
                      if anomalyScores[i] >= hpMap['anomalyScoreCutoff']]

    # Report the Lon/Lat Points Corresponding to the Anomalies in the Order
    # of Decreasing Anomaly Score (i.e., Most Anomalous Points First).
    anomalyIdxList.sort(key=lambda idx: anomalyScores[idx], reverse=True)
    return [(self.M['longitude'][idx], self.M['latitude'][idx])
            for idx in anomalyIdxList]
def training(data, img_shape, re_sample_type, text_len, permission_names, extract_f):
    """Train one AutoEncoder outlier detector and one KNN model per permission.

    Returns (detectors, knn_trees, features_in_permissions), where the i-th
    entry of each list corresponds to permission_names[i].
    """
    # load training data
    print('preparing training data')
    inputs, permissions = prepare_training_data(
        data, img_shape, re_sample_type, text_len, permission_names)

    # get features
    print('generating training features')
    features = extract_f.predict(inputs)

    # train auto encoder model, knn model
    print('training outlier model + knn model')
    detectors = []
    knn_trees = []
    features_in_permissions = []  # features in each permission, [permission_id, feature_id]

    for perm in permission_names:
        print('training', perm, '...')
        # Gather the features of every sample that requests this permission.
        perm_features = [feat for feat, sample_perms in zip(features, permissions)
                         if perm in sample_perms]
        features_in_permissions.append(perm_features)

        outlier_model = AutoEncoder(epochs=200, verbose=0)
        outlier_model.fit(perm_features)
        detectors.append(outlier_model)

        nn_model = KNN()
        nn_model.fit(perm_features)
        knn_trees.append(nn_model)

    return detectors, knn_trees, features_in_permissions
def anomaly_detection(data, label):
    """Run three unsupervised detectors (Isolation Forest, AutoEncoder, LOF)
    on the numeric columns of *data* and report ROC-AUC against *label*.

    Returns (y, ifo_pred, ae_pred, lof_pred): true labels plus each
    detector's binary predictions.
    """
    X = data[data.select_dtypes('number').columns.tolist()]
    y = data[label].values
    X = X.drop([label], axis=1)

    # Standardize the features before fitting the detectors.
    scaler = StandardScaler()
    X = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)

    ifo = IForest(contamination=0.01, behaviour='new', n_estimators=1000,
                  max_samples=1024, n_jobs=-1, verbose=1)
    ifo.fit(X)
    ifo_pred = ifo.labels_
    print('ROC score for Isolation forest: ', roc_auc_score(y, ifo_pred))
    utilities.plot_outlier_scores(
        y, ifo.decision_scores_, bw=0.1,
        title='Fraud, Isolation forest. (n_estimators={})'.format(
            ifo.n_estimators))

    ae = AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                     hidden_activation='relu',
                     output_activation='sigmoid',
                     optimizer='adam',
                     epochs=20,
                     batch_size=128,
                     dropout_rate=0.2,
                     l2_regularizer=0.0,
                     validation_size=0.1,
                     preprocessing=False,
                     verbose=1,
                     random_state=1,
                     contamination=0.01)
    ae.fit(X)
    ae_pred = ae.labels_
    print('ROC score for Autoencoder: ', roc_auc_score(y, ae_pred))
    utilities.plot_outlier_scores(
        y, ae.decision_scores_, bw=0.1,
        title='Fraud, Autoencoder. (epochs={})'.format(ae.epochs))

    # Too long to train, under-sample needed
    lof = LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1)
    lof.fit(X)
    lof_pred = lof.labels_
    print('ROC score for LOF: ', roc_auc_score(y, lof_pred))
    utilities.plot_outlier_scores(
        y, lof.decision_scores_, bw=0.1,
        title='Fraud, Local outliers factor. (n_neighbors={})'.format(
            lof.n_neighbors))

    return y, ifo_pred, ae_pred, lof_pred
def autoencoder_outlier_detection(X_train, X_test, **kwargs):
    """Fit a PyOD AutoEncoder on X_train and score X_test.

    Returns a pd.Series named 'outlier' holding the outlier-class
    probability of each test row (index-aligned when X_test is a DataFrame).
    """
    model = AutoEncoder(**kwargs)
    model.fit(X_train)
    # The last column of predict_proba is the outlier-class probability.
    outlier_prob = model.predict_proba(X_test)[:, -1]
    index = X_test.index if isinstance(X_test, pd.DataFrame) else None
    return pd.Series(outlier_prob, name='outlier', index=index)
def aeAD(self, hidden_neurons, epochs):
    """Train a PyOD AutoEncoder on self.X and report detected anomalies.

    hidden_neurons: layer sizes for the autoencoder.
    epochs: number of training epochs.
    """
    clf_name = 'AutoEncoder'
    model = AutoEncoder(hidden_neurons=hidden_neurons, epochs=epochs)
    model.fit(self.X)
    # Binary labels on the training data (0: inlier, 1: outlier).
    predicted_labels = model.labels_
    # Raw outlier scores (computed for completeness; not used below).
    anomaly_scores = model.decision_scores_
    generateAnomalis(self.data, self.label, predicted_labels)
def __init__(self, hidden_neurons, nu, epochs, batch_size=32,
             output_activation='sigmoid'):
    # Wrap a PyOD AutoEncoder.  `nu` is forwarded as the expected
    # contamination (outlier fraction); validation_size=0 trains on all
    # data with no held-out validation split.
    self.model = AutoEncoder(hidden_neurons=hidden_neurons,
                             contamination=nu,
                             epochs=epochs,
                             batch_size=batch_size,
                             validation_size=0,
                             output_activation=output_activation)
def getOutlierAutoEncoder(dataset):
    '''
    @brief Function that executes AutoEncoder algorithm on the dataset and
    obtains the labels of the dataset indicating which instance is an
    inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels 0 means inlier, 1 means outlier
    '''
    # Bottleneck architecture 8-6-8; verbose=0 keeps training quiet.
    model = AutoEncoder(hidden_neurons=[8, 6, 8], verbose=0)
    model.fit(dataset)
    return model.labels_
def detect_outliers(lst):
    """Fit an AutoEncoder on *lst* and return only the inlying samples,
    logging a warning for every detected outlier."""
    detector = AutoEncoder(verbose=1)
    detector.fit(lst)
    inliers = []
    for idx, sample in enumerate(lst):
        is_outlier = detector.predict(sample.reshape(1, -1))
        if is_outlier:  # predict() yields 1 for outliers
            logger.warning('Found outlier: {0}'.format(idx))
        else:
            inliers.append(sample)
    logger.info('{:.0%} are outliers'.format(1 - len(inliers) / len(lst)))
    return inliers
def setUp(self):
    """Generate a synthetic train/test split and fit the AutoEncoder under test."""
    self.n_train = 6000
    self.n_test = 1000
    self.n_features = 300
    self.contamination = 0.1
    # Minimum acceptable ROC-AUC on the test split.
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        n_features=self.n_features, contamination=self.contamination,
        random_state=42)

    # Few epochs keep the test fast; contamination matches the data.
    self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
    self.clf.fit(self.X_train)
def main():
    """Run every configured outlier model over the CSV datasets, once per scaler."""
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3          # parallel workers across scalers
    CPUS_Models = 4   # parallel workers across models within one scaler
    # Names of models whose results are handled via the generic
    # sklearn/pyod-style path in runByScaler.
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF',
        'HBOS', 'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF',
        'COPOD', 'SOD', 'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD',
        'SoGaal', 'MoGaal', 'VAE', 'AutoEncoder'
    ]
    # Registry of model name -> ready-to-fit estimator instance.
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"
    # Fan out one job per scaler; each job runs the models in parallel too.
    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
def main():
    """Demo: fit an AutoEncoder on random data, inspect test anomaly scores,
    and bucket test samples by a manually chosen score threshold."""
    plt.close('all')
    matplotlib.use('Qt5Agg')  # override PyCharm pro's scientific view
    create_links()
    warnings.showwarning = silence_warnings

    contamination = 0.1  # percentage of outliers
    n_train = 500        # number of training points
    n_test = 500         # number of testing points
    n_features = 25      # Number of features
    X_test, y_test, X_train, y_train = _generate_random_data(
        contamination, n_features, n_test, n_train)
    # X_test, y_test, X_train, y_train = ?
    _plot_using_pca(X_train, y_train)

    clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
    clf1.fit(X_train)
    train_scores = clf1.decision_scores_

    # Predict the anomaly scores on the held-out test data.
    test_scores = pd.Series(clf1.decision_function(X_test))

    # Plot anomaly scores
    plt.hist(test_scores, bins='auto')
    plt.title("Histogram for Model Clf1 Anomaly Scores")
    plt.show()

    manual_score_thres = 4
    df_test = X_test.copy()
    df_test['score'] = test_scores
    # assign cluster=0 to samples with low anomaly score, and cluster=1 to samples with high anomaly score.
    df_test['cluster'] = np.where(df_test['score'] < manual_score_thres, 0, 1)
    df_test['cluster'].value_counts()
    df_test.groupby('cluster').mean()
    print(df_test)
def remove_outlier_faces(image_paths: list, image_size: int = 160) -> list:
    """Detect faces in *image_paths*, drop embedding outliers using an
    AutoEncoder, save each inlier face image back to its source path, and
    return the inlier embeddings.

    Bug fixes vs. the previous version: the prediction loop referenced an
    undefined name ``embedding`` (NameError on every iteration), saved to the
    stale ``image_path`` variable left over from the detection loop, and the
    final log message divided by an undefined ``lst``.
    """
    faces = []
    for image_path, bboxes in zip(image_paths, detect_faces(image_paths)):
        im = Image.open(image_path)
        for bbox in bboxes:
            faces.append(Face(idx=image_path, img=im, bbox=bbox))

    clf = AutoEncoder(verbose=1)
    clf.fit([face.embedding for face in faces])

    inliers = []
    for face in faces:
        y = clf.predict(face.embedding.reshape(1, -1))
        if y == 0:  # 0 = inlier, 1 = outlier
            # face.idx stores the path the face was detected in.
            # NOTE(review): assumes Face exposes the cropped face image as
            # `face_img` — confirm against the Face class.
            face.face_img.save(face.idx)
            inliers.append(face.embedding)
    logger.info('{:.0%} are outliers'.format(1 - len(inliers) / len(faces)))
    return inliers
class AutoEncoder(Detector):
    """Detector adapter wrapping PyOD's AutoEncoder implementation."""

    def __init__(self, **kwargs):
        super().__init__()
        self._model = AutoEncoderPyod(**kwargs)

    def _fit(self, data):
        # Train the wrapped model; return self for fluent chaining.
        self._model.fit(data)
        return self

    def _detect(self, data):
        return self._model.predict(data)

    def validate(self, data):
        # PyOD expects 2-D input; lift a 1-D Series to a column vector.
        if isinstance(data, pd.Series):
            return data.values.reshape(-1, 1)
        return data

    def __str__(self):
        return f"{self.__class__.__name__}({self._model})"
def make_mlo(hub, data, train):
    '''
    Create the Machine Learning Object used for this sequence
    '''
    # Peek at the first chunk of each stream to learn the feature width.
    # A chunk from `train` (when present) takes precedence over `data`,
    # matching the original two-loop behavior.
    size = 0
    for source in (data, train):
        for chunk in source:
            size = len(chunk)
            break
    # Symmetric bottleneck: 2n -> n -> n -> 2n.
    return AutoEncoder(hidden_neurons=[size * 2, size, size, size * 2],
                       contamination=0.001)
def __init__(self, hidden_neurons=[32], hidden_activation='relu',
             output_activation='sigmoid', loss=mean_squared_error,
             optimizer='adam', epochs=30, batch_size=10, dropout_rate=0.1,
             l2_regularizer=0.2, validation_size=0.1, preprocessing=True,
             verbose=0, random_state=None, contamination=0.1, BoW=None,
             featurize_confidence="none", entity_check=False,
             prev_turn_context=0, input_feature_map=None, slice_vec=[],
             labels={"none": 0, "error": 1},
             one_hot={"none": [0], "error": [1]}):
    """Initialise both parent classes with the supplied configuration.

    Bug fix: the AutoEncoder base was previously always handed
    ``loss=mean_squared_error`` regardless of the ``loss`` argument; the
    caller-supplied ``loss`` is now forwarded.

    NOTE(review): the mutable defaults ([32], [], and the two dicts) are
    kept for interface compatibility — callers must not mutate them.
    """
    AutoEncoder.__init__(self,
                         hidden_neurons=hidden_neurons,
                         hidden_activation=hidden_activation,
                         output_activation=output_activation,
                         loss=loss,  # was hard-coded to mean_squared_error
                         optimizer=optimizer,
                         epochs=epochs,
                         batch_size=batch_size,
                         dropout_rate=dropout_rate,
                         l2_regularizer=l2_regularizer,
                         validation_size=validation_size,
                         preprocessing=preprocessing,
                         verbose=verbose,
                         random_state=random_state,
                         contamination=contamination)
    ErrorClassifier.__init__(self,
                             BoW=BoW,
                             featurize_confidence=featurize_confidence,
                             entity_check=entity_check,
                             prev_turn_context=prev_turn_context,
                             input_feature_map=input_feature_map,
                             slice_vec=slice_vec,
                             labels=labels,
                             one_hot=one_hot)
def use_model(model, df_list, x_columns, params):
    """Score every DataFrame in *df_list* with the chosen anomaly model.

    All models are fitted on df_list[0][x_columns] and then score every
    frame; higher score = more anomalous in every branch.

    model: one of 'knn', 'svm', 'isolationForest', 'autoencoder',
        'lsanomaly'.  The historical misspelling 'ísolationForest' (with an
        accented í) is still accepted for backward compatibility.
    params: per-model hyperparameter dict.
    Returns a list of per-frame score sequences (unknown model names return []).
    """
    predicted = []
    if model == 'knn':
        neigh = NearestNeighbors(n_neighbors=params['n'], p=params['p'])
        neigh.fit(df_list[0][x_columns])
        for frame in df_list:
            # kneighbors returns (distances, indices); mean distance per row.
            distances, _ = neigh.kneighbors(frame[x_columns])
            predicted.append([np.mean(row) for row in distances])
    elif model == 'svm':
        svm = OneClassSVM(kernel=params['kernel'])
        svm.fit(df_list[0][x_columns])
        for frame in df_list:
            scores = svm.score_samples(frame[x_columns])
            maximum = max(scores)
            # Flip so higher means more anomalous.
            predicted.append([maximum - s for s in scores])
    elif model in ('isolationForest', 'ísolationForest'):
        clf = IsolationForest(n_estimators=params['n_estimators'],
                              random_state=0)
        clf.fit(df_list[0][x_columns])
        for frame in df_list:
            scores = clf.score_samples(frame[x_columns])
            predicted.append(list(map(abs, scores)))
    elif model == 'autoencoder':
        clf = AutoEncoder(hidden_neurons=params['hidden_neurons'],
                          verbose=0, random_state=0)
        clf.fit(df_list[0][x_columns])
        for frame in df_list:
            predicted.append(clf.decision_function(frame[x_columns]))
    elif model == 'lsanomaly':
        anomalymodel = lsanomaly.LSAnomaly(sigma=params['sigma'],
                                           rho=params['rho'])
        anomalymodel.fit(df_list[0][x_columns].to_numpy())
        for frame in df_list:
            probs = anomalymodel.predict_proba(frame[x_columns].to_numpy())
            predicted.append([p[1] for p in probs])
    return predicted
class AutoEncoderODD(abstract_occ_model):
    """One-class wrapper exposing an sklearn-style +1/-1 prediction
    interface on top of PyOD's AutoEncoder."""

    def __init__(self, hidden_neurons, nu, epochs, batch_size=32,
                 output_activation='sigmoid'):
        # `nu` is forwarded as the contamination (expected outlier share);
        # validation_size=0 trains on all data.
        self.model = AutoEncoder(hidden_neurons=hidden_neurons,
                                 contamination=nu,
                                 epochs=epochs,
                                 batch_size=batch_size,
                                 validation_size=0,
                                 output_activation=output_activation)

    def fit(self, X):
        self.model.fit(X)

    def predict(self, X):
        # PyOD labels: 0 = inlier, 1 = outlier.
        # Convert to the one-class convention: 1 = inlier, -1 = outlier.
        raw = self.model.predict(X)
        return np.where(raw == 0.0, 1, np.where(raw == 1.0, -1, raw))

    def score_samples(self, X):
        # Negate so that higher = more normal, matching sklearn's convention.
        return -self.model.decision_function(X)
def fit(self, X, contamination=0.01):
    """
    Fit detector

    Args:
        X: pd.DataFrame
        contamination: expected fraction of outliers, forwarded to each
            underlying detector.
    """
    # Ensemble of detectors (currently a single AutoEncoder).
    self.detectors = {
        "auto_encoder": AutoEncoder(
            epochs=256,
            validation_size=0,
            preprocessing=False,
            verbose=0,
            contamination=contamination,
        ),
    }
    # print("train_data.shape:", X.shape)

    # Data preprocessing
    # Standardization (zero mean / unit variance); keep the scaler for reuse.
    X_train_norm, self.data_norm_scalar = standardizer(X, keep_scalar=True)
    # Min-max normalization on top of the standardized data.
    X_train_unif, self.data_unif_scalar = minmaxizer(X_train_norm, keep_scalar=True)

    train_scores = np.zeros([X.shape[0], len(self.detectors)])
    thresholds = np.zeros([1, len(self.detectors)])

    # Train each detector; collect per-sample scores and its threshold.
    for i, clf_name in enumerate(self.detectors):
        clf = self.detectors[clf_name]
        clf.fit(X_train_unif)
        train_scores[:, i] = clf.decision_scores_
        thresholds[:, i] = clf.threshold_

    # Training-set anomaly scores and threshold: standardize the per-detector
    # scores, then average across detectors (thresholds get the same scaling
    # so they stay comparable to the scores).
    train_scores_norm, self.score_scalar = standardizer(train_scores, keep_scalar=True)
    thresholds_norm = self.score_scalar.transform(thresholds)
    self.decision_scores = pd.DataFrame(average(train_scores_norm), index=X.index)
    self.decision_scores.columns = ["score"]
    self.threshold = average(thresholds_norm)[0]
    self.label = self.get_label(self.decision_scores)
def choose_model(model, nnet):
    """ among implemented in PyOD

    Return a PyOD detector instance selected by name.

    model: key such as 'AE', 'VAE', 'KNN', ...
    nnet: hidden-layer sizes used by the neural models ('AE'/'VAE').
    Raises KeyError for unknown names (same as before).

    Improvement: the previous version eagerly instantiated *every* detector
    (including the neural AE/VAE, whose construction is expensive) just to
    return one of them; detectors are now built lazily via factories.
    """
    factories = {
        'AE': lambda: AutoEncoder(hidden_neurons=nnet, contamination=0.1,
                                  epochs=15),
        'VAE': lambda: VAE(encoder_neurons=nnet[:5], decoder_neurons=nnet[4:],
                           contamination=0.1, epochs=13),
        'ABOD': ABOD,
        'FeatureBagging': FeatureBagging,
        'HBOS': HBOS,
        'IForest': IForest,
        'KNN': KNN,
        'LOF': LOF,
        'OCSVM': OCSVM,
        'PCA': PCA,
        'SOS': SOS,
        'COF': COF,
        'CBLOF': CBLOF,
        'SOD': SOD,
        'LOCI': LOCI,
        'MCD': MCD
    }
    return factories[model]()
def _create_model(hidden_neurons=None, hidden_activation='relu',
                  output_activation='sigmoid', optimizer='adam', epochs=100,
                  batch_size=32, dropout_rate=0.2, l2_regularizer=0.1,
                  validation_size=0.1, preprocessing=True, verbose=2,
                  random_state=42, contamination=0.1,
                  loss='binary_crossentropy'):
    """(Internal helper) Created an Autoencoder instance"""
    # Announce the key configuration before building the model.
    print(
        "Creating an Autoencoder with\nOutput Activation: {}\nLoss: {}\nOptimizer: {}"
        .format(output_activation, loss, optimizer))
    # epochs/batch_size are coerced to int so float-valued hyperparameter
    # search results are accepted.
    model = AutoEncoder(hidden_neurons=hidden_neurons,
                        hidden_activation=hidden_activation,
                        output_activation=output_activation,
                        optimizer=optimizer,
                        epochs=int(epochs),
                        batch_size=int(batch_size),
                        dropout_rate=dropout_rate,
                        l2_regularizer=l2_regularizer,
                        validation_size=validation_size,
                        preprocessing=preprocessing,
                        verbose=verbose,
                        random_state=random_state,
                        contamination=contamination)
    print('Created Model: {}'.format(model))
    return model
def ele_outliers(num):
    """10-fold evaluation of an AutoEncoder outlier detector trained only on
    'mice' samples (label 0) and evaluated on mixed test folds.

    NOTE(review): relies on module-level names `epochs` and `conta`
    (contamination) — confirm they are defined where this module is used.
    """
    dataSetType = ALL_DATA_TYPE[0]
    trainType = ALL_TRAIN_TYPE[1]
    X, yc = load_data(dataSetType, trainType, num)

    # 10 fold validation
    KF = KFold(n_splits=10, shuffle=True, random_state=10)
    report_list = []
    for train_index, test_index in KF.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = yc[train_index], yc[test_index]
        # split into train and test
        # X_train, X_test, y_train, y_test = train_test_split(X, yc, test_size=0.2, random_state=10)
        # split train to ele and mice
        X_train_ele = X_train[y_train == 1]
        X_train_mice = X_train[y_train == 0]
        # use mice to fit the model mice: 1, ele: -1
        # clf = svm.OneClassSVM(nu=nu, kernel='rbf', gamma='scale')
        # clf = IsolationForest(max_samples=0.2, n_estimators=300, contamination=conta, random_state=rng)
        # clf.fit(X_train_mice)
        clf_name = 'AutoEncoder'
        # Train only on the inlier ('mice') portion of the fold.
        clf = AutoEncoder(hidden_neurons=[256, 64, 20, 64, 256],
                          epochs=epochs, contamination=conta,
                          random_state=10, verbose=0)
        clf.fit(X_train_mice)
        y_pred_test = clf.predict(X_test)
        # get outlier scores
        y_pred_scores = clf.decision_function(X_test)
        c_matrix = confusion_matrix(y_test, y_pred_test)
        print(c_matrix)
        temp_report = classification_report(y_test, y_pred_test, output_dict=True)
        report_list.append(temp_report)
        print(classification_report(y_test, y_pred_test, output_dict=False))
        # evaluate_print(clf_name, y_pred_test, y_pred_scores)
    # Average the per-fold classification reports.
    final_report = get_avg_report(report_list)
    print("final report", final_report)
def pyod_anomaly_detection(type, contamination):
    """Train and evaluate one PyOD detector ('MAD', 'ABOD', or 'AutoEncoder')
    on data produced by data() with the given contamination.

    NOTE(review): the parameter `type` shadows the builtin; kept unchanged
    for interface compatibility.
    """
    X_train, y_train, X_test, y_test = data(type=type, contamination=contamination)
    if type == 'MAD':
        # train MAD detector
        clf_name = 'MAD'
        clf = MAD(threshold=3.5)
        clf.fit(X_train)
        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
        # visualize the results
        # making dimensions = 2 for visualising purpose only. By repeating same data each dimension.
        visualize(clf_name, np.hstack((X_train, X_train)), y_train,
                  np.hstack((X_test, X_test)), y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)
    elif type == 'ABOD':
        # train ABOD detector
        clf_name = 'ABOD'
        clf = ABOD()
        clf.fit(X_train)
        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
        # visualize the results
        visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)
    elif type == 'AutoEncoder':
        # train AutoEncoder detector
        clf_name = 'AutoEncoder'
        clf = AutoEncoder(epochs=30, contamination=contamination)
        clf.fit(X_train)
        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
# 'K Nearest Neighbors (KNN)': KNN( # contamination=outliers_fraction), # 'Average KNN': KNN(method='mean', # contamination=outliers_fraction), # 'Median KNN': KNN(method='median', # contamination=outliers_fraction), # 'Local Outlier Factor (LOF)': # LOF(n_neighbors=35, contamination=outliers_fraction), # 'Minimum Covariance Determinant (MCD) MCD( # contamination=outliers_fraction, random_state=random_state), # 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction, #NOTE: slow, never try again # random_state=random_state), # 'Principal Component Analysis (PCA)': PCA( # contamination=outliers_fraction, random_state=random_state), 'AutoEncoder': AutoEncoder(epochs=2, hidden_neurons=[4, 2, 4], contamination=outliers_fraction), # 'Feature Bagging': # FeatureBagging(LOF(n_neighbors=35), # contamination=outliers_fraction, # check_estimator=False, # random_state=random_state), # 'Angle-based Outlier Detector (ABOD)': # ABOD(n_neighbors=10, # contamination=outliers_fraction), } RunPyodOutlier(classifiers, outlier_save_path, isExtract=True) # RunPyodOutlier(classifiers,outlier_save_path,isExtract=False)
# Demo script: fit a PyOD AutoEncoder on synthetic data and evaluate it.
contamination = 0.1  # percentage of outliers
n_train = 20000      # number of training points
n_test = 2000        # number of testing points
n_features = 300     # number of features

# Generate sample data
X_train, y_train, X_test, y_test = generate_data(n_train=n_train,
                                                 n_test=n_test,
                                                 n_features=n_features,
                                                 contamination=contamination,
                                                 random_state=42)

# train AutoEncoder detector
clf_name = 'AutoEncoder'
clf = AutoEncoder(epochs=30, contamination=contamination)
clf.fit(X_train)

# Training-set results: binary labels (0: inliers, 1: outliers) and raw scores.
y_train_pred = clf.labels_
y_train_scores = clf.decision_scores_

# Test-set predictions: outlier labels (0 or 1) and outlier scores.
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
# Sweep a range of contamination rates over the (scaled) SCADA data and
# benchmark five detectors at each rate.
scalar.fit(scada_data)
scada_data = scalar.transform(scada_data)

contamination_fracs = [0.5, 0.4, 0.3, 0.2, 0.1, 0.08, 0.05, 0.04, 0.03,
                       0.02, 0.01, 0.008, 0.005, 0.004, 0.003, 0.002, 0.001]
all_report_dfs = []
for anomaly_frac in contamination_fracs:
    print("Running models with {} contamination rate".format(anomaly_frac))
    # Instantiate models
    model_names = ['knn', 'pca', 'cblof', 'iforest', 'autoencoder']
    models = [
        KNN(contamination=anomaly_frac, algorithm='kd_tree', n_neighbors=13,
            n_jobs=8),
        PCA(contamination=anomaly_frac, svd_solver='auto',
            standardization=False),
        CBLOF(contamination=anomaly_frac, n_clusters=16, n_jobs=8),
        # Bug fix: IForest previously used a hard-coded contamination=0.1
        # while every other model in the sweep tracked anomaly_frac.
        IForest(contamination=anomaly_frac, n_estimators=100, n_jobs=8,
                behaviour='new'),
        AutoEncoder(contamination=anomaly_frac, hidden_neurons=[4, 2, 2, 4],
                    hidden_activation='tanh', batch_size=5000, epochs=200,
                    preprocessing=False, verbose=0)
    ]
    X_train, X_test, y_train, y_test = generate_train_test(
        scada_data, contamination=anomaly_frac, sensor_failure=fail,
        offset_pct=offset)
    reports = [train_and_evaluate(model, name, X_train, X_test, y_test,
                                  anomaly_frac)
               for model, name in zip(models, model_names)]
    all_report_dfs.append(pd.DataFrame(reports))

# Concatenate all per-rate reports and persist them (typo 'fulll' fixed;
# the name is local to this script).
full_report_df = pd.concat(all_report_dfs)
print(full_report_df)
full_report_df.to_csv(log_dir + "full_report.csv")
def main():
    """Run the full benchmark: model predictions, scoring, and plots."""
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod
    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'
    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying ' plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ], scalers=[
        'Without scaler', 'Min max scaler', 'Standard scaler',
        'Without scaler', 'Min max scaler', 'Standard scaler'
    ])
    # NOTE(review): the second path below is 'ave/no' but its name/title say
    # "ave minmax" — looks like a copy-paste slip; confirm the intended path.
    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with standard scaling'
        ])
pyod_umap_big_dim = EvalRun("pyod_umap_big_dim", [doc2vecwikiall], [imdb_20news_3splits], [], [ PyodDetector(HBOS, "HBOS"), PyodDetector(IForest, "iForest"), PyodDetector(LOF, "LOF"), PyodDetector(OCSVM, "OCSVM"), PyodDetector(PCA, "PCA") ]) pyod_autoencoder_test = EvalRun( "pyod_autoencoder_test", [doc2vecwikiall, longformer_large], [imdb_20news_3splits], [NoReduction()], [ PyodDetector(VAE(epochs=30, verbosity=1), "VAE_30"), PyodDetector(VAE(epochs=100, verbosity=1), "VAE_100"), PyodDetector(AutoEncoder(epochs=30, verbose=1), "AE_30"), PyodDetector(AutoEncoder(epochs=100, verbose=2), "AE_100") ]) pyod_autoencer_refined = EvalRun( "pyod_autoencer_refined", [doc2vecwikiall, doc2vecapnews], [imdb_20news_3split_fracs], [], [ PyodDetector( AutoEncoder(hidden_neurons=[32, 16, 16, 32], epochs=30, verbose=1), "AE_30_small"), PyodDetector(AutoEncoder(epochs=10, verbose=1), "AE_10"), PyodDetector(AutoEncoder(epochs=30, verbose=1), "AE_30"), PyodDetector(AutoEncoder(epochs=100, verbose=2), "AE_100") ]) pyod_autoencer_refined_small = EvalRun(
] test_index = [item for item in list(data.index) if item not in train_index] train = data.loc[train_index, df.columns].reset_index(drop=False) test = data.loc[test_index, df.columns].reset_index(drop=False) train = train.apply(pd.to_numeric) test = test.apply(pd.to_numeric) train_x = train.drop(columns=['user_id', 'index']) test_x = test.drop(columns=['user_id', 'index']) np.any(np.isnan(train_x)) np.all(np.isfinite(train_x)) train_norm = StandardScaler().fit_transform(train_x.dropna()) test_norm = StandardScaler().fit_transform(test_x.dropna()) clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25]) clf1.fit(train_norm) y_train_scores = clf1.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf1.predict(test_norm) # outlier labels (0 or 1) y_test_scores = clf1.decision_function(test_norm) # outlier scores y_test_pred = pd.Series(y_test_pred) y_test_scores = pd.Series(y_test_scores) y_test_pred.value_counts() y_test_scores.describe()
# Project the training data to 2-D for visualisation.
x_pca = pca.fit_transform(X_train)
x_pca = pd.DataFrame(x_pca)
x_pca.columns = ['PC1', 'PC2']

# Plot
import matplotlib.pyplot as plt
plt.scatter(X_train[0], X_train[1], c=y_train, alpha=0.8)
plt.title('Scatter plot')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

# Step 1: Build the model — three autoencoders of increasing depth.
clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(X_train)

clf2 = AutoEncoder(hidden_neurons=[25, 10, 2, 10, 25])
clf2.fit(X_train)

clf3 = AutoEncoder(hidden_neurons=[25, 15, 10, 2, 10, 15, 25])
clf3.fit(X_train)

# Predict the anomaly scores (clf1 only; clf2/clf3 are fitted for comparison).
y_test_scores = pd.Series(clf1.decision_function(X_test))

# Step 2: Determine the cut point
import matplotlib.pyplot as plt
plt.hist(y_test_scores, bins='auto')
class TestAutoEncoder(unittest.TestCase):
    """Unit tests for the PyOD AutoEncoder detector."""

    def setUp(self):
        # Synthetic train/test split; fit once and reuse across tests.
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8  # minimum acceptable test ROC-AUC
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        # All fitted attributes must exist and be populated after fit().
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'model_') and
                self.clf.model_ is not None)

    def test_train_scores(self):
        # One decision score per training sample.
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # Probabilities must lie in [0, 1].
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        # Unknown probability-conversion method must raise ValueError.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        pred_proba, confidence = self.clf.predict_proba(
            self.X_test, method='linear', return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        # Unsupported scoring names must raise.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_model_clone(self):
        # for deep models this may not apply
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass