def detectWithAutoencoder(self):
    '''
    Apply the Autoencoder Detection Method.

    Fits a PyOD AutoEncoder on the observation vector ``self.y``, plots the
    anomaly scores, and returns the (longitude, latitude) pairs of the
    anomalous observations in order of decreasing anomaly score (most
    anomalous first).
    '''
    # Find Model Hyperparameters
    hpMap = self.config['AnomalyDetector']['AutoencoderHyperparameters']
    # Create and Fit the Autoencoder Model on the observations as a
    # single-feature column matrix.
    AE = AutoEncoder(hidden_neurons=[1] * hpMap['depth'])
    AE.fit(self.y.reshape(-1, 1))
    # Get & Plot Anomaly Scores for the Observations
    anomalyScores = AE.decision_scores_
    self.plotAnomalyScores(anomalyScores)
    # BUG FIX: the original stored the *values* self.y[i] here and then used
    # them as indices into self.M — collect the observation indices instead.
    anomalyIdxList = [i for i in range(self.y.shape[0])
                      if anomalyScores[i] >= hpMap['anomalyScoreCutoff']]
    # BUG FIX: sort the anomalous indices by score in *descending* order
    # directly (the original sorted ascending and keyed a dict by score,
    # which both inverted the documented order and silently dropped points
    # with duplicate scores).
    anomalyIdxList.sort(key=lambda idx: anomalyScores[idx], reverse=True)
    # Report the Lon/Lat Points Corresponding to the Anomalies in the Order
    # of Decreasing Anomaly Score (i.e., the Most Anomalous Points First).
    anomaliesLonLatSorted = [
        (self.M['longitude'][idx], self.M['latitude'][idx])
        for idx in anomalyIdxList
    ]
    return anomaliesLonLatSorted
def training(data, img_shape, re_sample_type, text_len, permission_names, extract_f):
    """Train one AutoEncoder outlier model and one KNN model per permission.

    Returns ``(detectors, knn_trees, features_in_permissions)`` where entry
    ``i`` of each list corresponds to ``permission_names[i]``.
    """
    # Load and preprocess the raw training data.
    print('preparing training data')
    inputs, permissions = prepare_training_data(
        data, img_shape, re_sample_type, text_len, permission_names)

    # Run the feature extractor once over all samples.
    print('generating training features')
    features = extract_f.predict(inputs)

    # Per-permission models: outlier detector + nearest-neighbour index.
    print('training outlier model + knn model')
    detectors = []
    knn_trees = []
    features_in_permissions = []  # features in each permission, [permission_id, feature_id]
    for p in permission_names:
        print('training', p, '...')
        # Features of every sample that declares permission p.
        features_current = [features[i] for i, perms in enumerate(permissions)
                            if p in perms]
        features_in_permissions.append(features_current)

        detector = AutoEncoder(epochs=200, verbose=0)
        detector.fit(features_current)
        detectors.append(detector)

        knn = KNN()
        knn.fit(features_current)
        knn_trees.append(knn)

    return detectors, knn_trees, features_in_permissions
def anomaly_detection(data, label):
    """Score a labelled dataset with three unsupervised outlier detectors.

    Fits an Isolation Forest, an AutoEncoder, and a Local Outlier Factor on
    the standardized numeric features (the label column excluded), prints the
    ROC-AUC of each against the true labels, and plots their score
    distributions.

    Returns ``(y, ifo_pred, ae_pred, lof_pred)``: the true labels and each
    detector's binary predictions.
    """
    # Numeric feature matrix, with the target column removed.
    feature_frame = data[data.select_dtypes('number').columns.tolist()]
    y = data[label].values
    feature_frame = feature_frame.drop([label], axis=1)

    # Standardize features; keep the column names for downstream plotting.
    scaler = StandardScaler()
    X = pd.DataFrame(data=scaler.fit_transform(feature_frame),
                     columns=feature_frame.columns)

    # --- Isolation Forest ---
    ifo = IForest(contamination=0.01, behaviour='new', n_estimators=1000,
                  max_samples=1024, n_jobs=-1, verbose=1)
    ifo.fit(X)
    ifo_pred = ifo.labels_
    print('ROC score for Isolation forest: ', roc_auc_score(y, ifo_pred))
    utilities.plot_outlier_scores(
        y, ifo.decision_scores_, bw=0.1,
        title='Fraud, Isolation forest. (n_estimators={})'.format(
            ifo.n_estimators))

    # --- AutoEncoder ---
    ae = AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                     hidden_activation='relu',
                     output_activation='sigmoid',
                     optimizer='adam',
                     epochs=20,
                     batch_size=128,
                     dropout_rate=0.2,
                     l2_regularizer=0.0,
                     validation_size=0.1,
                     preprocessing=False,
                     verbose=1,
                     random_state=1,
                     contamination=0.01)
    ae.fit(X)
    ae_pred = ae.labels_
    print('ROC score for Autoencoder: ', roc_auc_score(y, ae_pred))
    utilities.plot_outlier_scores(
        y, ae.decision_scores_, bw=0.1,
        title='Fraud, Autoencoder. (epochs={})'.format(ae.epochs))

    # --- Local Outlier Factor ---
    # Too long to train, under-sample needed
    lof = LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1)
    lof.fit(X)
    lof_pred = lof.labels_
    print('ROC score for LOF: ', roc_auc_score(y, lof_pred))
    utilities.plot_outlier_scores(
        y, lof.decision_scores_, bw=0.1,
        title='Fraud, Local outliers factor. (n_neighbors={})'.format(
            lof.n_neighbors))

    return y, ifo_pred, ae_pred, lof_pred
def autoencoder_outlier_detection(X_train, X_test, **kwargs):
    """Fit a PyOD AutoEncoder on ``X_train`` and score ``X_test``.

    Returns a :class:`pandas.Series` named ``'outlier'`` holding the outlier
    probability (last column of ``predict_proba``). When ``X_test`` is a
    DataFrame its index is preserved on the result.
    """
    detector = AutoEncoder(**kwargs)
    detector.fit(X_train)
    outlier_prob = detector.predict_proba(X_test)[:, -1]
    # Reuse the DataFrame index when available; otherwise fall back to the
    # default RangeIndex.
    result_index = X_test.index if isinstance(X_test, pd.DataFrame) else None
    return pd.Series(outlier_prob, name='outlier', index=result_index)
def aeAD(self, hidden_neurons, epochs):
    """Train an AutoEncoder detector on ``self.X`` and report its anomalies.

    The binary training labels (0 = inlier, 1 = outlier) are forwarded to
    ``generateAnomalis`` together with the stored data and label column.
    """
    clf_name = 'AutoEncoder'
    detector = AutoEncoder(hidden_neurons=hidden_neurons, epochs=epochs)
    detector.fit(self.X)
    # Training-set outputs: binary labels and raw anomaly scores.
    y_pred = detector.labels_  # binary labels (0: inliers, 1: outliers)
    y_scores = detector.decision_scores_  # raw outlier scores
    generateAnomalis(self.data, self.label, y_pred)
def getOutlierAutoEncoder(dataset):
    '''
    @brief Function that executes AutoEncoder algorithm on the dataset and
    obtains the labels of the dataset indicating which instance is an
    inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels 0 means inlier, 1 means outlier
    '''
    # 8-6-8 hidden architecture, silent training.
    model = AutoEncoder(hidden_neurons=[8, 6, 8], verbose=0)
    model.fit(dataset)
    return model.labels_
def detect_outliers(lst):
    """Fit a PyOD AutoEncoder on ``lst`` and return only the inlier rows.

    Logs a warning for every detected outlier (by row index) and an overall
    outlier ratio, then returns the list of inlier rows.
    """
    clf = AutoEncoder(verbose=1)
    clf.fit(lst)
    # PERF FIX: score all rows with a single batched predict() call instead
    # of invoking the model once per row inside the loop.
    labels = clf.predict(lst)  # 1 == outlier, 0 == inlier
    inliers = []
    for index, (data, label) in enumerate(zip(lst, labels)):
        if label:
            logger.warning('Found outlier: {0}'.format(index))
        else:
            inliers.append(data)
    logger.info('{:.0%} are outliers'.format(1 - len(inliers) / len(lst)))
    return inliers
def main():
    """Demo: fit an AutoEncoder on random data and threshold its scores."""
    plt.close('all')
    matplotlib.use('Qt5Agg')  # override PyCharm pro's scientific view
    create_links()
    warnings.showwarning = silence_warnings

    # Synthetic benchmark parameters.
    contamination = 0.1  # percentage of outliers
    n_train = 500  # number of training points
    n_test = 500  # number of testing points
    n_features = 25  # Number of features
    X_test, y_test, X_train, y_train = _generate_random_data(
        contamination, n_features, n_test, n_train)
    # X_test, y_test, X_train, y_train = ?
    _plot_using_pca(X_train, y_train)

    # Symmetric encoder/decoder with a 2-neuron bottleneck.
    hidden_neurons = [25, 2, 2, 25]
    detector = AutoEncoder(hidden_neurons=hidden_neurons)
    detector.fit(X_train)
    y_train_scores = detector.decision_scores_

    # Predict the anomaly scores on the held-out data.
    y_test_scores = pd.Series(detector.decision_function(X_test))

    # Plot anomaly scores
    plt.hist(y_test_scores, bins='auto')
    plt.title("Histogram for Model Clf1 Anomaly Scores")
    plt.show()

    # Hand-picked cutoff from inspecting the histogram.
    manual_score_thres = 4
    df_test = X_test.copy()
    df_test['score'] = y_test_scores
    # assign cluster=0 to samples with low anomaly score, and cluster=1 to samples with high anomaly score.
    df_test['cluster'] = np.where(df_test['score'] < manual_score_thres, 0, 1)
    df_test['cluster'].value_counts()
    df_test.groupby('cluster').mean()
    print(df_test)
class AutoEncoder(Detector):
    """Detector adapter wrapping the PyOD AutoEncoder implementation."""

    def __init__(self, **kwargs):
        super().__init__()
        # All keyword arguments are forwarded verbatim to the PyOD model.
        self._model = AutoEncoderPyod(**kwargs)

    def _fit(self, data):
        # Delegate training to the wrapped model; return self for chaining.
        self._model.fit(data)
        return self

    def _detect(self, data):
        # Binary outlier labels from the wrapped model.
        return self._model.predict(data)

    def validate(self, data):
        # A 1-D Series must become a single-feature column matrix; anything
        # else is passed through untouched.
        return data.values.reshape(-1, 1) if isinstance(data, pd.Series) else data

    def __str__(self):
        return f"{self.__class__.__name__}({self._model})"
def remove_outlier_faces(image_paths: list, image_size: int = 160) -> list:
    """Crop faces from ``image_paths``, drop embedding outliers, save inliers.

    Detects faces in every image, fits an AutoEncoder on the face embeddings,
    and keeps only the faces the model labels as inliers; each inlier's crop
    is written back to its own source path.

    Returns the list of inlier embeddings.
    """
    faces = []
    for image_path, bboxes in zip(image_paths, detect_faces(image_paths)):
        im = Image.open(image_path)
        for bbox in bboxes:
            faces.append(Face(idx=image_path, img=im, bbox=bbox))

    clf = AutoEncoder(verbose=1)
    clf.fit([face.embedding for face in faces])

    inliers = []
    for face in faces:
        # BUG FIX: the original referenced the undefined names `embedding`
        # and `lst`; use the current face's embedding and the faces list.
        label = clf.predict(face.embedding.reshape(1, -1))
        if label == 0:  # 0 == inlier
            # BUG FIX: save to the face's own source path (face.idx), not to
            # the stale `image_path` left over from the detection loop.
            face.face_img.save(face.idx)
            inliers.append(face.embedding)
    logger.info('{:.0%} are outliers'.format(1 - len(inliers) / len(faces)))
    return inliers
def ele_outliers(num):
    """10-fold AutoEncoder outlier detection for elephant-flow detection.

    Fits the detector on mice flows only (label 0) of each training fold and
    treats predicted outliers on the test fold as elephants (label 1),
    printing a confusion matrix and classification report per fold and an
    averaged report at the end.

    NOTE(review): relies on module-level names `epochs`, `conta`,
    ALL_DATA_TYPE, ALL_TRAIN_TYPE, load_data and get_avg_report that are not
    visible in this chunk — confirm they are defined before this runs.
    """
    dataSetType = ALL_DATA_TYPE[0]
    trainType = ALL_TRAIN_TYPE[1]
    X, yc = load_data(dataSetType, trainType, num)
    # 10 fold validation
    KF = KFold(n_splits=10, shuffle=True, random_state=10)
    report_list = []
    for train_index, test_index in KF.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = yc[train_index], yc[test_index]
        # split into train and test
        # X_train, X_test, y_train, y_test = train_test_split(X, yc, test_size=0.2, random_state=10)
        # split train to ele and mice
        X_train_ele = X_train[y_train == 1]  # NOTE(review): unused in this fold loop
        X_train_mice = X_train[y_train == 0]
        # use mice to fit the model mice: 1, ele: -1
        # clf = svm.OneClassSVM(nu=nu, kernel='rbf', gamma='scale')
        # clf = IsolationForest(max_samples=0.2, n_estimators=300, contamination=conta, random_state=rng)
        # clf.fit(X_train_mice)
        clf_name = 'AutoEncoder'
        # Symmetric 256-64-20-64-256 architecture; trained on mice only so
        # elephants should surface as reconstruction outliers.
        clf = AutoEncoder(hidden_neurons=[256, 64, 20, 64, 256], epochs=epochs,
                          contamination=conta, random_state=10, verbose=0)
        clf.fit(X_train_mice)
        y_pred_test = clf.predict(X_test)
        # get outlier scores
        y_pred_scores = clf.decision_function(X_test)
        c_matrix = confusion_matrix(y_test, y_pred_test)
        print(c_matrix)
        temp_report = classification_report(y_test, y_pred_test, output_dict=True)
        report_list.append(temp_report)
        print(classification_report(y_test, y_pred_test, output_dict=False))
        # evaluate_print(clf_name, y_pred_test, y_pred_scores)
    final_report = get_avg_report(report_list)
    print("final report", final_report)
class AutoEncoderODD(abstract_occ_model):
    """One-class wrapper around the PyOD AutoEncoder exposing sklearn-style
    +1 (inlier) / -1 (outlier) predictions."""

    def __init__(self, hidden_neurons, nu, epochs, batch_size=32,
                 output_activation='sigmoid'):
        # `nu` plays the role of the expected contamination fraction.
        self.model = AutoEncoder(hidden_neurons=hidden_neurons,
                                 contamination=nu,
                                 epochs=epochs,
                                 batch_size=batch_size,
                                 validation_size=0,
                                 output_activation=output_activation)

    def fit(self, X):
        self.model.fit(X)

    def predict(self, X):
        # PyOD labels: 0 == inlier, 1 == outlier. Map them to the OCC
        # convention (+1 / -1), leaving any other value untouched.
        labels = self.model.predict(X)
        converted = np.where(labels == 1.0, -1, labels)
        return np.where(labels == 0.0, 1, converted)

    def score_samples(self, X):
        # Negate so that larger values mean "more normal", as sklearn expects.
        return -self.model.decision_function(X)
# t-SNE projection of the deduplicated feature set `unique4`, coloured by the
# Isolation Forest labels fitted earlier (iforest4 — defined in a prior cell).
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
# Reduce the redundant data to 2 components for plotting.
X24 = tsne.fit_transform(unique4)
plt.figure(figsize=(20, 20))
plt.scatter(X24[:, 0], X24[:, 1], c=iforest4.labels_)
plt.show()

# In[29]:
# Fit an AutoEncoder (16-8-8-16 architecture) on the second deduplicated set.
from pyod.models.auto_encoder import AutoEncoder
autoencoder2 = AutoEncoder(hidden_neurons=[16, 8, 8, 16])
autoencoder2.fit(unique2)

# In[30]:
# Same t-SNE visualisation, now coloured by the AutoEncoder's labels.
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
# Reduce the redundant data to 2 components for plotting.
X42 = tsne.fit_transform(unique2)
plt.figure(figsize=(20, 20))
plt.scatter(X42[:, 0], X42[:, 1], c=autoencoder2.labels_)
plt.show()

# In[1]:
# NOTE(review): this rebinds autoencoder2 to an *unfitted* model, discarding
# the fit above — looks like notebook residue; confirm it is intentional.
autoencoder2 = AutoEncoder(hidden_neurons=[16, 8, 8, 16])
# Split rows not used for training into the test set.
# PERF FIX: test membership against a set (O(1)) instead of a list (O(n)).
_train_index_set = set(train_index)
test_index = [item for item in list(data.index) if item not in _train_index_set]
train = data.loc[train_index, df.columns].reset_index(drop=False)
test = data.loc[test_index, df.columns].reset_index(drop=False)
train = train.apply(pd.to_numeric)
test = test.apply(pd.to_numeric)
train_x = train.drop(columns=['user_id', 'index'])
test_x = test.drop(columns=['user_id', 'index'])
# NOTE(review): these two checks compute NaN/finite flags but discard the
# results — notebook residue; consider asserting or printing them.
np.any(np.isnan(train_x))
np.all(np.isfinite(train_x))
# BUG FIX: scale the test split with the scaler fitted on the *training*
# split. The original fitted a second StandardScaler on the test data,
# leaking test statistics and putting train/test scores on different scales.
scaler = StandardScaler().fit(train_x.dropna())
train_norm = scaler.transform(train_x.dropna())
test_norm = scaler.transform(test_x.dropna())
clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(train_norm)
y_train_scores = clf1.decision_scores_  # raw outlier scores
# get the prediction on the test data
y_test_pred = clf1.predict(test_norm)  # outlier labels (0 or 1)
y_test_scores = clf1.decision_function(test_norm)  # outlier scores
y_test_pred = pd.Series(y_test_pred)
y_test_scores = pd.Series(y_test_scores)
y_test_pred.value_counts()
y_test_scores.describe()
# Synthetic benchmark dimensions.
n_train = 20000  # number of training points
n_test = 2000  # number of testing points
n_features = 300  # number of features

# Generate sample data with injected outliers.
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=n_features,
    contamination=contamination,
    random_state=42)

# Train the AutoEncoder detector.
clf_name = 'AutoEncoder'
clf = AutoEncoder(epochs=30, contamination=contamination)
clf.fit(X_train)

# Training-set outputs: binary labels (0: inliers, 1: outliers) and raw
# anomaly scores.
y_train_pred = clf.labels_
y_train_scores = clf.decision_scores_

# Held-out outputs: labels (0 or 1) and outlier scores.
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)

# Evaluate and print the results on both splits.
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
# Name the principal-component columns for plotting.
x_pca = pd.DataFrame(x_pca)
x_pca.columns = ['PC1', 'PC2']
# Plot
import matplotlib.pyplot as plt
plt.scatter(X_train[0], X_train[1], c=y_train, alpha=0.8)
plt.title('Scatter plot')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
# Step 1: Build the model — three AutoEncoders of increasing depth, all
# bottlenecking at 2 neurons.
clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(X_train)
clf2 = AutoEncoder(hidden_neurons=[25, 10, 2, 10, 25])
clf2.fit(X_train)
clf3 = AutoEncoder(hidden_neurons=[25, 15, 10, 2, 10, 15, 25])
clf3.fit(X_train)
# Predict the anomaly scores
# NOTE(review): the scores below come from clf1 but the histogram title says
# "Clf3" — one of the two is wrong; confirm which model was intended.
y_test_scores = clf1.decision_function(X_test)
y_test_scores = pd.Series(y_test_scores)
# Step 2: Determine the cut point
import matplotlib.pyplot as plt
plt.hist(y_test_scores, bins='auto')
plt.title("Histogram with Model Clf3 Anomaly Scores")
class TestAutoEncoder(unittest.TestCase):
    """Unit tests for the PyOD AutoEncoder detector (plain-assert style)."""

    def setUp(self):
        # Small synthetic benchmark: 6000 train / 1000 test rows, 300
        # features, 10% injected outliers.
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        # Minimum acceptable ROC-AUC on the test split.
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)
        self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        # Fitting must populate all documented attributes.
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'model_') and
                self.clf.model_ is not None)

    def test_train_scores(self):
        # One anomaly score per training sample.
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # Probabilities must lie in [0, 1].
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        # An unknown probability-conversion method must raise ValueError.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test,
                                   method='something')

    def test_prediction_labels_confidence(self):
        # predict(..., return_confidence=True) yields labels + confidences.
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        pred_proba, confidence = self.clf.predict_proba(self.X_test,
                                                        method='linear',
                                                        return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        # An unknown scoring name must raise NotImplementedError.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_model_clone(self):
        # for deep models this may not apply
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
# df.fillna(0)
# df.to_excel('nnViewDataTest.xlsx')
# print(df)
nnData = pd.read_excel("nnViewData.xlsx")
nnData = nnData.drop(['date_time'], axis=1)
# BUG FIX: fit the scaler on the training view only and reuse it for the
# test view. The original fitted a second StandardScaler on the test data,
# so train and test anomaly scores were on different scales.
scaler = StandardScaler().fit(nnData)
dropCleanScale = pd.DataFrame(scaler.transform(nnData))
nnDataTest = pd.read_excel("nnViewDataTest.xlsx")
nnDataTest = nnDataTest.drop(['date_time'], axis=1)
dropCleanScaleTest = pd.DataFrame(scaler.transform(nnDataTest))
# Two AutoEncoders of different depth, both bottlenecking at 2 neurons.
clf1 = AutoEncoder(hidden_neurons=[14, 2, 2, 14])
clf1.fit(dropCleanScale)
y_train_scores1 = clf1.decision_scores_
clf2 = AutoEncoder(hidden_neurons=[14, 10, 2, 10, 14])
clf2.fit(dropCleanScale)
y_train_scores2 = clf2.decision_scores_
y_test1 = clf1.decision_function(dropCleanScaleTest)
y_test2 = clf2.decision_function(dropCleanScaleTest)
## plotting the Remaining lifetime score — both models on one histogram
plt.hist(y_test1, bins='auto', color='green')
plt.hist(y_test2, bins='auto', color='blue')
plt.title("Histogram for Model Clf1 Anomaly Scores")
plt.show()
def ele_outliers(num):
    """10-fold AutoEncoder outlier detection for elephant-flow detection
    (CSV-file variant).

    Loads decimal and binary flow features from fixed CSV paths, thresholds
    flow size into mice (0) / elephants (1), fits the detector on mice flows
    only per training fold, and treats predicted outliers on the test fold
    as elephants. Prints per-fold confusion matrices / reports and an
    averaged report.

    NOTE(review): relies on module-level names `thres`, `epochs`, `conta`
    and get_avg_report that are not visible in this chunk — confirm they are
    defined before this runs.
    """
    # num = 10
    # fileName1 = "/data/sym/one-class-svm/data/mean_of_five/dec-feature/caida-A-50W-5-{}.csv".format(num)
    # fileName2 = "/data/sym/one-class-svm/data/mean_of_five/bin-feature/caida-A-50W-5-{}.csv".format(num)
    # fileName1 = "/data/sym/one-class-svm/data/mean_of_five/dec-feature/univ1-50W-{0}-{1}.csv".format(5, num)
    # fileName2 = "/data/sym/one-class-svm/data/mean_of_five/bin-feature/univ1-50W-{0}-{1}.csv".format(5, num)
    fileName1 = "data/dec-test.csv"
    fileName2 = "data/bin-test.csv"
    df = pd.read_csv(fileName1)
    dfb = pd.read_csv(fileName2)
    # conver to matrix
    X = dfb.values
    # Recode binary string features to -1 / +1.
    X[X == '0'] = -1
    X[X == '1'] = 1
    yr = df['flowSize']
    # thres = int(sys.argv[1])
    # Threshold flow size into mice (0) and elephants (1).
    yc = yr.copy(deep=True)
    yc[yr <= thres] = 0
    yc[yr > thres] = 1
    print("original mice count: ", sum(yc == 0))
    print("original elephant count: ", sum(yc == 1))
    # 10 fold validation
    KF = KFold(n_splits=10, shuffle=True, random_state=10)
    report_list = []
    for train_index, test_index in KF.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = yc[train_index], yc[test_index]
        # split into train and test
        # X_train, X_test, y_train, y_test = train_test_split(X, yc, test_size=0.2, random_state=10)
        # split train to ele and mice
        X_train_ele = X_train[y_train == 1]  # NOTE(review): unused in this fold loop
        X_train_mice = X_train[y_train == 0]
        # use mice to fit the model mice: 1, ele: -1
        # clf = svm.OneClassSVM(nu=nu, kernel='rbf', gamma='scale')
        # clf = IsolationForest(max_samples=0.2, n_estimators=300, contamination=conta, random_state=rng)
        # clf.fit(X_train_mice)
        clf_name = 'AutoEncoder'
        # Symmetric 256-64-20-64-256 architecture trained on mice only, so
        # elephants should surface as reconstruction outliers.
        clf = AutoEncoder(hidden_neurons=[256, 64, 20, 64, 256], epochs=epochs,
                          contamination=conta, random_state=10, verbose=0)
        clf.fit(X_train_mice)
        y_pred_test = clf.predict(X_test)
        # get outlier scores
        y_pred_scores = clf.decision_function(X_test)
        c_matrix = confusion_matrix(y_test, y_pred_test)
        print(c_matrix)
        temp_report = classification_report(y_test, y_pred_test, output_dict=True)
        report_list.append(temp_report)
        print(classification_report(y_test, y_pred_test, output_dict=False))
        # evaluate_print(clf_name, y_pred_test, y_pred_scores)
    final_report = get_avg_report(report_list)
    print("final report", final_report)
# Map the first detector's binary predictions to readable states (out_1 was
# initialised before this chunk, so keep appending to it).
for label in pred_1:
    out_1.append('Normal' if label == 0 else 'Abnormal')
state_1 = pd.DataFrame(out_1, columns=['Condition'])
state_1 = state_1.loc[state_1['Condition'] == 'Abnormal']
# Row indices flagged as abnormal by detector 1.
ab_state1 = list(state_1.index.values.tolist())

# Deep Learning model using pyod library
clf_2 = AutoEncoder(hidden_neurons=[15, 64, 32, 64, 15],
                    epochs=350,
                    batch_size=128,
                    preprocessing=False,
                    verbose=0,
                    random_state=1234,
                    contamination=0.1,
                    validation_size=0.3)
clf_2.fit(X_train)
pred_2 = clf_2.predict(X_train)

# Output of the DL model: same Normal/Abnormal mapping.
out_2 = ['Normal' if label == 0 else 'Abnormal' for label in pred_2]
state_2 = pd.DataFrame(out_2, columns=['Condition'])
state_2 = state_2.loc[state_2['Condition'] == 'Abnormal']
# Row indices flagged as abnormal by detector 2.
ab_state2 = list(state_2.index.values.tolist())
# plotting results for three ranges of sensor value and two detectors mentioned above
class TestAutoEncoder(unittest.TestCase):
    """Unit tests for the PyOD AutoEncoder detector (assert_true style)."""

    def setUp(self):
        # Small synthetic benchmark: 6000 train / 1000 test rows, 300
        # features, 10% injected outliers.
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        # Minimum acceptable ROC-AUC on the test split.
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)
        self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        # TODO: fix estimator check for AutoEncoder
        # check_estimator(self.clf)
        pass

    def test_parameters(self):
        # Fitting must populate all documented attributes.
        assert_true(
            hasattr(self.clf, 'decision_scores_') and
            self.clf.decision_scores_ is not None)
        assert_true(
            hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert_true(
            hasattr(self.clf, 'threshold_') and
            self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert_true(
            hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert_true(
            hasattr(self.clf, 'model_') and self.clf.model_ is not None)

    def test_train_scores(self):
        # One anomaly score per training sample.
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # Probabilities must lie in [0, 1].
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # An unknown probability-conversion method must raise ValueError.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        # An unknown scoring name must raise NotImplementedError.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def tearDown(self):
        pass
    # NOTE(review): this chunk appears to begin inside a
    # `for r in range(num_of_experiments):` loop whose header is outside
    # this view — the indentation below is reconstructed; confirm against
    # the full file.
    IF = IsolationForest(random_state=r)
    IF.fit(X_train)
    sklearn_score_anomalies = IF.decision_function(X_test)
    # sklearn's decision_function: larger == more normal; flip the sign to
    # recover the original-paper anomaly-score convention.
    original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
    aucs_if_ws[r] = evaluate.AUC(original_paper_score, y_test)
# Mean AUC over all Isolation Forest repetitions.
auc_if_ws = np.mean(aucs_if_ws)
# --- T2 --- #
y_pred_proba_hot = Hoteliing_SPC_proba(X_train, X_test)
auc_hot_ws = evaluate.AUC(y_pred_proba_hot, y_test)
# --- AutoEncoder --- #
# Repeat the AutoEncoder run with different seeds and average the AUC.
aucs_ae_ws = np.zeros(num_of_experiments)
for r in range(num_of_experiments):
    AE = AutoEncoder(hidden_neurons=[64, 6, 6, 64], random_state=r)
    AE.fit(X_train)
    # Column 1 == probability of being an outlier.
    ae_pred_proba = AE.predict_proba(X_test)[:, 1]
    aucs_ae_ws[r] = evaluate.AUC(ae_pred_proba, y_test)
auc_ae_ws = np.mean(aucs_ae_ws)
# --- one-class-SVM --- #
clf = svm.OneClassSVM(kernel="rbf")
clf.fit(X_train)
sklearn_score_anomalies = clf.decision_function(X_test)
# Same sign-flip convention as above.
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_svm_ws = evaluate.AUC(original_paper_score, y_test)
# --- LOF --- #
# novelty=True enables decision_function on unseen data.
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)
sklearn_score_anomalies = lof.decision_function(X_test)
from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Pull the sensor table and split it into train/test frames.
table_name = 'grow_data_0a05p06e'
sql = f"""SELECT * FROM {table_name}"""
# NOTE(review): create_engine('') has an empty connection URL — presumably
# redacted; fill in before running.
conn = create_engine('')
df = pd.read_sql(sql, conn, parse_dates=['datetime'])
RANDOM_SEED = 101
X_train, X_test = train_test_split(df, test_size=0.1,
                                   random_state=RANDOM_SEED)
# Drop identifier columns, keeping the timestamps aside for later use.
X_train = X_train.drop(['sensor_id'], axis=1)
train_dates = X_train['datetime']
X_train = X_train.drop(['datetime'], axis=1)
X_test = X_test.drop(['sensor_id'], axis=1)
test_dates = X_test['datetime']
X_test = X_test.drop(['datetime'], axis=1)
X_train = X_train.values
X_test = X_test.values
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
# BUG FIX: transform (not fit_transform) the test split so it is scaled with
# the training data's min/max; refitting the scaler on the test set leaks
# test statistics and shifts the score scale.
X_test_scaled = scaler.transform(X_test)
clf = AutoEncoder(hidden_neurons=[2, 1, 2], verbose=0, contamination=0.05)
clf.fit(X_train_scaled)
# Keras-style training history (loss per epoch) for inspection.
df_history = pd.DataFrame(clf.history_)
) x_train = data_dict["train"] x_test = data_dict["test"] x_test_labels = data_dict["test_labels"] start = time.time() # data preprocessing for MSCRED od = AutoEncoder( hidden_neurons=hidden_neurons, batch_size=batch_size, epochs=epochs, l2_regularizer=l2_regularizer, verbose=1, ) od.fit(x_train) # get outlier scores anomaly_score = od.decision_function(x_test) anomaly_label = x_test_labels end = time.time() time = end - start evaluate_all(anomaly_score, anomaly_label) salience = compute_salience(anomaly_score, anomaly_label) print('time') print(' ', time) print('salience')