Esempio n. 1
0
    def detectWithAutoencoder(self):
        '''
        Apply the Autoencoder Detection Method.

        Fits a PyOD AutoEncoder on the observation vector ``self.y`` and
        returns the (longitude, latitude) pairs of the anomalous points,
        ordered by decreasing anomaly score (most anomalous first).
        '''
        # Find Model Hyperparameters
        hpMap = self.config['AnomalyDetector']['AutoencoderHyperparameters']

        # Create and Fit the Autoencoder Model
        AE = AutoEncoder(hidden_neurons=[1 for i in range(hpMap['depth'])])
        AE.fit(self.y.reshape(-1, 1))

        # Get & Plot Anomaly Scores for the Observations
        anomalyScores = AE.decision_scores_
        self.plotAnomalyScores(anomalyScores)

        # Collect the *indices* of anomalous observations.  The original
        # code collected observation values (self.y[i]) but then used them
        # as indices into self.M, and its dict comprehension referenced an
        # undefined loop variable `i`; keying by score also silently
        # dropped points with duplicate scores.
        anomalyIdxList = [i for i in range(self.y.shape[0])
                          if anomalyScores[i] >= hpMap['anomalyScoreCutoff']]

        # Report the Lon/Lat Points Corresponding to the Anomalies in the
        # Order of Decreasing Anomaly Score (i.e., the Most Anomalous
        # Points are Shown First)
        anomalyIdxList.sort(key=lambda idx: anomalyScores[idx], reverse=True)
        return [(self.M['longitude'][idx], self.M['latitude'][idx])
                for idx in anomalyIdxList]
Esempio n. 2
0
def training(data, img_shape, re_sample_type, text_len, permission_names,
             extract_f):
    """Train one AutoEncoder detector and one KNN model per permission.

    Returns (detectors, knn_trees, features_in_permissions), where
    features_in_permissions[i] holds the feature vectors of the samples
    that request permission_names[i].
    """
    # Load and preprocess the raw training samples.
    print('preparing training data')
    inputs, permissions = prepare_training_data(data, img_shape,
                                                re_sample_type, text_len,
                                                permission_names)

    # Run the feature extractor over every input.
    print('generating training features')
    features = extract_f.predict(inputs)

    print('training outlier model + knn model')
    detectors = []
    knn_trees = []
    # features in each permission, [permission_id, feature_id]
    features_in_permissions = []
    for perm in permission_names:
        print('training', perm, '...')
        # Select the features of every sample that requests this permission.
        perm_features = [features[idx]
                         for idx in range(len(permissions))
                         if perm in permissions[idx]]
        features_in_permissions.append(perm_features)

        # One autoencoder-based outlier detector per permission.
        detector = AutoEncoder(epochs=200, verbose=0)
        detector.fit(perm_features)
        detectors.append(detector)

        # One nearest-neighbour model per permission.
        knn = KNN()
        knn.fit(perm_features)
        knn_trees.append(knn)

    return detectors, knn_trees, features_in_permissions
Esempio n. 3
0
def anomaly_detection(data, label):
    """Benchmark three PyOD detectors (IForest, AutoEncoder, LOF) on *data*.

    Parameters:
        data: DataFrame holding numeric feature columns plus the *label*
            column with binary ground truth (1 = outlier/fraud).
        label: name of the ground-truth column.

    Returns:
        (y, ifo_pred, ae_pred, lof_pred): ground-truth array and the
        binary predictions of each detector.
    """
    # Keep only the numeric columns; the label is split off below.
    X = data[data.select_dtypes('number').columns.tolist()]
    y = data[label]
    y = y.values
    X = X.drop([label], axis=1)

    # Standardize features before fitting the detectors.
    sc = StandardScaler()
    X = pd.DataFrame(data=sc.fit_transform(X), columns=X.columns)

    # --- Isolation Forest ---
    ifo = IForest(contamination=0.01,
                  behaviour='new',
                  n_estimators=1000,
                  max_samples=1024,
                  n_jobs=-1,
                  verbose=1)
    ifo.fit(X)
    ifo_pred = ifo.labels_
    print('ROC score for Isolation forest: ', roc_auc_score(y, ifo_pred))
    utilities.plot_outlier_scores(
        y,
        ifo.decision_scores_,
        bw=0.1,
        title='Fraud, Isolation forest. (n_estimators={})'.format(
            ifo.n_estimators))

    # --- AutoEncoder (25-20-15-20-25 bottleneck) ---
    ae = AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                     hidden_activation='relu',
                     output_activation='sigmoid',
                     optimizer='adam',
                     epochs=20,
                     batch_size=128,
                     dropout_rate=0.2,
                     l2_regularizer=0.0,
                     validation_size=0.1,
                     preprocessing=False,
                     verbose=1,
                     random_state=1,
                     contamination=0.01)
    ae.fit(X)
    ae_pred = ae.labels_
    print('ROC score for Autoencoder: ', roc_auc_score(y, ae_pred))
    utilities.plot_outlier_scores(
        y,
        ae.decision_scores_,
        bw=0.1,
        title='Fraud, Autoencoder. (epochs={})'.format(ae.epochs))

    # --- Local Outlier Factor ---
    # Too long to train, under-sample needed
    lof = LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1)
    lof.fit(X)
    lof_pred = lof.labels_
    print('ROC score for LOF: ', roc_auc_score(y, lof_pred))
    utilities.plot_outlier_scores(
        y,
        lof.decision_scores_,
        bw=0.1,
        title='Fraud, Local outliers factor. (n_neighbors={})'.format(
            lof.n_neighbors))

    return y, ifo_pred, ae_pred, lof_pred
Esempio n. 4
0
def autoencoder_outlier_detection(X_train, X_test, **kwargs):
    """Fit a PyOD AutoEncoder on X_train and score X_test.

    Returns a Series named 'outlier' holding the probability of the
    outlier class for each test sample; when X_test is a DataFrame its
    index is preserved.
    """
    model = AutoEncoder(**kwargs)
    model.fit(X_train)
    # Last column of predict_proba is the outlier-class probability.
    outlier_prob = model.predict_proba(X_test)[:, -1]

    index = X_test.index if isinstance(X_test, pd.DataFrame) else None
    return pd.Series(outlier_prob, name='outlier', index=index)
Esempio n. 5
0
 def aeAD(self, hidden_neurons, epochs):
     '''
     Train a PyOD AutoEncoder on self.X and report the detected anomalies.

     Parameters:
         hidden_neurons: list of layer sizes for the autoencoder.
         epochs: number of training epochs.
     '''
     clf = AutoEncoder(hidden_neurons = hidden_neurons, epochs=epochs)
     clf.fit(self.X)
     # get the prediction labels of the training data
     y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
     # NOTE: the original also assigned clf_name and clf.decision_scores_
     # to locals that were never used; both were removed.
     generateAnomalis(self.data, self.label, y_pred)
def getOutlierAutoEncoder(dataset):
    '''
    @brief Function that executes AutoEncoder algorithm on the dataset and obtains the
    labels of the dataset indicating which instance is an inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels 0 means inlier, 1 means outlier
    '''
    # 8-6-8 bottleneck architecture, trained silently.
    model = AutoEncoder(hidden_neurons=[8, 6, 8], verbose=0)
    model.fit(dataset)
    return model.labels_
def detect_outliers(lst):
    """Fit an AutoEncoder on *lst* and return only the inlying samples.

    Each outlier's index is logged as a warning; the overall outlier
    fraction is logged at the end.
    """
    clf = AutoEncoder(verbose=1)
    clf.fit(lst)

    # Predict all samples with one batch call instead of invoking the
    # model once per element as the original did.
    labels = clf.predict(lst)

    inliers = []
    for index, (data, label) in enumerate(zip(lst, labels)):
        if label:  # label == 1 marks an outlier
            logger.warning('Found outlier: {0}'.format(index))
        else:
            inliers.append(data)

    logger.info('{:.0%} are outliers'.format(1 - len(inliers) / len(lst)))
    return inliers
Esempio n. 8
0
def main():
    """Demo: train an AutoEncoder on synthetic data and histogram the
    test-set anomaly scores."""
    plt.close('all')
    matplotlib.use('Qt5Agg')  # override PyCharm pro's scientific view

    create_links()

    warnings.showwarning = silence_warnings
    contamination = 0.1  # percentage of outliers
    n_train = 500  # number of training points
    n_test = 500  # number of testing points
    n_features = 25  # Number of features

    X_test, y_test, X_train, y_train = _generate_random_data(
        contamination, n_features, n_test, n_train)
    # X_test, y_test, X_train, y_train = ?

    _plot_using_pca(X_train, y_train)

    # 25-2-2-25 bottleneck autoencoder; scores come from reconstruction error.
    hidden_neurons = [25, 2, 2, 25]
    clf1 = AutoEncoder(hidden_neurons=hidden_neurons)
    clf1.fit(X_train)
    y_train_scores = clf1.decision_scores_

    # Predict the anomaly scores
    y_test_scores = clf1.decision_function(X_test)  # outlier scores
    y_test_scores = pd.Series(y_test_scores)

    # Plot anomaly scores
    plt.hist(y_test_scores, bins='auto')
    plt.title("Histogram for Model Clf1 Anomaly Scores")
    plt.show()

    # NOTE(review): threshold is hand-picked from the histogram above.
    manual_score_thres = 4
    df_test = X_test.copy()  # assumes X_test is a DataFrame -- TODO confirm
    df_test['score'] = y_test_scores
    # assign cluster=0 to samples with low anomaly score, and cluster=1 to samples with high anomaly score.
    df_test['cluster'] = np.where(df_test['score'] < manual_score_thres, 0, 1)
    df_test['cluster'].value_counts()

    df_test.groupby('cluster').mean()
    print(df_test)
Esempio n. 9
0
class AutoEncoder(Detector):
    """Adapter exposing the PyOD AutoEncoder through the Detector API."""

    def __init__(self, **kwargs):
        super().__init__()
        # All keyword arguments are forwarded to the underlying PyOD model.
        self._model = AutoEncoderPyod(**kwargs)

    def _fit(self, data):
        """Fit the wrapped model and return self for chaining."""
        self._model.fit(data)
        return self

    def _detect(self, data):
        """Return the wrapped model's binary outlier predictions."""
        return self._model.predict(data)

    def validate(self, data):
        """Reshape a Series into a 2-D column array; pass anything else through."""
        return data.values.reshape(-1, 1) if isinstance(data, pd.Series) else data

    def __str__(self):
        return f"{self.__class__.__name__}({self._model})"
Esempio n. 10
0
def remove_outlier_faces(image_paths: list, image_size: int = 160) -> list:
    """Detect faces in *image_paths*, drop outlier embeddings, and return
    the inlier embeddings.

    Inlier face crops are saved back to their source image path.  The
    original body referenced the undefined names ``embedding`` and
    ``lst`` (NameError) and saved to a stale ``image_path``; all three
    are fixed here.  ``image_size`` is currently unused -- kept for
    interface compatibility.
    """
    faces = []
    for image_path, bboxes in zip(image_paths, detect_faces(image_paths)):
        im = Image.open(image_path)
        for bbox in bboxes:
            face = Face(idx=image_path, img=im, bbox=bbox)
            faces.append(face)

    clf = AutoEncoder(verbose=1)
    clf.fit([face.embedding for face in faces])

    inliers = []
    for face in faces:
        y = clf.predict(face.embedding.reshape(1, -1))

        if y == 0:  # 0 = inlier
            face.face_img.save(face.idx)
            inliers.append(face.embedding)

    logger.info('{:.0%} are outliers'.format(1 - len(inliers) / len(faces)))
    return inliers
Esempio n. 11
0
def ele_outliers(num):
    """Run 10-fold elephant/mice flow classification with an AutoEncoder
    trained on mice flows only.

    NOTE(review): ``epochs`` and ``conta`` are not defined in this
    function or visible at module level here -- presumably module-level
    globals; confirm before running.
    """
    dataSetType = ALL_DATA_TYPE[0]
    trainType = ALL_TRAIN_TYPE[1]
    
    # X: features; yc: binary class (1 = elephant, 0 = mice -- per the
    # masks below).
    X, yc = load_data(dataSetType, trainType, num)

    # 10 fold validation
    KF = KFold(n_splits=10, shuffle=True, random_state=10)
    report_list = []
    for train_index, test_index in KF.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = yc[train_index], yc[test_index]

        # split into train and test
        # X_train, X_test, y_train, y_test = train_test_split(X, yc, test_size=0.2, random_state=10)
        # split train to ele and mice
        X_train_ele = X_train[y_train == 1]
        X_train_mice = X_train[y_train == 0]

        # use mice to fit the model mice: 1, ele: -1
        # clf = svm.OneClassSVM(nu=nu, kernel='rbf', gamma='scale')
        # clf = IsolationForest(max_samples=0.2, n_estimators=300, contamination=conta, random_state=rng)
        # clf.fit(X_train_mice)
        clf_name = 'AutoEncoder'
        # Train only on mice flows: elephants should then score as outliers.
        clf = AutoEncoder(hidden_neurons=[256, 64, 20, 64, 256], epochs=epochs, contamination=conta, random_state=10, verbose=0)
        clf.fit(X_train_mice)

        y_pred_test = clf.predict(X_test)
        # get outlier scores
        y_pred_scores = clf.decision_function(X_test)

        c_matrix = confusion_matrix(y_test, y_pred_test)
        print(c_matrix)
        temp_report = classification_report(y_test, y_pred_test, output_dict=True)
        report_list.append(temp_report)
        print(classification_report(y_test, y_pred_test, output_dict=False))
        # evaluate_print(clf_name, y_pred_test, y_pred_scores)
    # Average the per-fold classification reports.
    final_report = get_avg_report(report_list)
    print("final report", final_report)
Esempio n. 12
0
class AutoEncoderODD(abstract_occ_model):
    """One-class wrapper around the PyOD AutoEncoder.

    predict() follows the scikit-learn one-class convention: +1 for
    inliers and -1 for outliers.
    """

    def __init__(self,
                 hidden_neurons,
                 nu,
                 epochs,
                 batch_size=32,
                 output_activation='sigmoid'):
        # ``nu`` plays the role of the contamination fraction.
        self.model = AutoEncoder(hidden_neurons=hidden_neurons,
                                 contamination=nu,
                                 epochs=epochs,
                                 batch_size=batch_size,
                                 validation_size=0,
                                 output_activation=output_activation)

    def fit(self, X):
        """Fit the underlying autoencoder on X."""
        self.model.fit(X)

    def predict(self, X):
        """Map PyOD labels (0 = inlier, 1 = outlier) onto +1 / -1."""
        raw = self.model.predict(X)
        return np.where(raw == 0.0, 1, np.where(raw == 1.0, -1, raw))

    def score_samples(self, X):
        """Negated decision scores, so higher means more normal."""
        return -self.model.decision_function(X)
from sklearn.manifold import TSNE
# Project the data to 2-D for visualization.
tsne = TSNE(n_components=2)

# Reduce to two components for plotting (variables unique4/iforest4 are
# defined earlier in the notebook -- not visible here).
X24 = tsne.fit_transform(unique4)
plt.figure(figsize=(20, 20))
plt.scatter(X24[:, 0], X24[:, 1], c=iforest4.labels_)
plt.show()

# In[29]:

from pyod.models.auto_encoder import AutoEncoder

# 16-8-8-16 bottleneck autoencoder fitted on unique2.
autoencoder2 = AutoEncoder(hidden_neurons=[16, 8, 8, 16])
autoencoder2.fit(unique2)

# In[30]:

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)

# Reduce to two components, colored by the autoencoder's outlier labels.
X42 = tsne.fit_transform(unique2)
plt.figure(figsize=(20, 20))
plt.scatter(X42[:, 0], X42[:, 1], c=autoencoder2.labels_)
plt.show()

# In[1]:

# NOTE(review): this rebuilds the model without fitting it -- any later
# attribute access on this fresh instance would fail until fit() is called.
autoencoder2 = AutoEncoder(hidden_neurons=[16, 8, 8, 16])
Esempio n. 14
0
# Everything not in train_index becomes the test set.
test_index = [item for item in list(data.index) if item not in train_index]

train = data.loc[train_index, df.columns].reset_index(drop=False)
test = data.loc[test_index, df.columns].reset_index(drop=False)
train = train.apply(pd.to_numeric)
test = test.apply(pd.to_numeric)
# Drop identifier columns before modelling.
train_x = train.drop(columns=['user_id', 'index'])
test_x = test.drop(columns=['user_id', 'index'])

# Sanity checks for NaN / infinite values (results are not captured).
np.any(np.isnan(train_x))
np.all(np.isfinite(train_x))
# NOTE(review): the scaler is re-fitted on the test set here; for a fair
# evaluation the train-set scaler should be reused with .transform().
train_norm = StandardScaler().fit_transform(train_x.dropna())
test_norm = StandardScaler().fit_transform(test_x.dropna())

# 25-2-2-25 bottleneck autoencoder.
clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(train_norm)

y_train_scores = clf1.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf1.predict(test_norm)  # outlier labels (0 or 1)

y_test_scores = clf1.decision_function(test_norm)  # outlier scores

y_test_pred = pd.Series(y_test_pred)
y_test_scores = pd.Series(y_test_scores)

y_test_pred.value_counts()

y_test_scores.describe()
    # NOTE(review): fragment of a larger function whose definition is not
    # visible here; ``contamination`` comes from the enclosing scope.
    n_train = 20000  # number of training points
    n_test = 2000  # number of testing points
    n_features = 300  # number of features

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=n_features,
                      contamination=contamination,
                      random_state=42)

    # train AutoEncoder detector
    clf_name = 'AutoEncoder'
    clf = AutoEncoder(epochs=30, contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Esempio n. 16
0
# Wrap the PCA projection in a labelled DataFrame.
x_pca = pd.DataFrame(x_pca)
x_pca.columns=['PC1','PC2']

# Plot
import matplotlib.pyplot as plt
plt.scatter(X_train[0], X_train[1], c=y_train, alpha=0.8)
plt.title('Scatter plot')
plt.xlabel('x')
plt.ylabel('y')
plt.show()



# Step 1: Build the model
# Three autoencoders with increasingly deep bottleneck architectures.
clf1 = AutoEncoder(hidden_neurons =[25, 2, 2, 25])
clf1.fit(X_train)

clf2 = AutoEncoder(hidden_neurons =[25, 10,2, 10, 25])
clf2.fit(X_train)

clf3 = AutoEncoder(hidden_neurons =[25, 15, 10, 2, 10,15, 25])
clf3.fit(X_train)

# Predict the anomaly scores
# NOTE(review): scores come from clf1, but the histogram title below
# says "Clf3" -- one of the two is wrong; confirm which model was meant.
y_test_scores = clf1.decision_function(X_test)  
y_test_scores = pd.Series(y_test_scores)

# Step 2: Determine the cut point
import matplotlib.pyplot as plt
plt.hist(y_test_scores, bins='auto')  
plt.title("Histogram with Model Clf3 Anomaly Scores")
Esempio n. 17
0
class TestAutoEncoder(unittest.TestCase):
    """Unit tests for the PyOD AutoEncoder detector: fitted attributes,
    score/label shapes, probability bounds, and ROC performance on
    synthetic data."""

    def setUp(self):
        # Fit once on synthetic data; every test reuses self.clf.
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8  # minimum acceptable test-set ROC AUC
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)

        self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        # All fitted attributes must exist and be non-None after fit().
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'model_') and self.clf.model_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # Probabilities must stay within [0, 1] for every method variant.
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        # Unknown conversion methods must raise ValueError.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        pred_proba, confidence = self.clf.predict_proba(self.X_test,
                                                        method='linear',
                                                        return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Each supported scoring keyword must run; unknown ones must raise.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_model_clone(self):
        # for deep models this may not apply
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
Esempio n. 18
0
# df.fillna(0)
# df.to_excel('nnViewDataTest.xlsx')
# print(df)

# Load and standardize the training view data.
nnData = pd.read_excel("nnViewData.xlsx")
nnData = nnData.drop(['date_time'], axis=1)
dropCleanScale = StandardScaler().fit_transform(nnData)
dropCleanScale = pd.DataFrame(dropCleanScale)

# Load and standardize the test view data.
# NOTE(review): the test set gets its own fitted scaler; reusing the
# train-set scaler with .transform() would keep features comparable.
nnDataTest = pd.read_excel("nnViewDataTest.xlsx")
nnDataTest = nnDataTest.drop(['date_time'], axis=1)
dropCleanScaleTest = StandardScaler().fit_transform(nnDataTest)
dropCleanScaleTest = pd.DataFrame(dropCleanScaleTest)

# Two autoencoder variants with different bottleneck depths.
clf1 = AutoEncoder(hidden_neurons=[14, 2, 2, 14])
clf1.fit(dropCleanScale)
y_train_scores1 = clf1.decision_scores_

clf2 = AutoEncoder(hidden_neurons=[14, 10, 2, 10, 14])
clf2.fit(dropCleanScale)
y_train_scores2 = clf2.decision_scores_

# Score the test data with both models.
y_test1 = clf1.decision_function(dropCleanScaleTest)
y_test2 = clf2.decision_function(dropCleanScaleTest)
## plotting the Remaining lifetime score

plt.hist(y_test1, bins='auto', color='green')
plt.hist(y_test2, bins='auto', color='blue')

plt.title("Histogram for Model Clf1 Anomaly Scores")
plt.show()
Esempio n. 19
0
def ele_outliers(num):
    """10-fold elephant/mice flow classification using an AutoEncoder
    trained only on mice flows, so elephant flows surface as outliers.

    NOTE(review): ``thres``, ``epochs`` and ``conta`` are not defined in
    this function and are not visible at module level here -- presumably
    globals (``thres`` was read from sys.argv in an earlier version, see
    the commented line); confirm before running.
    """
    # num = 10
    # fileName1 = "/data/sym/one-class-svm/data/mean_of_five/dec-feature/caida-A-50W-5-{}.csv".format(num)
    # fileName2 = "/data/sym/one-class-svm/data/mean_of_five/bin-feature/caida-A-50W-5-{}.csv".format(num)
    # fileName1 = "/data/sym/one-class-svm/data/mean_of_five/dec-feature/univ1-50W-{0}-{1}.csv".format(5, num)
    # fileName2 = "/data/sym/one-class-svm/data/mean_of_five/bin-feature/univ1-50W-{0}-{1}.csv".format(5, num)
    fileName1 = "data/dec-test.csv"
    fileName2 = "data/bin-test.csv"
    df = pd.read_csv(fileName1)
    dfb = pd.read_csv(fileName2)

    # convert to matrix; string '0'/'1' entries are remapped to -1/+1
    X = dfb.values
    X[X == '0'] = -1
    X[X == '1'] = 1
    yr = df['flowSize']

    # thres = int(sys.argv[1])

    # Binarize flow size: 0 = mice (<= thres), 1 = elephant (> thres).
    yc = yr.copy(deep=True)
    yc[yr <= thres] = 0
    yc[yr > thres] = 1
    print("original mice count: ", sum(yc == 0))
    print("original elephant count: ", sum(yc == 1))

    # 10 fold validation
    KF = KFold(n_splits=10, shuffle=True, random_state=10)
    report_list = []
    for train_index, test_index in KF.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = yc[train_index], yc[test_index]

        # split into train and test
        # X_train, X_test, y_train, y_test = train_test_split(X, yc, test_size=0.2, random_state=10)
        # split train to ele and mice
        X_train_ele = X_train[y_train == 1]
        X_train_mice = X_train[y_train == 0]

        # use mice to fit the model mice: 1, ele: -1
        # clf = svm.OneClassSVM(nu=nu, kernel='rbf', gamma='scale')
        # clf = IsolationForest(max_samples=0.2, n_estimators=300, contamination=conta, random_state=rng)
        # clf.fit(X_train_mice)
        clf_name = 'AutoEncoder'
        # Train on mice only: elephants should then be flagged as outliers.
        clf = AutoEncoder(hidden_neurons=[256, 64, 20, 64, 256],
                          epochs=epochs,
                          contamination=conta,
                          random_state=10,
                          verbose=0)
        clf.fit(X_train_mice)

        y_pred_test = clf.predict(X_test)
        # get outlier scores
        y_pred_scores = clf.decision_function(X_test)

        c_matrix = confusion_matrix(y_test, y_pred_test)
        print(c_matrix)
        temp_report = classification_report(y_test,
                                            y_pred_test,
                                            output_dict=True)
        report_list.append(temp_report)
        print(classification_report(y_test, y_pred_test, output_dict=False))
        # evaluate_print(clf_name, y_pred_test, y_pred_scores)
    # Average the per-fold reports into a single summary.
    final_report = get_avg_report(report_list)
    print("final report", final_report)
Esempio n. 20
0
# Translate the first detector's 0/1 predictions into Normal/Abnormal
# (pred_1 and out_1 are defined earlier, outside this fragment).
for i in range(0, len(pred_1)):
  if pred_1[i] == 0:
    out_1.append('Normal')
  else:
    out_1.append('Abnormal')

# Indices of the samples the first detector marked Abnormal.
state_1 = pd.DataFrame(out_1, columns = ['Condition'])
state_1 = state_1.loc[state_1['Condition'] == 'Abnormal'] 
ab_state1 = list(state_1.index.values.tolist())

#Deep Learning model using pyod library
clf_2 = AutoEncoder(hidden_neurons = [15, 64, 32, 64, 15], epochs = 350, 
                    batch_size = 128, preprocessing = False, verbose = 0,
                    random_state = 1234, contamination = 0.1,
                    validation_size = 0.3)
clf_2.fit(X_train)
# Predictions are made on the training data itself.
pred_2 = clf_2.predict(X_train)

#output of the DL model
out_2 = []
for i in range(0, len(pred_2)):
  if pred_2[i] == 0:
    out_2.append('Normal')
  else:
    out_2.append('Abnormal')

# Indices of the samples the autoencoder marked Abnormal.
state_2 = pd.DataFrame(out_2, columns = ['Condition'])
state_2 = state_2.loc[state_2['Condition'] == 'Abnormal'] 
ab_state2 = list(state_2.index.values.tolist())

#plotting results for three ranges of sensor value and two detectors mentioned above
Esempio n. 21
0
class TestAutoEncoder(unittest.TestCase):
    """Older variant of the AutoEncoder test suite using the deprecated
    numpy.testing helpers (assert_true / assert_greater etc.)."""

    def setUp(self):
        # Fit once on synthetic data; every test reuses self.clf.
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8  # minimum acceptable test-set ROC AUC
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)

        self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        # TODO: fix estimator check for AutoEncoder
        # check_estimator(self.clf)
        pass

    def test_parameters(self):
        # All fitted attributes must exist and be non-None after fit().
        assert_true(
            hasattr(self.clf, 'decision_scores_')
            and self.clf.decision_scores_ is not None)
        assert_true(
            hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert_true(
            hasattr(self.clf, 'threshold_')
            and self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert_true(
            hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert_true(
            hasattr(self.clf, 'model_') and self.clf.model_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # Probabilities must stay within [0, 1] for every method variant.
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # Unknown conversion methods must raise ValueError.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Each supported scoring keyword must run; unknown ones must raise.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def tearDown(self):
        pass
Esempio n. 22
0
    # NOTE(review): these first lines are the tail of a loop over
    # ``r in range(num_of_experiments)`` whose header is outside this view.
    IF = IsolationForest(random_state=r)
    IF.fit(X_train)
    sklearn_score_anomalies = IF.decision_function(X_test)
    # Flip sign and shift so higher means more anomalous, matching the
    # original isolation-forest paper's score orientation.
    original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
    aucs_if_ws[r] = evaluate.AUC(original_paper_score, y_test)
auc_if_ws = np.mean(aucs_if_ws)

# --- T2 --- #
y_pred_proba_hot = Hoteliing_SPC_proba(X_train, X_test)
auc_hot_ws = evaluate.AUC(y_pred_proba_hot, y_test)

# --- AutoEncoder --- #
# Average the AUC over several random restarts.
aucs_ae_ws = np.zeros(num_of_experiments)
for r in range(num_of_experiments):
    AE = AutoEncoder(hidden_neurons=[64, 6, 6, 64], random_state=r)
    AE.fit(X_train)
    ae_pred_proba = AE.predict_proba(X_test)[:, 1]
    aucs_ae_ws[r] = evaluate.AUC(ae_pred_proba, y_test)
auc_ae_ws = np.mean(aucs_ae_ws)

# --- one-class-SVM --- #
clf = svm.OneClassSVM(kernel="rbf")
clf.fit(X_train)
sklearn_score_anomalies = clf.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_svm_ws = evaluate.AUC(original_paper_score, y_test)

# --- LOF --- #
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)
sklearn_score_anomalies = lof.decision_function(X_test)
Esempio n. 23
0
from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Pull the sensor table from the database.
table_name = 'grow_data_0a05p06e'
sql = f"""SELECT * FROM {table_name}"""
conn = create_engine('')
df = pd.read_sql(sql, conn, parse_dates=['datetime'])

RANDOM_SEED = 101
X_train, X_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)

# Drop the identifier column; keep timestamps aside for later plotting.
X_train = X_train.drop(['sensor_id'], axis=1)
train_dates = X_train['datetime']
X_train = X_train.drop(['datetime'], axis=1)

X_test = X_test.drop(['sensor_id'], axis=1)
test_dates = X_test['datetime']
X_test = X_test.drop(['datetime'], axis=1)

X_train = X_train.values
X_test = X_test.values

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Scale the test set with the statistics learned on the training set.
# The original called fit_transform here, re-fitting the scaler on the
# test data and making train/test features incomparable (data leakage).
X_test_scaled = scaler.transform(X_test)

# 2-1-2 bottleneck autoencoder; history_ records the training curve.
clf = AutoEncoder(hidden_neurons=[2, 1, 2], verbose=0, contamination=0.05)
clf.fit(X_train_scaled)
df_history = pd.DataFrame(clf.history_)
Esempio n. 24
0
    )

    # Split the preloaded data dict into train/test/labels.
    x_train = data_dict["train"]
    x_test = data_dict["test"]
    x_test_labels = data_dict["test_labels"]

    start = time.time()
    # data preprocessing for MSCRED
    od = AutoEncoder(
        hidden_neurons=hidden_neurons,
        batch_size=batch_size,
        epochs=epochs,
        l2_regularizer=l2_regularizer,
        verbose=1,
    )
    od.fit(x_train)

    # get outlier scores
    anomaly_score = od.decision_function(x_test)

    anomaly_label = x_test_labels

    end = time.time()

    # NOTE(review): this rebinding shadows the ``time`` module -- any
    # later time.time() call in this scope would raise TypeError.
    time = end - start

    evaluate_all(anomaly_score, anomaly_label)
    salience = compute_salience(anomaly_score, anomaly_label)
    print('time')
    print('   ', time)
    print('salience')