Example #1
    def detectWithAutoencoder(self):
        '''
        Apply the Autoencoder Detection Method.
        '''
        # Find Model Hyperparameters
        hpMap = self.config['AnomalyDetector']['AutoencoderHyperparameters']

        # Create and Fit the Autoencoder Model
        AE = AutoEncoder(hidden_neurons=[1 for i in range(hpMap['depth'])])
        AE.fit(self.y.reshape(-1, 1))

        # Get & Plot Anomaly Scores for the Observations
        anomalyScores = AE.decision_scores_
        self.plotAnomalyScores(anomalyScores)

        # Report the Lon/Lat Points Corresponding to the Anomalies
        # in the Order of Decreasing Anomaly Score (i.e., the Most
        # Anomalous Points are Shown First)
        anomalyIdxList = [i for i in range(self.y.shape[0]) \
                          if anomalyScores[i] >= hpMap['anomalyScoreCutoff']]
        anomalyLonLatMap = {anomalyScores[idx]: (self.M['longitude'][idx], self.M['latitude'][idx]) \
                            for idx in anomalyIdxList}
        sortedScores = sorted(anomalyLonLatMap, reverse=True)
        anomaliesLonLatSorted = [anomalyLonLatMap[score] for score in sortedScores]
        return anomaliesLonLatSorted
Example #2
def training(data, img_shape, re_sample_type, text_len, permission_names,
             extract_f):
    # load training data
    print('preparing training data')
    inputs, permissions = prepare_training_data(data, img_shape,
                                                re_sample_type, text_len,
                                                permission_names)

    # get features
    print('generating training features')
    features = extract_f.predict(inputs)

    # train auto encoder model, knn model
    print('training outlier model + knn model')
    detectors = []
    knn_trees = []
    features_in_permissions = []  # features in each permission, indexed [permission_id][feature_id]
    for p in permission_names:
        print('training', p, '...')
        features_current = []
        for i in range(len(permissions)):
            if p in permissions[i]:
                features_current.append(features[i])
        features_in_permissions.append(features_current)

        detector = AutoEncoder(epochs=200, verbose=0)
        detector.fit(features_current)
        detectors.append(detector)

        knn = KNN()
        knn.fit(features_current)
        knn_trees.append(knn)

    return detectors, knn_trees, features_in_permissions
Example #3
def anomaly_detection(data, label):
    X = data[data.select_dtypes('number').columns.tolist()]
    y = data[label]
    y = y.values
    X = X.drop([label], axis=1)

    sc = StandardScaler()
    X = pd.DataFrame(data=sc.fit_transform(X), columns=X.columns)

    ifo = IForest(contamination=0.01,
                  behaviour='new',
                  n_estimators=1000,
                  max_samples=1024,
                  n_jobs=-1,
                  verbose=1)
    ifo.fit(X)
    ifo_pred = ifo.labels_
    print('ROC score for Isolation forest: ', roc_auc_score(y, ifo_pred))
    utilities.plot_outlier_scores(
        y,
        ifo.decision_scores_,
        bw=0.1,
        title='Fraud, Isolation forest. (n_estimators={})'.format(
            ifo.n_estimators))

    ae = AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                     hidden_activation='relu',
                     output_activation='sigmoid',
                     optimizer='adam',
                     epochs=20,
                     batch_size=128,
                     dropout_rate=0.2,
                     l2_regularizer=0.0,
                     validation_size=0.1,
                     preprocessing=False,
                     verbose=1,
                     random_state=1,
                     contamination=0.01)
    ae.fit(X)
    ae_pred = ae.labels_
    print('ROC score for Autoencoder: ', roc_auc_score(y, ae_pred))
    utilities.plot_outlier_scores(
        y,
        ae.decision_scores_,
        bw=0.1,
        title='Fraud, Autoencoder. (epochs={})'.format(ae.epochs))

    # LOF is too slow to train on the full dataset; under-sampling is needed
    lof = LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1)
    lof.fit(X)
    lof_pred = lof.labels_
    print('ROC score for LOF: ', roc_auc_score(y, lof_pred))
    utilities.plot_outlier_scores(
        y,
        lof.decision_scores_,
        bw=0.1,
        title='Fraud, Local outliers factor. (n_neighbors={})'.format(
            lof.n_neighbors))

    return y, ifo_pred, ae_pred, lof_pred
Example #4
def autoencoder_outlier_detection(X_train, X_test, **kwargs):
    detector = AutoEncoder(**kwargs)
    detector.fit(X_train)
    prob = detector.predict_proba(X_test)[:, -1]

    if isinstance(X_test, pd.DataFrame):
        return pd.Series(prob, name='outlier', index=X_test.index)
    return pd.Series(prob, name='outlier')
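
A minimal usage sketch for the wrapper above (illustrative data; assumes only numpy, pandas, and pyod; hidden_neurons is set explicitly so the layer widths fit the 8 synthetic features):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_train = pd.DataFrame(rng.normal(size=(500, 8)))
X_test = pd.DataFrame(rng.normal(size=(100, 8)))

# Extra kwargs are forwarded straight to pyod's AutoEncoder
outlier_prob = autoencoder_outlier_detection(
    X_train, X_test, hidden_neurons=[8, 4, 4, 8], epochs=10, verbose=0)
print(outlier_prob.head())  # outlier probabilities, indexed like X_test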
Example #5
 def aeAD(self, hidden_neurons, epochs):
     # train AutoEncoder detector
     clf_name = 'AutoEncoder'
     clf = AutoEncoder(hidden_neurons=hidden_neurons, epochs=epochs)
     clf.fit(self.X)
     # get the prediction labels and outlier scores of the training data
     y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
     y_scores = clf.decision_scores_  # raw outlier scores
     generateAnomalis(self.data, self.label, y_pred)
Example #6
 def __init__(self,
              hidden_neurons,
              nu,
              epochs,
              batch_size=32,
              output_activation='sigmoid'):
     self.model = AutoEncoder(hidden_neurons=hidden_neurons,
                              contamination=nu,
                              epochs=epochs,
                              batch_size=batch_size,
                              validation_size=0,
                              output_activation=output_activation)
Example #7
def getOutlierAutoEncoder(dataset):
    '''
    @brief Function that executes AutoEncoder algorithm on the dataset and obtains the
    labels of the dataset indicating which instance is an inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels 0 means inlier, 1 means outlier
    '''
    # Initialize the model with three hidden layers of 8, 6 and 8 neurons, without verbose output
    ae = AutoEncoder(hidden_neurons=[8, 6, 8], verbose=0)
    # Fits the data and obtains labels
    ae.fit(dataset)
    # Return labels
    return ae.labels_
Example #8
def detect_outliers(lst):
    clf = AutoEncoder(verbose=1)
    clf.fit(lst)
    
    inliers = []
    for index, data in enumerate(lst):
        y = clf.predict(data.reshape(1, -1))
        if y[0] == 1:  # 1 means outlier
            logger.warning('Found outlier: {0}'.format(index))
        else:
            inliers.append(data)

    logger.info('{:.0%} are outliers'.format(1 - len(inliers) / len(lst)))
    return inliers
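
A minimal invocation sketch (illustrative; assumes a module-level logger is configured, and uses 64 features so pyod's default hidden layer sizes remain valid):

import logging
import numpy as np

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

data = np.random.rand(200, 64)
inliers = detect_outliers(list(data))  # list of per-sample 1-D arrays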
Example #9
    def setUp(self):
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)

        self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #10
def main():

    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
Example #11
def main():
    plt.close('all')
    matplotlib.use('Qt5Agg')  # override PyCharm pro's scientific view

    create_links()

    warnings.showwarning = silence_warnings
    contamination = 0.1  # percentage of outliers
    n_train = 500  # number of training points
    n_test = 500  # number of testing points
    n_features = 25  # Number of features

    X_test, y_test, X_train, y_train = _generate_random_data(
        contamination, n_features, n_test, n_train)
    # X_test, y_test, X_train, y_train = ?

    _plot_using_pca(X_train, y_train)

    hidden_neurons = [25, 2, 2, 25]
    clf1 = AutoEncoder(hidden_neurons=hidden_neurons)
    clf1.fit(X_train)
    y_train_scores = clf1.decision_scores_

    # Predict the anomaly scores
    y_test_scores = clf1.decision_function(X_test)  # outlier scores
    y_test_scores = pd.Series(y_test_scores)

    # Plot anomaly scores
    plt.hist(y_test_scores, bins='auto')
    plt.title("Histogram for Model Clf1 Anomaly Scores")
    plt.show()

    manual_score_thres = 4
    df_test = X_test.copy()
    df_test['score'] = y_test_scores
    # assign cluster=0 to samples with low anomaly score, and cluster=1 to samples with high anomaly score.
    df_test['cluster'] = np.where(df_test['score'] < manual_score_thres, 0, 1)
    print(df_test['cluster'].value_counts())

    print(df_test.groupby('cluster').mean())
    print(df_test)
Example #12
def remove_outlier_faces(image_paths: list, image_size: int = 160) -> list:

    faces = []
    for image_path, bboxes in zip(image_paths, detect_faces(image_paths)):
        im = Image.open(image_path)
        for bbox in bboxes:
            face = Face(idx=image_path, img=im, bbox=bbox)
            faces.append(face)

    clf = AutoEncoder(verbose=1)
    clf.fit([face.embedding for face in faces])

    inliers = []
    for face in faces:
        y = clf.predict(face.embedding.reshape(1, -1))

        if y[0] == 0:  # 0 means inlier
            face.face_img.save(face.idx)
            inliers.append(face.embedding)

    logger.info('{:.0%} are outliers'.format(1 - len(inliers) / len(faces)))
    return inliers
Example #13
class AutoEncoder(Detector):
    def __init__(self, **kwargs):
        super().__init__()
        self._model = AutoEncoderPyod(**kwargs)

    def _fit(self, data):

        self._model.fit(data)

        return self

    def _detect(self, data):
        return self._model.predict(data)

    def validate(self, data):
        if isinstance(data, pd.Series):
            return data.values.reshape(-1, 1)
        else:
            return data

    def __str__(self):
        return f"{self.__class__.__name__}({self._model})"
Example #14
def make_mlo(hub, data, train):
    '''
    Create the Machine Learning Object used for this sequence
    '''
    size = 0
    for chunk in data:
        size = len(chunk)
        break
    for chunk in train:
        size = len(chunk)
        break
    hidden_neurons = [size * 2, size, size, size * 2]
    return AutoEncoder(hidden_neurons=hidden_neurons, contamination=0.001)
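
A hypothetical call for this helper (hub is unused by it; data and train are assumed to be iterables of fixed-width feature rows):

chunks = [[0.1] * 16, [0.2] * 16]
mlo = make_mlo(None, chunks, chunks)
# size = 16, so hidden_neurons == [32, 16, 16, 32]
print(mlo)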
Example #15
    def __init__(self, hidden_neurons=[32],
                 hidden_activation='relu', output_activation='sigmoid',
                 loss=mean_squared_error, optimizer='adam',
                 epochs=30, batch_size=10, dropout_rate=0.1,
                 l2_regularizer=0.2, validation_size=0.1, preprocessing=True,
                 verbose=0, random_state=None, contamination=0.1,

                 BoW=None, featurize_confidence = "none", entity_check=False, 
                 prev_turn_context=0, input_feature_map=None, slice_vec=[], 
                 labels={"none": 0, "error": 1}, 
                 one_hot={"none": [0], "error": [1]}):
        
        AutoEncoder.__init__(self, hidden_neurons=hidden_neurons, 
                 hidden_activation=hidden_activation, output_activation=output_activation,
                 loss=loss, optimizer=optimizer,
                 epochs=epochs, batch_size=batch_size, dropout_rate=dropout_rate,
                 l2_regularizer=l2_regularizer, validation_size=validation_size, preprocessing=preprocessing,
                 verbose=verbose, random_state=random_state, contamination=contamination)

        ErrorClassifier.__init__(self, BoW=BoW, featurize_confidence=featurize_confidence, entity_check=entity_check, 
                                 prev_turn_context=prev_turn_context, input_feature_map=input_feature_map, 
                                 slice_vec=slice_vec,labels=labels, one_hot=one_hot)
Example #16
def use_model(model, df_list, x_columns, params):
    predicted = []

    if model == 'knn':
        neigh = NearestNeighbors(n_neighbors=params['n'], p=params['p'])
        neigh.fit(df_list[0][x_columns])

        for i in range(len(df_list)):
            pred = neigh.kneighbors(df_list[i][x_columns])
            pred = [np.mean(i) for i in pred[0]]
            predicted.append(pred)

    elif model == 'svm':
        svm = OneClassSVM(kernel=params['kernel'])
        svm.fit(df_list[0][x_columns])

        for i in range(len(df_list)):
            pred = svm.score_samples(df_list[i][x_columns])
            maximum = max(pred)
            pred = [(x * -1) + maximum for x in pred]
            predicted.append(pred)

    elif model == 'isolationForest':
        clf = IsolationForest(n_estimators=params['n_estimators'],
                              random_state=0)
        clf.fit(df_list[0][x_columns])

        for i in range(len(df_list)):
            pred = clf.score_samples(df_list[i][x_columns])
            pred = list(map(abs, pred))
            predicted.append(pred)

    elif model == 'autoencoder':
        clf = AutoEncoder(hidden_neurons=params['hidden_neurons'],
                          verbose=0,
                          random_state=0)
        clf.fit(df_list[0][x_columns])
        for i in range(len(df_list)):
            pred = clf.decision_function(df_list[i][x_columns])
            predicted.append(pred)

    elif model == 'lsanomaly':
        anomalymodel = lsanomaly.LSAnomaly(sigma=params['sigma'],
                                           rho=params['rho'])
        anomalymodel.fit(df_list[0][x_columns].to_numpy())
        for i in range(len(df_list)):
            pred = anomalymodel.predict_proba(df_list[i][x_columns].to_numpy())
            pred = [a[1] for a in pred]
            predicted.append(pred)

    return predicted
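
A sketch of the autoencoder branch above (hypothetical frames and parameters; the hidden layer widths are chosen to fit the 10 synthetic columns):

import numpy as np
import pandas as pd

dfs = [pd.DataFrame(np.random.rand(300, 10)) for _ in range(2)]
x_columns = list(dfs[0].columns)
params = {'hidden_neurons': [10, 4, 4, 10]}
scores_per_df = use_model('autoencoder', dfs, x_columns, params)  # one score list per frame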
Example #17
class AutoEncoderODD(abstract_occ_model):
    def __init__(self,
                 hidden_neurons,
                 nu,
                 epochs,
                 batch_size=32,
                 output_activation='sigmoid'):
        self.model = AutoEncoder(hidden_neurons=hidden_neurons,
                                 contamination=nu,
                                 epochs=epochs,
                                 batch_size=batch_size,
                                 validation_size=0,
                                 output_activation=output_activation)

    def fit(self, X):
        self.model.fit(X)

    def predict(self, X):
        prediction = self.model.predict(X)
        return np.where(prediction == 0.0, 1,
                        np.where(prediction == 1.0, -1, prediction))

    def score_samples(self, X):
        return -self.model.decision_function(X)
Example #18
    def fit(self, X, contamination=0.01):
        """
        Fit detector

        Args:
            X: pd.DataFrame
        """
        self.detectors = {
            "auto_encoder":
            AutoEncoder(
                epochs=256,
                validation_size=0,
                preprocessing=False,
                verbose=0,
                contamination=contamination,
            ),
        }
        # print("train_data.shape:", X.shape)
        # Preprocess the data
        # Standardize
        X_train_norm, self.data_norm_scalar = standardizer(X, keep_scalar=True)
        # Min-max normalize
        X_train_unif, self.data_unif_scalar = minmaxizer(X_train_norm,
                                                         keep_scalar=True)

        train_scores = np.zeros([X.shape[0], len(self.detectors)])
        thresholds = np.zeros([1, len(self.detectors)])
        # Train the detectors
        for i, clf_name in enumerate(self.detectors):
            clf = self.detectors[clf_name]
            clf.fit(X_train_unif)
            train_scores[:, i] = clf.decision_scores_
            thresholds[:, i] = clf.threshold_
        # Normalized training-set anomaly scores and thresholds
        train_scores_norm, self.score_scalar = standardizer(train_scores,
                                                            keep_scalar=True)
        thresholds_norm = self.score_scalar.transform(thresholds)

        self.decision_scores = pd.DataFrame(average(train_scores_norm),
                                            index=X.index)
        self.decision_scores.columns = ["score"]
        self.threshold = average(thresholds_norm)[0]
        self.label = self.get_label(self.decision_scores)
Example #19
def choose_model(model, nnet):
    """ among implemented in PyOD """
    clfs = {
        'AE': AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE': VAE(encoder_neurons=nnet[:5],
                   decoder_neurons=nnet[4:],
                   contamination=0.1,
                   epochs=13),
        'ABOD': ABOD(),
        'FeatureBagging': FeatureBagging(),
        'HBOS': HBOS(),
        'IForest': IForest(),
        'KNN': KNN(),
        'LOF': LOF(),
        'OCSVM': OCSVM(),
        'PCA': PCA(),
        'SOS': SOS(),
        'COF': COF(),
        'CBLOF': CBLOF(),
        'SOD': SOD(),
        'LOCI': LOCI(),
        'MCD': MCD()
    }
    return clfs[model]
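
A small usage sketch (the layer layout is illustrative; nnet must be symmetric for 'AE' and long enough that nnet[:5] and nnet[4:] form sensible VAE encoder/decoder lists). Note that the dictionary builds every detector eagerly on each call; wrapping the values in factory functions would avoid constructing the unused models:

import numpy as np

nnet = [64, 32, 16, 8, 16, 32, 64]
clf = choose_model('AE', nnet)
clf.fit(np.random.rand(300, 64))
print(clf.labels_[:10])  # 0 = inlier, 1 = outlier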
Example #20
def _create_model(hidden_neurons=None,
                  hidden_activation='relu',
                  output_activation='sigmoid',
                  optimizer='adam',
                  epochs=100,
                  batch_size=32,
                  dropout_rate=0.2,
                  l2_regularizer=0.1,
                  validation_size=0.1,
                  preprocessing=True,
                  verbose=2,
                  random_state=42,
                  contamination=0.1,
                  loss='binary_crossentropy'):
    """(Internal helper) Created an Autoencoder instance"""
    print(
        "Creating an Autoencoder with\nOutput Activation: {}\nLoss: {}\nOptimizer: {}"
        .format(output_activation, loss, optimizer))

    autoenc = AutoEncoder(hidden_neurons=hidden_neurons,
                          hidden_activation=hidden_activation,
                          output_activation=output_activation,
                          loss=loss,
                          optimizer=optimizer,
                          epochs=int(epochs),
                          batch_size=int(batch_size),
                          dropout_rate=dropout_rate,
                          l2_regularizer=l2_regularizer,
                          validation_size=validation_size,
                          preprocessing=preprocessing,
                          verbose=verbose,
                          random_state=random_state,
                          contamination=contamination)

    print('Created Model: {}'.format(autoenc))

    return autoenc
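
A hypothetical call (pyod's AutoEncoder falls back to its own default architecture when hidden_neurons is None, so an explicit layout is passed here):

import numpy as np

model = _create_model(hidden_neurons=[16, 8, 8, 16], epochs=20, verbose=0)
model.fit(np.random.rand(400, 20))  # illustrative 20-feature data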
Example #21
def ele_outliers(num):
    dataSetType = ALL_DATA_TYPE[0]
    trainType = ALL_TRAIN_TYPE[1]
    
    X, yc = load_data(dataSetType, trainType, num)

    # 10 fold validation
    KF = KFold(n_splits=10, shuffle=True, random_state=10)
    report_list = []
    for train_index, test_index in KF.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = yc[train_index], yc[test_index]

        # split into train and test
        # X_train, X_test, y_train, y_test = train_test_split(X, yc, test_size=0.2, random_state=10)
        # split train to ele and mice
        X_train_ele = X_train[y_train == 1]
        X_train_mice = X_train[y_train == 0]

        # use mice to fit the model mice: 1, ele: -1
        # clf = svm.OneClassSVM(nu=nu, kernel='rbf', gamma='scale')
        # clf = IsolationForest(max_samples=0.2, n_estimators=300, contamination=conta, random_state=rng)
        # clf.fit(X_train_mice)
        clf_name = 'AutoEncoder'
        clf = AutoEncoder(hidden_neurons=[256, 64, 20, 64, 256], epochs=epochs, contamination=conta, random_state=10, verbose=0)
        clf.fit(X_train_mice)

        y_pred_test = clf.predict(X_test)
        # get outlier scores
        y_pred_scores = clf.decision_function(X_test)

        c_matrix = confusion_matrix(y_test, y_pred_test)
        print(c_matrix)
        temp_report = classification_report(y_test, y_pred_test, output_dict=True)
        report_list.append(temp_report)
        print(classification_report(y_test, y_pred_test, output_dict=False))
        # evaluate_print(clf_name, y_pred_test, y_pred_scores)
    final_report = get_avg_report(report_list)
    print("final report", final_report)
Example #22
def pyod_anomaly_detection(type, contamination):
    X_train, y_train, X_test, y_test = data(type=type,
                                            contamination=contamination)
    if type == 'MAD':
        # train MAD detector
        clf_name = 'MAD'
        clf = MAD(threshold=3.5)
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        # visualize the results
        # making dimensions = 2 for visualising purpose only. By repeating same data each dimension.
        visualize(clf_name,
                  np.hstack((X_train, X_train)),
                  y_train,
                  np.hstack((X_test, X_test)),
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'ABOD':
        # train ABOD detector
        clf_name = 'ABOD'
        clf = ABOD()
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        # visualize the results
        visualize(clf_name,
                  X_train,
                  y_train,
                  X_test,
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'AutoEncoder':
        # train AutoEncoder detector
        clf_name = 'AutoEncoder'
        clf = AutoEncoder(epochs=30, contamination=contamination)
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
Example #23
        # 'K Nearest Neighbors (KNN)': KNN(
        #     contamination=outliers_fraction),
        # 'Average KNN': KNN(method='mean',
        #                     contamination=outliers_fraction),
        # 'Median KNN': KNN(method='median',
        #                     contamination=outliers_fraction),
        # 'Local Outlier Factor (LOF)':
        #     LOF(n_neighbors=35, contamination=outliers_fraction),
        # 'Minimum Covariance Determinant (MCD)': MCD(
        #     contamination=outliers_fraction, random_state=random_state),
        # 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction, #NOTE: slow, never try again
        #                                 random_state=random_state),
        # 'Principal Component Analysis (PCA)': PCA(
        #     contamination=outliers_fraction, random_state=random_state),
        'AutoEncoder':
        AutoEncoder(epochs=2,
                    hidden_neurons=[4, 2, 4],
                    contamination=outliers_fraction),
        # 'Feature Bagging':
        #     FeatureBagging(LOF(n_neighbors=35),
        #                     contamination=outliers_fraction,
        #                     check_estimator=False,
        #                     random_state=random_state),
        # 'Angle-based Outlier Detector (ABOD)':
        #     ABOD(n_neighbors=10,
        #             contamination=outliers_fraction),
    }

    RunPyodOutlier(classifiers, outlier_save_path, isExtract=True)
    # RunPyodOutlier(classifiers,outlier_save_path,isExtract=False)
    contamination = 0.1  # percentage of outliers
    n_train = 20000  # number of training points
    n_test = 2000  # number of testing points
    n_features = 300  # number of features

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=n_features,
                      contamination=contamination,
                      random_state=42)

    # train AutoEncoder detector
    clf_name = 'AutoEncoder'
    clf = AutoEncoder(epochs=30, contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Example #25
    scalar.fit(scada_data)
    scada_data = scalar.transform(scada_data)

    contamination_fracs = [0.5, 0.4, 0.3, 0.2, 0.1, 0.08, 0.05, 0.04, 0.03, 0.02, 0.01,
                           0.008, 0.005, 0.004, 0.003, 0.002, 0.001]
    all_report_dfs = []
    for anomaly_frac in contamination_fracs:
        print("Running models with {} contamination rate".format(anomaly_frac))
        # Instantiate models
        model_names = ['knn', 'pca', 'cblof', 'iforest', 'autoencoder']
        models = [
            KNN(contamination=anomaly_frac, algorithm='kd_tree', n_neighbors=13, n_jobs=8),
            PCA(contamination=anomaly_frac, svd_solver='auto', standardization=False),
            CBLOF(contamination=anomaly_frac, n_clusters=16, n_jobs=8),
            IForest(contamination=anomaly_frac, n_estimators=100, n_jobs=8, behaviour='new'),
            AutoEncoder(contamination=anomaly_frac, hidden_neurons=[4, 2, 2, 4], hidden_activation='tanh',
                        batch_size=5000, epochs=200, preprocessing=False, verbose=0)
        ]
        X_train, X_test, y_train, y_test = generate_train_test(scada_data, contamination=anomaly_frac,
                                                               sensor_failure=fail, offset_pct=offset)
        reports = [train_and_evaluate(model, name, X_train, X_test, y_test, anomaly_frac)
                   for model, name in zip(models, model_names)]
        reports_df = pd.DataFrame(reports)
        all_report_dfs.append(reports_df)

    full_report_df = pd.concat(all_report_dfs)
    print(full_report_df)
    full_report_df.to_csv(log_dir + "full_report.csv")



Example #26
def main():

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with standard scaling'
        ])
Example #27
pyod_umap_big_dim = EvalRun("pyod_umap_big_dim", [doc2vecwikiall],
                            [imdb_20news_3splits], [], [
                                PyodDetector(HBOS, "HBOS"),
                                PyodDetector(IForest, "iForest"),
                                PyodDetector(LOF, "LOF"),
                                PyodDetector(OCSVM, "OCSVM"),
                                PyodDetector(PCA, "PCA")
                            ])

pyod_autoencoder_test = EvalRun(
    "pyod_autoencoder_test", [doc2vecwikiall, longformer_large],
    [imdb_20news_3splits], [NoReduction()], [
        PyodDetector(VAE(epochs=30, verbose=1), "VAE_30"),
        PyodDetector(VAE(epochs=100, verbose=1), "VAE_100"),
        PyodDetector(AutoEncoder(epochs=30, verbose=1), "AE_30"),
        PyodDetector(AutoEncoder(epochs=100, verbose=2), "AE_100")
    ])

pyod_autoencer_refined = EvalRun(
    "pyod_autoencer_refined", [doc2vecwikiall, doc2vecapnews],
    [imdb_20news_3split_fracs], [], [
        PyodDetector(
            AutoEncoder(hidden_neurons=[32, 16, 16, 32], epochs=30, verbose=1),
            "AE_30_small"),
        PyodDetector(AutoEncoder(epochs=10, verbose=1), "AE_10"),
        PyodDetector(AutoEncoder(epochs=30, verbose=1), "AE_30"),
        PyodDetector(AutoEncoder(epochs=100, verbose=2), "AE_100")
    ])

pyod_autoencer_refined_small = EvalRun(
Example #28
]
test_index = [item for item in list(data.index) if item not in train_index]

train = data.loc[train_index, df.columns].reset_index(drop=False)
test = data.loc[test_index, df.columns].reset_index(drop=False)
train = train.apply(pd.to_numeric)
test = test.apply(pd.to_numeric)
train_x = train.drop(columns=['user_id', 'index'])
test_x = test.drop(columns=['user_id', 'index'])

assert not np.any(np.isnan(train_x))
assert np.all(np.isfinite(train_x))
# Fit the scaler on the training data only, then reuse it for the test split
scaler = StandardScaler().fit(train_x.dropna())
train_norm = scaler.transform(train_x.dropna())
test_norm = scaler.transform(test_x.dropna())

clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(train_norm)

y_train_scores = clf1.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf1.predict(test_norm)  # outlier labels (0 or 1)

y_test_scores = clf1.decision_function(test_norm)  # outlier scores

y_test_pred = pd.Series(y_test_pred)
y_test_scores = pd.Series(y_test_scores)

print(y_test_pred.value_counts())

print(y_test_scores.describe())
Example #29
x_pca = pca.fit_transform(X_train)
x_pca = pd.DataFrame(x_pca)
x_pca.columns=['PC1','PC2']

# Plot
import matplotlib.pyplot as plt
plt.scatter(x_pca['PC1'], x_pca['PC2'], c=y_train, alpha=0.8)
plt.title('Scatter plot of the first two principal components')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()



# Step 1: Build the model
clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(X_train)

clf2 = AutoEncoder(hidden_neurons=[25, 10, 2, 10, 25])
clf2.fit(X_train)

clf3 = AutoEncoder(hidden_neurons=[25, 15, 10, 2, 10, 15, 25])
clf3.fit(X_train)

# Predict the anomaly scores
y_test_scores = clf1.decision_function(X_test)  
y_test_scores = pd.Series(y_test_scores)

# Step 2: Determine the cut point
plt.hist(y_test_scores, bins='auto')
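
One way to turn the histogram into labels (a sketch; the 4.0 cutoff is illustrative and would normally be read off the histogram above):

threshold = 4.0
y_test_pred = (y_test_scores > threshold).astype(int)  # 1 = outlier
print(y_test_pred.value_counts())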
Example #30
class TestAutoEncoder(unittest.TestCase):
    def setUp(self):
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)

        self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'model_') and self.clf.model_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        pred_proba, confidence = self.clf.predict_proba(self.X_test,
                                                        method='linear',
                                                        return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_model_clone(self):
        # for deep models this may not apply
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass