import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import LocalOutlierFactor


class MixtureLocalizationOutliers(object):
    def __init__(self, n_components=2):
        self.GMM = GaussianMixture(n_components=n_components)
        self.LOF = LocalOutlierFactor(n_neighbors=2, novelty=True, contamination=1e-4)

        self.decisions = None

    def fit(self, X, y=None):
        self.GMM.fit(X)
        pdfs = self.GMM.score_samples(X)
        self.LOF.fit(pdfs.reshape(-1,1))
        lofs = self.LOF.decision_function(pdfs.reshape(-1,1))
        # two-sided cut-offs on the LOF decision scores; note that
        # np.percentile expects percentages in [0, 100], so the extreme
        # tails are selected here
        self.lower_lof, self.upper_lof = np.percentile(lofs, [0.25, 99.75])

    def predict(self, X):
        pdfs = self.GMM.score_samples(X)
        lofs = self.LOF.decision_function(pdfs.reshape(-1,1))
        preds = []
        for pdf, lof in zip(pdfs, lofs):
            # flag a sample when its LOF decision score falls outside the
            # band learned during fit (low scores are the anomalous side)
            if lof <= self.lower_lof or lof >= self.upper_lof:
                preds.append(-1)
            else:
                preds.append(1)
        self.decisions = lofs
        return preds

    def decision_function(self, X):
        # recompute for the given X so stale scores from an earlier call
        # are never returned
        self.predict(X)
        return self.decisions
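A minimal usage sketch for the class above on assumed synthetic data (two Gaussian blobs plus a few uniform points; purely illustrative):

rng = np.random.RandomState(0)
X_train = np.vstack([rng.randn(200, 2), rng.randn(200, 2) + 5.0])
X_test = np.vstack([rng.randn(10, 2), rng.uniform(-10, 10, size=(3, 2))])

mlo = MixtureLocalizationOutliers(n_components=2)
mlo.fit(X_train)
print(mlo.predict(X_test))  # 1 = inlier, -1 = flagged as outlier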
Example #2
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler


class LOFNovelty:
    def __init__(self):
        self.clf = LocalOutlierFactor(novelty=True, contamination=0.1)
        self.scaler = StandardScaler()

    def train(self, train):
        #train = self.scaler.fit_transform(train)
        self.clf.fit(train)

    def predict(self, valid, anomaly):
        #valid = self.scaler.transform(valid)      # transform with the scaler fitted in train()
        #anomaly = self.scaler.transform(anomaly)

        y_pred_valid = self.clf.predict(valid)
        y_pred_outliers = self.clf.predict(anomaly)
        score_valid = self.clf.decision_function(valid)
        score_anomaly = self.clf.decision_function(anomaly)

        print("LOF Novelty result")
        print(confusion_matrix([1] * len(y_pred_valid), y_pred_valid).ravel())
        print(
            confusion_matrix([-1] * len(y_pred_outliers),
                             y_pred_outliers).ravel())
        print(" Validation data:",
              list(y_pred_valid).count(1) / y_pred_valid.shape[0])
        #print("Score", score_valid.mean(), score_valid.std())
        print(" Outlier data:",
              list(y_pred_outliers).count(-1) / y_pred_outliers.shape[0])
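A minimal usage sketch for LOFNovelty on assumed synthetic data (Gaussian inliers, uniform anomalies; purely illustrative):

import numpy as np

rng = np.random.RandomState(42)
detector = LOFNovelty()
detector.train(rng.randn(300, 2))                     # inlier training set
detector.predict(rng.randn(50, 2),                    # held-out inliers
                 rng.uniform(-6, 6, size=(50, 2)))    # injected anomalies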
    def perform_outlier_detection(self, X):
        # LOF on all features
        clf = LocalOutlierFactor(n_neighbors=20)
        clf.fit(X)
        # _decision_function is a private API of older scikit-learn releases;
        # recent versions expose the training scores as negative_outlier_factor_
        lof_scores = clf._decision_function(X)

        # Isolation forest on all features
        clf = IsolationForest()
        clf.fit(X)
        forest_scores = clf.decision_function(X)
        '''
        clf = DBOD()
        clf.fit(X)
        distance_scores = clf.decision_function_distance(X)

        #abod_scores = ABOD(X, self.seed_user)
        abod_scores = clf.decision_function_angle(X)

        scores = self.combine([lof_scores, forest_scores, distance_scores, abod_scores])
        '''
        # scores = forest_scores
        scores = self.combine([lof_scores, forest_scores])
        '''
        with open('clique_expansion/' + self.seed_user + '_unnormalized_scores.csv', 'w') as f:
            for score in scores:
                f.write(str(score) + '\n')
                '''
        new_scores = scores[self.len_priors:]
        user_scores = sorted(zip(self.current_level_users, new_scores), key=lambda x: x[1], reverse=True)
        threshold = np.percentile(new_scores, 8)
        outliers = [u[0] for u in user_scores if u[1] <= threshold]
        return outliers
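The combine helper is not shown in these snippets; a plausible stand-in, assuming it averages min-max-normalized score arrays (hypothetical, not the author's implementation):

import numpy as np

def combine(score_lists):
    # hypothetical: min-max scale each detector's scores to [0, 1] so that
    # no single detector dominates, then average them element-wise
    normalized = []
    for s in score_lists:
        s = np.asarray(s, dtype=float)
        span = s.max() - s.min()
        normalized.append((s - s.min()) / span if span > 0 else np.zeros_like(s))
    return np.mean(normalized, axis=0)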
Example #4
    def test_local_outlier_factor_cdist_p3(self):
        lof = LocalOutlierFactor(n_neighbors=2, novelty=True, p=3)
        data = np.array([[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                        dtype=np.float32)
        model = lof.fit(data)
        model_onnx = to_onnx(model,
                             data,
                             target_opset=TARGET_OPSET,
                             options={'optim': 'cdist'})
        self.assertIn('CDist', str(model_onnx))

        data = data.copy()
        data[:, 0] += 0.1

        try:
            sess = InferenceSession(model_onnx.SerializeToString())
        except InvalidGraph as e:
            if "Unrecognized attribute: p for operator CDist" in str(e):
                return
            raise e

        names = [o.name for o in sess.get_outputs()]
        self.assertEqual(names, ['label', 'scores'])
        got = sess.run(None, {'X': data})
        self.assertEqual(len(got), 2)
        expected_label = lof.predict(data)
        expected_decif = lof.decision_function(data)
        assert_almost_equal(expected_label, got[0].ravel())
        assert_almost_equal(expected_decif, got[1].ravel())
Example #5
    def test_local_outlier_factor_metric_cdist(self):
        for metric in ['euclidean', 'sqeuclidean']:
            with self.subTest(metric=metric):
                lof = LocalOutlierFactor(n_neighbors=2,
                                         novelty=True,
                                         metric=metric)
                data = np.array(
                    [[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                    dtype=np.float32)
                model = lof.fit(data)
                model_onnx = to_onnx(model,
                                     data,
                                     target_opset=TARGET_OPSET,
                                     options={'optim': 'cdist'})

                data = data.copy()
                data[:, 0] += 0.1

                sess = InferenceSession(model_onnx.SerializeToString())
                names = [o.name for o in sess.get_outputs()]
                self.assertEqual(names, ['label', 'scores'])
                got = sess.run(None, {'X': data})
                self.assertEqual(len(got), 2)
                expected_label = lof.predict(data)
                expected_decif = lof.decision_function(data)
                assert_almost_equal(expected_label, got[0].ravel())
                assert_almost_equal(expected_decif, got[1].ravel(), decimal=4)
Example #6
class LOFNoveltyFilter (StaticFilter, _InputsStatBasedInitializable):

  def __init__(self, name = 'LOF-based novelty', sample_size = 3000, metric = 'cosine', lof_kwds = {}, **kwds):
    assert (isinstance (sample_size, int) and 1 <= sample_size)
    self.name = name
    self.sample_size = sample_size
    self.lof_threshold = 0.0
    self.lof = LocalOutlierFactor (**lof_kwds, metric = metric, novelty = True)
    super().__init__(**kwds)

  def inputs_stat_initialize (self,
                              train_data: raw_datat = None,
                              test_data: raw_datat = None):
    sample_size = min (self.sample_size, train_data.data.shape[0])
    np1 ('Initializing LOF-based novelty estimator with {} training samples... '
         .format (sample_size))
    # TODO: random sampling (& shuffle)?.
    self.lof.fit (train_data.data[:sample_size])
    c1 ('done')
    p1 ('{} offset is {}'.format (self.name, self.lof.offset_))

  def close_enough (self, i: Input):
    lof = self.lof.decision_function (i.reshape (1, -1))
    # p1 ('{}: {}'.format (self.name, lof))
    return lof > self.lof_threshold
import os
import sys

from sklearn.neighbors import LocalOutlierFactor


class LocalOutlierFactor_Classifier:
  """docstring for LocalOutlierFactor_Classifier"""
  def __init__(self, save_path):

    # default save path
    self.save_path = os.path.join(save_path,'LocalOutlierFactor')
    if not os.path.exists(self.save_path):
      os.makedirs(self.save_path)
    self.n_neighbors=40
    # expected proportion of outliers in the data set; used when fitting to
    # define the threshold on the decision function
    self.contamination = 0.1

    # novelty=True is required so predict()/decision_function() work on new data
    self.classifier = LocalOutlierFactor(n_neighbors=self.n_neighbors,contamination=self.contamination,novelty=True)

 
  def fit_model(self, train_data_matrix, test_data_matrix, test_true_label):
    """Train the model"""
    self.classifier.fit(train_data_matrix)
    y_pred_label = self.classifier.predict(test_data_matrix)
    n_errors_test = (y_pred_label!=test_true_label).sum()
    accuracy, classification_report, confusion_matrix = sklearn_evaluation(test_true_label, y_pred_label)
    print('Accuracy: {} \nClassification Report:\n{}\n'.format(accuracy, classification_report))
    sys.stdout.flush()

  def test_model(self, test_data, test_label):
    """Test the model
       such as test_label = [1,1,-1,....]
    """

    scores_pred = self.classifier.decision_function(test_data)
    y_pred_test = self.classifier.predict(test_data)

    n_errors = (y_pred_test != test_label).sum()
    return n_errors
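A hedged usage sketch for the classifier above on assumed synthetic data; the inner estimator is fitted directly so the sketch does not depend on the external sklearn_evaluation helper (1 marks normal points, -1 outliers):

import numpy as np

rng = np.random.RandomState(0)
train = rng.randn(500, 8)
test = np.vstack([rng.randn(40, 8), rng.uniform(-8, 8, size=(10, 8))])
labels = np.array([1] * 40 + [-1] * 10)

lof_clf = LocalOutlierFactor_Classifier(save_path='/tmp')
lof_clf.classifier.fit(train)
print(lof_clf.test_model(test, labels))  # number of mislabelled test points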
Example #8
    def perform_outlier_detection(self, X, len_priors):
        # LOF on all features
        clf = LocalOutlierFactor(n_neighbors=20)
        clf.fit(X)
        # NOTE: check_is_fitted on these attributes and the _decision_function
        # call below rely on private scikit-learn internals from older releases
        check_is_fitted(clf, ["threshold_", "negative_outlier_factor_", "n_neighbors_", "_distances_fit_X_"])
        if X is not None:
            X = check_array(X, accept_sparse='csr')
            y_pred = clf._decision_function(X)
        else:
            y_pred = clf.negative_outlier_factor_
        #lof_scores = y_pred[len_priors:]
        #lof_scores = zip(self.current_level_users, y_pred_new)
        lof_scores = y_pred

        # Isolation forest on all features
        clf = IsolationForest()
        clf.fit(X)
        y_pred = clf.decision_function(X)
        #forest_scores = y_pred[len_priors:]
        #forest_scores = zip(self.current_level_users, y_pred_new)
        forest_scores = y_pred

        scores = self.combine(lof_scores, forest_scores)
        new_scores = scores[len_priors:]
        user_scores = sorted(zip(self.current_level_users, new_scores), key=lambda x: x[1], reverse=True)
        threshold = np.percentile(new_scores, 95)
        outliers = [u[0] for u in user_scores if u[1] >= threshold]
        return outliers
def computeLocalOutlierFactor(dyResult):

    nDarrayMeanVar, min_mean, max_mean, min_var, max_var = \
        ut_data.numpyMeanVariance(dyResult["window"])

    xx, yy = np.meshgrid(np.linspace(min_mean - 100, max_mean + 100, 500),
                         np.linspace(min_var - 1000, max_var + 1000, 500))

    clf = LocalOutlierFactor(n_neighbors=15, novelty=True, contamination=0.1)
    clf.fit(nDarrayMeanVar)

    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.title("Novelty Detection with LOF")
    plt.contourf(xx,
                 yy,
                 Z,
                 levels=np.linspace(Z.min(), 0, 7),
                 cmap=plt.cm.PuBu)
    a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred")
    plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred")

    s = 40
    b1 = plt.scatter(nDarrayMeanVar[:, 0],
                     nDarrayMeanVar[:, 1],
                     c="white",
                     s=s,
                     edgecolors="k")
    # b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k")
    # c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k")
    plt.axis("tight")
    plt.xlim((min_mean, max_mean))
    plt.ylim((min_var, max_var))
    plt.legend(
        [a.collections[0], b1],
        [
            "learned frontier", "training observations"
            # ,
            # "new regular observations",
            # "new abnormal observations",
        ],
        loc="upper left",
        prop=matplotlib.font_manager.FontProperties(size=11),
    )
    # plt.xlabel(
    #     "errors novel regular: %d/40 ; errors novel abnormal: %d/40"
    #     % (n_error_test, n_error_outliers)
    # )
    plt.show()
Example #10
    def test_local_outlier_factor_double(self):
        lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
        data = np.array([[-1.1, -1.2], [0.3, 0.2], [0.5, 0.4], [100., 99.]],
                        dtype=np.float64)
        model = lof.fit(data)
        model_onnx = to_onnx(model, data, target_opset=TARGET_OPSET)

        sess = InferenceSession(model_onnx.SerializeToString())
        names = [o.name for o in sess.get_outputs()]
        self.assertEqual(names, ['label', 'scores'])
        got = sess.run(None, {'X': data})
        self.assertEqual(len(got), 2)
        expected_label = lof.predict(data)
        expected_decif = lof.decision_function(data)
        assert_almost_equal(expected_label, got[0].ravel())
        assert_almost_equal(expected_decif, got[1].ravel())
Example #11
def density_contour(
    ax,
    data,
    x,
    y,
    groupby=None,
    c="lightgray",
    single_contour_pad=1,
    linewidth=1,
    palette=None,
):
    _data = data.copy()

    if groupby is not None:
        if isinstance(groupby, str):
            _data["groupby"] = data[groupby]
        else:
            _data["groupby"] = groupby
    else:
        _data["groupby"] = "one group"

    _contour_kws = dict(
        linewidths=linewidth, levels=(-single_contour_pad,), linestyles="dashed"
    )
    _lof_kws = dict(n_neighbors=25, novelty=True, contamination="auto")

    xmin, ymin = _data[[x, y]].min()
    xmax, ymax = _data[[x, y]].max()
    xmin, xmax = zoom_min_max(xmin, xmax, 1.2)
    ymin, ymax = zoom_min_max(ymin, ymax, 1.2)

    for group, sub_data in _data[[x, y, "groupby"]].groupby("groupby"):
        xx, yy = np.meshgrid(np.linspace(xmin, xmax, 500), np.linspace(ymin, ymax, 500))
        clf = LocalOutlierFactor(**_lof_kws)
        clf.fit(sub_data.iloc[:, :2].values)
        z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        z = z.reshape(xx.shape)
        if palette is None:
            _color = c
        else:
            _color = palette[group] if group in palette else c

        # plot contour line(s)
        ax.contour(xx, yy, z, colors=_color, **_contour_kws)
    return
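zoom_min_max is not defined in this snippet; a plausible stand-in that widens the (min, max) range about its midpoint by the given factor (a hypothetical helper, not the author's):

def zoom_min_max(vmin, vmax, factor):
    # widen the interval symmetrically about its center so contour grids
    # extend slightly beyond the data range
    center = (vmin + vmax) / 2
    half_span = (vmax - vmin) / 2 * factor
    return center - half_span, center + half_span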
Example #12
    def test_local_outlier_factor_rnd(self):
        lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
        rs = np.random.RandomState(0)
        data = rs.randn(100, 4).astype(np.float32)
        data[-1, 2:] = 99.
        data[-2, :2] = -99.
        model = lof.fit(data)
        model_onnx = to_onnx(model, data, target_opset=TARGET_OPSET)

        sess = InferenceSession(model_onnx.SerializeToString())
        names = [o.name for o in sess.get_outputs()]
        self.assertEqual(names, ['label', 'scores'])
        got = sess.run(None, {'X': data})
        self.assertEqual(len(got), 2)
        expected_label = lof.predict(data)
        expected_decif = lof.decision_function(data)
        assert_almost_equal(expected_label, got[0].ravel())
        assert_almost_equal(expected_decif, got[1].ravel(), decimal=5)
    def compute_values(df, classes, **kwargs):

        if "alfa" in kwargs:
            alfa = float(kwargs["alfa"])
            del kwargs["alfa"]
        else:
            alfa = 0.75

        if "beta" in kwargs:
            beta = float(kwargs["beta"])
            del kwargs["beta"]
        else:
            beta = 0.25

        _, cls_num = np.unique(classes, return_inverse=True)
        clss = cls_num.astype(int)

        cls_indices = {}
        noncls_indices = {}
        for cls in np.unique(clss):
            cls_indices[cls] = [i for i in range(len(df)) if clss[i] == cls]
            noncls_indices[cls] = [i for i in range(len(df)) if clss[i] != cls]

        lof = LocalOutlierFactor(**kwargs)
        lof.fit(df.values)
        lofn = LocalOutlierFactor(**kwargs, novelty=True)

        same_lof = np.empty(len(df))
        other_lof = np.empty(len(df))
        all_lof = lof.negative_outlier_factor_
        for cls in np.unique(clss):
            ind = cls_indices[cls]
            nind = noncls_indices[cls]
            lof.fit(df.iloc[ind])
            same_lof[ind] = lof.negative_outlier_factor_
            lofn.fit(df.iloc[nind])
            for i in ind:
                v = lofn.decision_function([df.iloc[i]])
                other_lof[i] = 1 / v if v != 0 else 10

        values = -1 * (same_lof + alfa * other_lof + beta * all_lof)

        return values
def perform_local_outlier_factor_novelty_detection(data):
    ''' Using the five patterns' counts, this method applies Local Outlier Factor, which computes
    the local density deviation of a given data point with respect to its neighbors.
    
    The experimentation is performed with different time chunks and numbers of sequences. '''

    # Importing necessary libraries
    from sklearn.neighbors import LocalOutlierFactor
    from sklearn.model_selection import train_test_split

    X = data.iloc[:, 0:4].values
    # standardize once, then project onto the first two principal components
    X = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))

    # Splitting the observations into 75% training and 25% testing
    X_train, X_test = train_test_split(X, test_size=0.25, random_state=42)

    # Local Outlier Factor classifier initialization and result generation
    classifier = LocalOutlierFactor(n_neighbors=20,
                                    novelty=True,
                                    contamination=0.1)
    classifier.fit(X_train)
    Y_pred_train = classifier.predict(X_train)
    Y_pred_test = classifier.predict(X_test)
    n_error_train = Y_pred_train[Y_pred_train == -1].size
    n_error_test = Y_pred_test[Y_pred_test == -1].size
    error_train = n_error_train / Y_pred_train.shape[0] * 100
    error_novel = n_error_test / Y_pred_test.shape[0] * 100

    # Visualization
    plt.clf()
    myFig = plt.figure(figsize=[10, 8])
    xx, yy = np.meshgrid(np.linspace(-3, 8, 500), np.linspace(-2.5, 4, 500))
    Z = classifier.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx,
                 yy,
                 Z,
                 levels=np.linspace(Z.min(), 0, 7),
                 cmap=plt.cm.PuBu)
    a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
    plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
    s = 60
    b1 = plt.scatter(X_train[:, 0],
                     X_train[:, 1],
                     c='white',
                     s=s,
                     edgecolors='k')
    b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='gold', s=s, edgecolors='k')
    plt.axis('tight')
    plt.legend([a.collections[0], b1, b2], [
        "Learned Frontier", "Training Observations", "New Regular Observations"
    ],
               loc="best",
               prop=matplotlib.font_manager.FontProperties(size=14))
    plt.xlabel("Error Train: %.2f%% and Error Novel Regular: %.2f%%" %
               (error_train, error_novel),
               fontsize=13,
               weight="bold")
    plt.yticks(fontsize=14)
    plt.xticks(fontsize=14)
    plt.title(
        'Novelty Detection using Local Outlier Factor of Ransomware Families\'\nSequence #1, #2, #3, and #4 Counts from 15 minutes of IRP Logs',
        fontsize=14,
        weight='bold')
    plt.show()

    # Save figure
    myFig.savefig(
        'sequence_mining_analysis/Results/novelty_detection/Local_Outlier_Factor/15_mins_sequences_1_2_3_4.png',
        format='png',
        dpi=150)
    myFig.savefig(
        'sequence_mining_analysis/Results/novelty_detection/Local_Outlier_Factor/15_mins_sequences_1_2_3_4.eps',
        format='eps',
        dpi=1200)
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# fit the model for novelty detection (novelty=True)
clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
clf.fit(X_train)
# DO NOT use predict, decision_function and score_samples on X_train as this
# would give wrong results but only on new unseen data (not used in X_train),
# e.g. X_test, X_outliers or the meshgrid
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# plot the learned frontier, the points, and the nearest vectors to the plane
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Novelty Detection with LOF")
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')

s = 40
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s,
                 edgecolors='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s,
                edgecolors='k')
plt.axis('tight')
plt.xlim((-5, 5))
Example #17
offset_: the offset used to obtain binary labels from the raw scores. Observations whose negative_outlier_factor_ is smaller than offset_ are detected as anomalies. The default offset is -1.5 (inlier scores are around -1) unless a contamination parameter different from "auto" is provided. In that case the offset is defined in such a way that the expected number of outliers is obtained on the training data.
'''

clf = LocalOutlierFactor(novelty=True)

#train the model
clf.fit(X_train[0:40])

#anomaly scores of the training data
print(clf.negative_outlier_factor_)

#predict whether each point is an outlier: inliers return 1, outliers return -1
print(clf.predict(mix_data))

#anomaly degree: the closer the LOF value is to 1, the more likely the sample is normal; the larger above 1, the more likely it is an outlier
y_score = -clf.decision_function(mix_data)
print(y_score)

# create the figure
fig = plt.figure()

# add subplots
ax1 = fig.add_subplot(121)
ax1.set_title("Labels vs. anomaly scores", fontproperties=font_set)
ax1.scatter(mix_lable, -clf.decision_function(mix_data), c=mix_lable)
ax1.set_xlabel('label', fontproperties=font_set)
ax1.set_ylabel('anomaly score', fontproperties=font_set)

ax2 = fig.add_subplot(122)
fpr, tpr, threshold = metrics.roc_curve(mix_lable, y_score)
auc = metrics.auc(fpr, tpr)
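A small self-contained sketch of the offset_ semantics described above: with novelty=True, predict() labels a point an inlier exactly when score_samples() - offset_ >= 0 (synthetic data assumed, not from the original example):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(0)
X_fit = rng.randn(100, 2)
X_new = np.vstack([rng.randn(5, 2), [[8.0, 8.0]]])

clf_demo = LocalOutlierFactor(novelty=True).fit(X_fit)
manual = np.where(clf_demo.score_samples(X_new) - clf_demo.offset_ >= 0, 1, -1)
assert (manual == clf_demo.predict(X_new)).all()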
Example #18
    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
    s_X_iforest = iforest.decision_function(X_train)
    print('LocalOutlierFactor processing...')
    lof.fit(X_train)
    s_X_lof = lof.decision_function(X_train)

    print('OneClassSVM processing...')
    ocsvm.fit(X_train)
    s_X_ocsvm = ocsvm.decision_function(X_train).reshape(1, -1)[0]
    
    s_unif_iforest = iforest.decision_function(unif)
    s_unif_lof = lof.decision_function(unif)
    s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]
    plt.subplot(121)
    
    print("t ist: " ,t)
    print("t_max ist : " , t_max)
    print("volume_support ist: " , volume_support)
    print("unif ist: ", unif)
    auc_iforest, em_iforest, amax_iforest = em(t, t_max,
Example #19
    outlier_ratio = 0.05
    normal = inlier_num
    data, __ = STL10_read()
    abnormal = list(range(10))
    abnormal.remove(normal)
    aa = data[normal]
    o_num = (aa.shape[0] / (1 - outlier_ratio) - aa.shape[0]) / 9
    #cut = np.shape(aa)[0]
    label = normal * np.ones((np.shape(aa)[0], 1))
    for i in abnormal:
        _ = data[i]
        # np.int was removed in recent NumPy; the builtin int is used instead
        index = np.random.choice(np.shape(_)[0], int(o_num))
        aa = np.vstack((aa, _[index]))
        label = np.vstack((label, i * np.ones((int(o_num), 1))))
    data = aa
    data = np.reshape(data, (-1, 96 * 96 * 3))

    clf = LocalOutlierFactor(n_neighbors=200,
                             novelty=True,
                             contamination=outlier_ratio)
    clf.fit(data)
    label_pred = clf.predict(data)
    TPR, TNR, F1 = performance(label, label_pred, normal)

    score = -clf.decision_function(data)
    fpr, tpr, thresholds = roc_curve(np.reshape(label, [np.shape(data)[0], 1]),
                                     score,
                                     pos_label=inlier_num)
    print('auc=')
    print(1 - auc(fpr, tpr))
Example #20
            X_test = X[n_samples_train:, :]
            y_train = y[:n_samples_train]
            y_test = y[n_samples_train:]

            # # training only on normal data:
            # X_train = X_train[y_train == 0]
            # y_train = y_train[y_train == 0]

            print('LocalOutlierFactor processing...')
            model = LocalOutlierFactor(n_neighbors=20)
            tstart = time()
            model.fit(X_train)
            fit_time += time() - tstart
            tstart = time()

            scoring = -model.decision_function(X_test)  # the lower, the more normal
            predict_time += time() - tstart
            fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

            if fit_time + predict_time > max_time:
                raise TimeoutError

            f = interp1d(fpr_, tpr_)
            tpr += f(x_axis)
            tpr[0] = 0.

            precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]

            # cluster: old version of scipy -> interpol1d needs sorted x_input
            arg_sorted = recall_.argsort()
            recall_ = recall_[arg_sorted]
def get_clustermodel(df_train,df_test,outliers_fraction):
    #Remove once pfunction in implemented
    #df_train = s_train
    #df_test = s_test
    
    ocsvm_max_train = 10000
    n_samples_train = df_train.shape[0]
      
     # define models:
    iforest = IsolationForest(max_samples=100, random_state=42, contamination=outliers_fraction)  # behaviour="new" was deprecated and later removed from scikit-learn
    lof = LocalOutlierFactor(n_neighbors=20, algorithm='auto', leaf_size=30,metric='minkowski',contamination=outliers_fraction,novelty=True)
    ocsvm = OneClassSVM(kernel='linear',gamma='auto', coef0=0.0, tol=0.001, nu=outliers_fraction, \
                shrinking=True, cache_size=500, verbose=False, max_iter=-1)
    print('end of iForest,lof and OCSVM model creation')
    
    iforest_model = iforest.fit(df_train)
    print('end of iForest model training')
    
    #With novelty=True (set above), Local Outlier Factor can score out-of-sample
    #points via predict/decision_function; without it, LOF only scores its training data.
    lof_model = lof.fit(df_train) 
    print('Local Outlier Factor test model completed')
    ocsvm_model =  ocsvm.fit(df_train[:min(ocsvm_max_train, n_samples_train - 1)])
    print('end of ocsvm model training!')
    
    #Anomaly Score
    iforest_anomalyscore = iforest.decision_function(df_test)#Predicts the anomaly score
    lof_anomalyscore = lof.decision_function(df_test)
    ocsvm_anomalyscore = ocsvm.decision_function(df_test)
    print('end of models - Anomaly score!')  
    
    

    #Outliers / Anomaly data Points
    #LOF - Use the Negative Factor (Value is output in Negative so get the distcint)
    
   # lof_outlier = lof_model.predict(df_test)
    #iforest_outlier = iforest_model.predict(df_test)
    #ocsvm_outlier = ocsvm_model.predict(df_test)
    

    
    #lof_y_pred=np.array(lof_outlier) #Convert to an array
    #lof_y_pred[lof_y_pred == 1] = 0
    #lof_y_pred[lof_y_pred == -1] = 1 #Anomalous score based LOF prediction

    #iforest_y_pred=np.array(iforest_outlier) #Convert to an array
    #iforest_y_pred[iforest_y_pred == 1] = 0
   # iforest_y_pred[iforest_y_pred == -1] = 1 #Anomalous score based iForest prediction
    
    #ocsvm_y_pred=np.array(ocsvm_outlier) #Convert to an array
    #ocsvm_y_pred[ocsvm_y_pred * (-1) == -1] = 1 #Anomalous score based OCSVM prediction
    #ocsvm_y_pred[ocsvm_y_pred * (-1) == 1] = 0
    
#    iforest_y_pred=np.array(iforest_anomalyscore) #Convert to an array
#    iforest_y_pred[iforest_anomalyscore>=np.percentile(iforest_anomalyscore,99)]=1 #Anomalous score based on the 99% percentile
#    iforest_y_pred[iforest_anomalyscore<np.percentile(iforest_anomalyscore,99)]=0
    
   # ocsvm_y_pred=np.array(ocsvm_anomalyscore) #Convert to an array
    #ocsvm_y_pred[ocsvm_anomalyscore>=np.percentile(ocsvm_anomalyscore,99)]=1 #Anomalous score based on the 99% percentile
    #ocsvm_y_pred[ocsvm_anomalyscore<np.percentile(ocsvm_anomalyscore,99)]=0
    
    return iforest_model,lof_model,ocsvm_model ,iforest_anomalyscore,lof_anomalyscore,ocsvm_anomalyscore
# Generate sample data
X_train, y_train, X_test, y_test = \
    generate_data(n_train=n_train,
                  n_test=n_test,
                  n_features=2,
                  contamination=contamination,
                  random_state=42)

# train LocalOutlierFactor
clf_name = 'LOF'
clf = LocalOutlierFactor(n_neighbors=3, novelty=False)

# get the prediction labels and outlier scores of the training data;
# with novelty=False, scikit-learn only allows fit_predict and the
# negative_outlier_factor_ attribute (predict/decision_function would raise)
y_train_pred = clf.fit_predict(X_train)  # binary labels (1: inliers, -1: outliers)
y_train_scores = clf.negative_outlier_factor_  # raw outlier scores (the lower, the more abnormal)

# get the prediction on the test data: a novelty=True model fitted on the
# training data can score unseen samples (but must not score X_train itself)
clf = LocalOutlierFactor(n_neighbors=3, novelty=True)
clf.fit(X_train)
y_test_pred = clf.predict(X_test)  # outlier labels (1 or -1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# Step 2: Determine the cut point
import matplotlib.pyplot as plt
plt.hist(y_test_scores, bins='auto')
plt.title("Histogram with LOF Anomaly Scores")
plt.show()

test_scores = pd.DataFrame({'Scores': y_test_scores, 'Labels': y_test_pred})
pd.DataFrame({
Example #25
knn_dist_all_grids, knn_ind_grids = model_knn.kneighbors(autoscaled_x_grids)
knn_dist_grids = knn_dist_all_grids.mean(axis=1)
knn_dist_grids = knn_dist_grids.reshape(xx.shape)
# plot
plt.title('k-NN')
plt.contour(xx,
            yy,
            knn_dist_grids,
            levels=[knn_dist_threshold],
            linewidths=2,
            colors='darkred')
plt.plot(x[:, 0], x[:, 1], 'x')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

# LOF
model_lof = LocalOutlierFactor(n_neighbors=k,
                               novelty=True,
                               contamination=rate_of_outliers)
model_lof.fit(autoscaled_x)
lof_grids = model_lof.decision_function(autoscaled_x_grids)
lof_grids = lof_grids.reshape(xx.shape)
# plot
plt.title('LOF')
plt.contour(xx, yy, lof_grids, levels=[0], linewidths=2, colors='darkred')
plt.plot(x[:, 0], x[:, 1], 'x')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
Example #26
data_scaled_means.to_csv("../../data/data_scaled_means.csv", index=False)
##############

### Machine learning models
## Isolation Forest

ilf = IsolationForest().fit(data_scaled_means)
answerIF_proba = abs(ilf.score_samples(data_scaled_means))
answerIF_proba = pd.DataFrame({'target': answerIF_proba})
pickle.dump(ilf, open("../../data/model/IsolationForest", "wb"))

## Local Outlier Factor

lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
lof.fit(data_scaled_means)
answerLOF_proba = lof.decision_function(data_scaled_means)
answerLOF_proba = 1 - ((answerLOF_proba - answerLOF_proba.min()) /
                       (answerLOF_proba.max() - answerLOF_proba.min()))
answerLOF_proba = pd.DataFrame({'target': answerLOF_proba})
pickle.dump(lof, open("../../data/model/LocalOutlierFactor", "wb"))

## Elliptic Envelope

ee = EllipticEnvelope()
ee.fit(data_scaled_means)
answerEE_proba = ee.decision_function(data_scaled_means)
answerEE_proba = 1 - (answerEE_proba - 3 * answerEE_proba.min()) * 10**12
answerEE_proba = pd.DataFrame({'target': answerEE_proba})
pickle.dump(ee, open("../../data/model/EllipticEnvelope", "wb"))

##############
Example #27
    AE.fit(X_train)
    ae_pred_proba = AE.predict_proba(X_test)[:, 1]
    aucs_ae_ws[r] = evaluate.AUC(ae_pred_proba, y_test)
auc_ae_ws = np.mean(aucs_ae_ws)

# --- one-class-SVM --- #
clf = svm.OneClassSVM(kernel="rbf")
clf.fit(X_train)
sklearn_score_anomalies = clf.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_svm_ws = evaluate.AUC(original_paper_score, y_test)

# --- LOF --- #
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)
sklearn_score_anomalies = lof.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_lof_ws = evaluate.AUC(original_paper_score, y_test)

# --- LODA --- #
aucs_loda_ws = np.zeros(num_of_experiments)
for r in tqdm(range(num_of_experiments)):
    loda = LODA()
    loda.fit(X_train)
    y_pred_proba_loda = np.zeros(X_test.shape[0])
    for i in tqdm(range(X_test.shape[0])):
        loda.fit(X_test[i, :].reshape(1, -1))
        y_pred_proba_loda[i] = loda.decision_function(X_test[i, :].reshape(
            1, -1))
    aucs_loda_ws[r] = evaluate.AUC(1 - y_pred_proba_loda, y_test)
auc_loda_ws = np.mean(aucs_loda_ws)
Example #28
import sys

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors
from sklearn.svm import OneClassSVM


class ApplicabilityDomain():
    def __init__(self,
                 method_name='ocsvm',
                 rate_of_outliers=0.01,
                 gamma='auto',
                 nu=0.5,
                 n_neighbors=10,
                 metric='minkowski',
                 p=2):
        """
        Applicability Domain (AD)
        
        Parameters
        ----------
        method_name: str, default 'ocsvm'
            The name of method to set AD. 'knn', 'lof', or 'ocsvm'
        rate_of_outliers: float, default 0.01
            Rate of outlier samples. This is used to set threshold
        gamma : (only for 'ocsvm') float, default 'auto'
            Kernel coefficient for 'rbf'. The default 'auto' optimizes gamma to maximize the variance of the Gram matrix
        nu : (only for 'ocsvm') float, default 0.5
            An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]. By default 0.5 will be taken.
            https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM
        n_neighbors: (only for 'knn' and 'lof') int, default 10
            Number of neighbors to use for each query
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        metric : string or callable, default 'minkowski'
            Metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        p : integer, default 2
            Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        """

        if method_name != 'knn' and method_name != 'lof' and method_name != 'ocsvm':
            sys.exit(
                'There is no ad method named \'{0}\'. Please check the variable of method_name.'
                .format(method_name))

        self.method_name = method_name
        self.rate_of_outliers = rate_of_outliers
        self.gamma = gamma
        self.nu = nu
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.p = p

    def fit(self, x):
        """
        Applicability Domain (AD)
        
        Set AD
    
        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            m x n matrix of X-variables of training data,
            m is the number of training samples and
            n is the number of X-variables
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            if self.gamma == 'auto':
                ocsvm_gammas = 2**np.arange(-20, 11, dtype=float)
                variance_of_gram_matrix = []
                for index, ocsvm_gamma in enumerate(ocsvm_gammas):
                    gram_matrix = np.exp(-ocsvm_gamma *
                                         cdist(x, x, metric='seuclidean'))
                    variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
                self.optimal_gamma = ocsvm_gammas[
                    variance_of_gram_matrix.index(
                        max(variance_of_gram_matrix))]
            else:
                self.optimal_gamma = self.gamma
            self.ad = OneClassSVM(kernel='rbf',
                                  gamma=self.optimal_gamma,
                                  nu=self.nu)
            self.ad.fit(x)
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        elif self.method_name == 'knn':
            self.ad = NearestNeighbors(n_neighbors=self.n_neighbors)
            self.ad.fit(x)
            knn_dist_all, knn_ind_all = self.ad.kneighbors(None)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            self.ad = LocalOutlierFactor(novelty=True,
                                         contamination=self.rate_of_outliers)
            self.ad.fit(x)
            ad_values = self.ad.negative_outlier_factor_ - self.ad.offset_

        self.offset = np.percentile(ad_values, 100 * self.rate_of_outliers)

    def predict(self, x):
        """
        Applicability Domain (AD)
        
        Predict AD-values 
    
        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            k x n matrix of X-variables of test data, which is autoscaled with training data,
            and k is the number of test samples
    
        Returns
        -------
        ad_values : numpy.array, shape (n_samples,)
            values lower than 0 means outside of AD
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        elif self.method_name == 'knn':
            knn_dist_all, knn_ind_all = self.ad.kneighbors(x)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        return ad_values - self.offset
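A hedged usage sketch for ApplicabilityDomain with the 'lof' method; the random training/test matrices stand in for autoscaled descriptors and are purely illustrative:

rng = np.random.RandomState(1)
x_train = rng.randn(200, 5)
x_test = np.vstack([rng.randn(20, 5), rng.randn(5, 5) * 4.0])

ad = ApplicabilityDomain(method_name='lof', rate_of_outliers=0.01)
ad.fit(x_train)
ad_values = ad.predict(x_test)  # values below 0 fall outside the AD
print((ad_values < 0).sum(), 'of', len(x_test), 'test samples outside the AD')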
Example #29
    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
    s_X_iforest = iforest.decision_function(X_test)
    print('LocalOutlierFactor processing...')
    lof = LocalOutlierFactor(n_neighbors=20)
    lof.fit(X_train)
    s_X_lof = lof.decision_function(X_test)
    print('OneClassSVM processing...')
    ocsvm = OneClassSVM()
    ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
    s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]
    s_unif_iforest = iforest.decision_function(unif)
    s_unif_lof = lof.decision_function(unif)
    s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]
    plt.subplot(121)
    auc_iforest, em_iforest, amax_iforest = em(t, t_max, volume_support,
                                               s_unif_iforest, s_X_iforest,
                                               n_generated)

    auc_lof, em_lof, amax_lof = em(t, t_max, volume_support, s_unif_lof,
                                   s_X_lof, n_generated)
Example #30
    def eval_model(self, x, criterion='distance'):
        self.r_net.eval()
        x_random = copy.deepcopy(x)
        np.random.shuffle(x_random)

        if criterion == 'distance':
            print('[INFO] Using criterion distance...')
            x = torch.FloatTensor(x)
            x_random = torch.FloatTensor(x_random)

            if self.USE_GPU:
                x = x.cuda()
                x_random = x_random.cuda()

            r_target = self.r_target_net(x)
            r_pred = self.r_net(x)
            gap_loss = torch.mean(F.mse_loss(r_pred,
                                             r_target,
                                             reduction='none'),
                                  dim=1)

            r_target_random = self.r_target_net(x_random).detach()
            r_pred_random = self.r_net(x_random)

            xy = F.normalize(r_target, p=1, dim=1) * F.normalize(
                r_target_random, p=1, dim=1)
            x_y_ = F.normalize(r_pred, p=1, dim=1) * F.normalize(
                r_pred_random, p=1, dim=1)
            pair_wise_loss = torch.mean(F.mse_loss(xy, x_y_, reduction='none'),
                                        dim=1)
            scores = gap_loss + pair_wise_loss
            return scores.data.cpu().numpy()
        elif criterion == 'lof':
            print('[INFO] Using criterion LOF...')
            x = torch.FloatTensor(x)
            if self.USE_GPU:
                x = x.cuda()
            with torch.no_grad():
                r_pred = self.r_net(x)

            representations = r_pred.cpu().numpy()

            clf = LocalOutlierFactor(novelty=True)
            clf.fit(representations)
            scores = 1 - clf.decision_function(representations)

            return scores
        elif criterion == 'iforest':
            print('[INFO] Using criterion iForest...')
            x = torch.FloatTensor(x)
            if self.USE_GPU:
                x = x.cuda()
            with torch.no_grad():
                r_pred = self.r_net(x)

            representations = r_pred.cpu().numpy()

            clf = IsolationForest()
            clf.fit(representations)
            scores = 1 - clf.decision_function(representations)

            return scores
        else:
            raise ValueError('Invalid criterion!')
Example #31
class Profile:
    def __init__(self, ip, train_size=100, tx_interval=-1, score_window=10):
        self.ip_addr = ip
        self.tx_interval = tx_interval
        self.train_size = train_size
        self.samples = []
        self.detector = LocalOutlierFactor(novelty=True)
        self.scaler = StandardScaler()
        self.KS_population = []
        self._updated = False
        self._last_vjits = ringwindow(15)
        self.score_window = score_window  # the averaging window used over the anomaly scores. Larger windows increase robustness but increase detection delay too.
        self._last_scores = ringwindow(self.score_window, 1)
        self._last_labels = ringwindow(self.score_window, 1)
        self.n_packets_lost_lastprobe = 0

    def set_ip(self, ip):
        self.ip_addr = ip
        self._updated = True

    def set_tx_interval(self, value):
        self.tx_interval = value
        self._updated = True

    def set_train_size(self, value):
        self.train_size = value
        self.samples = self.samples[np.max((
            len(self.samples) - self.train_size,
            0)):]  # take top most recent samples
        if not self.inTraining():  #refit model to current samples
            self.scaler.fit(np.vstack(self.samples))
            self.detector = self.detector.fit(
                self.scaler.transform(np.vstack(self.samples)))
        self._updated = True

    def trainProgress(self):
        return np.double(len(self.samples)) / np.double(self.train_size)

    def inTraining(self):
        return len(self.samples) < self.train_size

    def process(self, raw_probe, printProgress=False):
        #check probe integrity
        n_lost_packets = np.sum(np.array(
            raw_probe[1]) == 0)  #number of those with no response
        if n_lost_packets == len(raw_probe[1]):  #all packets were lost
            return -3, self._last_labels.get_mean()
        if n_lost_packets > 0:  #some packets were lost (we can't accurately compute the probe)
            self.n_packets_lost_lastprobe = n_lost_packets
            if n_lost_packets <= 200:  #we will still try and execute if only a few were lost
                # perform partial feature extraction
                x = self.extract_features_partial(raw_probe)

                # execute partial profile
                return self._process(x, printProgress, wasPartial=True)
            else:
                return -2, self._last_labels.get_mean()
        else:  #no packets lost:
            self.n_packets_lost_lastprobe = 0

            #perform feature extraction
            x = self.extract_features(raw_probe)

            #train/execute profile
            return self._process(x, printProgress)

    def _process(
        self,
        x,
        printProgress=False,
        wasPartial=False
    ):  #learns and then scores sample. If still in training, 0 is returned.
        if self.inTraining() and wasPartial:
            return -2, 1
        if self.inTraining():
            self.samples.append(x)
            self.samples = self.samples[np.max((
                len(self.samples) - self.train_size,
                0)):]  #take top most recent samples
            if not self.inTraining():
                self.scaler.fit(np.vstack(self.samples))
                self.detector = self.detector.fit(
                    self.scaler.transform(np.vstack(self.samples)))
            if printProgress:
                progressbar(self.train_size,
                            len(self.samples),
                            pretext="Training")
            self._updated = True
            return 1, 1.0
        else:
            if wasPartial:
                label = self.classify_sample(x)  #update scores
                label = -2
            else:
                label = self.classify_sample(x)
            score = self._last_labels.get_mean()
            return label, score

    def score_sample(self, x):
        if self.inTraining():
            return  #1.0
        else:  #model is trained
            return self._last_scores.insert_get_mean(
                self.detector.decision_function(self.scaler.transform(x))
                [0])  # * -1  # larger is more anomalous

    def classify_sample(self, x):
        if self.inTraining():
            return 1
        else:  #model is trained
            m_label = self._last_labels.insert_get_mean(
                self.detector.predict(
                    self.scaler.transform(x))[0])  #1:normal, -1:anomaly
            return -1 if m_label < 0 else 1

    def extract_features(self, raw_probe):
        tx_times = np.array(raw_probe[0])
        rx_times = np.array(raw_probe[1])
        mls_seq = np.array(raw_probe[2])

        # Feature 1: v_ie
        rtt = rx_times - tx_times
        rtt_f = np.fft.fft(rtt)
        mls_seq_f = np.fft.fft(mls_seq)
        v_ie = np.sum(np.abs(
            (rtt_f / mls_seq_f))**2) / len(rtt_f)  # total energy of impulse

        # Feature 2: v_dc
        if (mls_seq == 0).all():  # should not happen (means MLS was all zeros)
            v_dc = np.mean(rtt)
        else:
            v_dc = np.mean(rtt[
                mls_seq == 1])  # the average rtt of the largest payload pings

        # Feature 3: v_jit
        jitter = np.diff(rx_times, n=1)
        if len(self.KS_population) == 0:
            m_pv = 1
        else:
            pvs = np.zeros(len(self.KS_population))
            for i in range(len(self.KS_population)):
                pvs[i] = ks_2samp(self.KS_population[i], jitter)[1]  # element [1] is the KS p-value, matching the p-value semantics of pvs/m_pv
            m_pv = np.max(pvs)
        v_jit = 0.0 if m_pv < 0.1 else 1.0

        # update KS model
        set_size = 30
        if self.inTraining():
            if (len(self.KS_population) < set_size) or (np.random.rand() >
                                                        0.7):
                self.KS_population.append(jitter)
                self.KS_population = self.KS_population[np.max((
                    len(self.KS_population) - set_size, 0)):]
                self._updated = True
        return np.array([[v_ie, v_dc, v_jit]])

    def extract_features_partial(self, raw_probe):
        tx_times = np.array(raw_probe[0])
        rx_times = np.array(raw_probe[1])
        mls_seq = np.array(raw_probe[2])
        good = rx_times != 0
        rtt = rx_times[good] - tx_times[good]
        average_sample = np.mean(np.vstack(self.samples), axis=0)

        # Feature 1: v_ie AVERAGE (not tested)
        v_ie = average_sample[0]

        # Feature 2: v_dc
        if (mls_seq == 0).all():  # should not happen (means MLS was all zeros)
            v_dc = np.mean(rtt)
        else:
            v_dc = np.mean(
                rtt[mls_seq[good] ==
                    1])  # the average rtt of the largest payload pings

        # Feature 3: v_jit AVERAGE (not tested)
        v_jit = average_sample[2]

        return np.array([[v_ie, v_dc, v_jit]])
Example #32
def identify_outliers(df,algorithm=0, detailed=False):
    """Identifies outliers in multiple dimensions. 
    
    The dataset has to be parsed as numeric beforehand.
    """

#     df_exclude_target = df[df.columns.difference([target])] # exclude target from data
    df_exclude_target = df.iloc[:,:-1]
    df_numeric = df_exclude_target.select_dtypes(include=[np.number]) # keep only numeric type features
    total_length = len(df_numeric) # total length of the dataframe, used for computing contamination later
#     print(total_length)
    
    outliers_count = np.zeros(len(df_numeric.columns)) # number of outliers of each feature
    dict_outliers = {}
    flag = False
    df_union = pd.DataFrame()
    for i, col in enumerate(df_numeric.columns):
#         if(df_numeric[col].dtype in [np.number]): # bug! to be figured out 

        # first detect outliers in each column
        # keep only the ones that are out of +3 to -3 standard deviations in the column 'Data'.
        dict_outliers[col] = df_numeric[~(np.abs(df_numeric[col]-df_numeric[col].mean())<(3*df_numeric[col].std()))] # ~ means the other way around
        # combine all the rows containing outliers in one feature
        df_union = df_union.combine_first(dict_outliers[col])
#             print(dict_outliers[col])
        if len(dict_outliers[col]) != 0:
            outliers_count[i] = len(dict_outliers[col])
            flag = True
            if detailed:                    
                print("There are {} outliers in variable {}".format(len(dict_outliers[col]), col))
                print(dict_outliers[col][col])
                print("")
        else:
            if detailed:
                print("No outliers are detected in variable {}".format(col))
                print("")
    
    # boxplot: show outliers in each feature
    # feature scaling
    ss = StandardScaler()
    df_scaled = ss.fit_transform(df_numeric)
    df_scaled = pd.DataFrame(df_scaled, columns=df_numeric.columns)
    df_scaled.head()
    # draw box plot for numeric variables
    fig = plt.figure(figsize=(6, 4))
    fig.subplots_adjust(top=0.93, wspace=0)
    ax = sns.boxplot(data=df_scaled, palette="Set1")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    plt.show()
    
    # Two options to estimate the propotion of outliers
    # One is to take the number of outliers in the feature containing most outliers
    # The other is to take the length of the union of rows containing outliers in any feature
#     print(outliers_count)
#     print(df_union)
#     max_outliers = max(outliers_count)
    max_outliers = len(df_union) 
#     print("max outliers number is {}".format(max_outliers))
#     if flag:
#         print("Outliers detected")
#         print("")
#     else:
#         print("No outliers detected")
#         print("")
#     plt.show()
    contamination = max_outliers / total_length
    X = np.asarray(df_numeric)

    if algorithm == 2:
        clf = svm.OneClassSVM(nu=0.95 * contamination + 0.05)
        clf.fit(X)
        y_pred = clf.predict(X)
    elif algorithm == 1:
        clf = LocalOutlierFactor(n_neighbors=20, contamination=contamination)
        y_pred = clf.fit_predict(X)
    else:
        clf = IsolationForest(contamination = contamination)
        clf.fit(X)
        y_pred = clf.predict(X)
#     print(y_pred)
    outlier_index, = np.where(y_pred == -1)
    df_outliers = df_numeric.iloc[outlier_index.tolist()]
#     print(outlier_index)
    if algorithm == 1:
        anomaly_score = y_pred # in LOF, decision_function is only available with novelty=True, so the labels serve as a stand-in score
    else:
        anomaly_score = clf.decision_function(X) # anomaly score of the input samples: the lower, the more abnormal
    anomaly_score = pd.DataFrame(anomaly_score, columns=['anomaly_score'])
    df_with_anomaly_score = pd.concat([df, anomaly_score], axis=1)
    df_sorted = df_with_anomaly_score.sort_values(by='anomaly_score')
    cm = sns.diverging_palette(10, 220, sep=80, n=7, as_cmap=True)
    df_styled = df_sorted.style.background_gradient(cmap=cm, subset=['anomaly_score']).apply(highlight_outlier, subset=df_sorted.columns[:-1])
#     print("*********************************************")
#     print("Outliers detected in multi dimensional space:")
#     print("*********************************************")

#     print(df_numeric.iloc[outlier_index.tolist()])
    df_pred = pd.DataFrame(y_pred, columns=['pred'])
    display(df_styled)
    return df_scaled, df_styled, df_outliers, df_pred, outliers_count