def perform_outlier_detection(self, X, len_priors):
    # Isolation Forest on the 'user' feature domain, LOF on every other domain
    scores = dict()
    for key, value in X.items():
        if key == 'user':
            clf = IsolationForest()
            clf.fit(value)
            scores[key] = clf.decision_function(value)
        else:
            clf = LocalOutlierFactor(n_neighbors=20)
            clf.fit(value)
            check_is_fitted(clf, ["threshold_", "negative_outlier_factor_", "n_neighbors_", "_distances_fit_X_"])
            if value is not None:
                value = check_array(value, accept_sparse='csr')
                scores[key] = clf._decision_function(value)
            else:
                scores[key] = clf.negative_outlier_factor_
    with open('clique_expansion/' + self.seed_user + '_unnormalized_scores.csv', 'w') as f:
        for domain, all_scores in scores.items():
            for item in all_scores:
                f.write(str(item) + ',')
            f.write('\n')
    combined_scores = self.combine(scores)
    scores = None
    new_scores = combined_scores[len_priors:]
    user_scores = sorted(zip(self.current_level_users, new_scores), key=lambda x: x[1], reverse=True)
    threshold = np.percentile(new_scores, 95)
    outliers = [u[0] for u in user_scores if u[1] >= threshold]
    return outliers
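The combine helper used above is not shown in this example; a minimal sketch of one way such per-domain score vectors could be merged (min-max normalize each domain, then average), assuming every domain scored the same instances in the same order. The name combine_scores is illustrative, not taken from the original code:

import numpy as np

def combine_scores(scores):
    """Hypothetical stand-in for self.combine: min-max normalize each
    domain's score vector, then average across domains."""
    normalized = []
    for domain, s in scores.items():
        s = np.asarray(s, dtype=float)
        span = s.max() - s.min()
        normalized.append((s - s.min()) / span if span > 0 else np.zeros_like(s))
    # element-wise mean across domains -> one combined score per instance
    return np.mean(normalized, axis=0)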
def perform_outlier_detection_all_combos(self, X, len_priors):
    # Both Isolation Forest and LOF on every feature domain except 'user'
    scores = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
    print("Starting anomaly detection loop")
    for key, value in X.items():
        if key == 'user':
            continue
        print(key)
        clf = IsolationForest()
        clf.fit(value)
        scores[key]['iforest'] = clf.decision_function(value)
        print("Finished iforest")
        clf = LocalOutlierFactor(n_neighbors=20)
        clf.fit(value)
        scores[key]['lof'] = clf._decision_function(value)
    print("Finished anomaly detection loop")
    with open('clique_expansion/' + self.seed_user + '_unnormalized_scores.csv', 'w') as f:
        for domain, value in scores.items():
            for type_score, all_scores in value.items():
                f.write(domain + ' ' + type_score + ',')
                for item in all_scores:
                    f.write(str(item) + ',')
                f.write('\n')
    combined_scores = self.combine_all(scores)
    scores = None
    new_scores = combined_scores[len_priors:]
    user_scores = sorted(zip(self.current_level_users, new_scores), key=lambda x: x[1], reverse=True)
    threshold = np.percentile(new_scores, 8)
    outliers = [u[0] for u in user_scores if u[1] <= threshold]
    return outliers
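Both methods above score new points through LocalOutlierFactor._decision_function, a private API in older scikit-learn releases. In current scikit-learn the supported way to score samples other than the training set is novelty mode; a minimal, self-contained sketch:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

X_train = np.random.RandomState(0).randn(200, 5)
X_new = np.random.RandomState(1).randn(20, 5)

# novelty=True exposes the public decision_function for data outside the fit set
lof = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof.fit(X_train)
new_scores = lof.decision_function(X_new)   # higher = more normal

# without novelty mode, only the training points get scores
lof_fit = LocalOutlierFactor(n_neighbors=20)
lof_fit.fit(X_train)
train_scores = lof_fit.negative_outlier_factor_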
Example #3
def find_anomalies_with_shingles(ts,
                                 window_size=5,
                                 skip_size=None,
                                 ad_type="ifor",
                                 n_top=10,
                                 outliers_fraction=0.1):
    """ Finds anomalous regions in time series using standard unsupervised detectors

    First the time series is chopped up into windows ('shingles').
    Then, a standard anomaly detector is run.
    """
    x = w = None
    n = 0
    for x_, _, w in ts.get_shingles(window_size,
                                    skip_size=skip_size,
                                    batch_size=-1):
        x = np.reshape(x_, newshape=(x_.shape[0], -1))
        n = x.shape[0]
        logger.debug("Total instances: %d" % n)
        # logger.debug("Windows:\n%s" % str(w))

    if False:
        feature_ranges = get_sample_feature_ranges(x)
        logger.debug("feature_ranges:\n%s" % str(feature_ranges))

    scores = None
    if ad_type == "ocsvm":
        ad = svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)
        ad.fit(x)
        scores = -ad.decision_function(x).reshape((n, ))
    elif ad_type == "ifor":
        ad = IsolationForest(max_samples=256,
                             contamination=outliers_fraction,
                             random_state=None)
        ad.fit(x)
        scores = -ad.decision_function(x)
    elif ad_type == "lof":
        ad = LocalOutlierFactor(n_neighbors=35,
                                contamination=outliers_fraction)
        ad.fit(x)
        scores = -ad._decision_function(x)
    elif ad_type == "autoenc":
        n_hiddens = max(1, window_size // 2)
        ad = AutoencoderAnomalyDetector(
            n_inputs=x.shape[1],
            n_neurons=[300, n_hiddens, 300],
            normalize_scale=True,
            activations=[tf.nn.tanh, tf.nn.tanh, tf.nn.tanh, None])
        ad.fit(x)
        scores = -ad.decision_function(x)

    top_anoms = np.argsort(-scores)[0:n_top]
    logger.debug("top scores (%s):\n%s\n%s" %
                 (ad_type, str(top_anoms), str(scores[top_anoms])))

    pdfpath = "temp/timeseries/timeseries_shingles_w%d_%s.pdf" % (window_size,
                                                                  ad_type)
    dp = DataPlotter(pdfpath=pdfpath, rows=3, cols=1)
    pl = dp.get_next_plot()
    pl.set_xlim([0, ts.samples.shape[0]])
    pl.plot(np.arange(0, ts.samples.shape[0]), ts.samples, 'b-', linewidth=0.5)

    for i in top_anoms:
        if w[i] + window_size <= len(ts.samples):
            pl.plot(np.arange(w[i], w[i] + window_size),
                    ts.samples[w[i]:(w[i] + window_size)], 'r-')
    dp.close()
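ts.get_shingles belongs to the surrounding codebase and is not shown; the shingling idea itself (fixed-length sliding windows over a 1-D series) can be sketched with plain numpy. make_shingles below is a hypothetical helper, not the library function:

import numpy as np

def make_shingles(series, window_size, skip_size=1):
    """Turn a 1-D series into fixed-length windows ('shingles').

    Returns (X, starts): X has one row per window, starts holds each
    window's offset into the original series (analogous to w above).
    """
    series = np.asarray(series).ravel()
    starts = np.arange(0, len(series) - window_size + 1, skip_size)
    X = np.stack([series[s:s + window_size] for s in starts])
    return X, starts

# Example: a sine wave with an injected spike; each row of X is one window
t = np.linspace(0, 20, 500)
y = np.sin(t)
y[250] += 5.0
X, starts = make_shingles(y, window_size=5, skip_size=1)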
Example #4
        x_grid = np.c_[xx.ravel(), yy.ravel()]

    if ad_type == "ocsvm":
        ad = svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)
        ad.fit(x)
        scores = -ad.decision_function(x).reshape((n,))
        Z = -ad.decision_function(x_grid)
    elif ad_type == "ifor":
        ad = IsolationForest(max_samples=256, contamination=outliers_fraction, random_state=None)
        ad.fit(x)
        scores = -ad.decision_function(x)
        Z = -ad.decision_function(x_grid)
    elif ad_type == "lof":
        ad = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
        ad.fit(x)
        scores = -ad._decision_function(x)
        Z = -ad._decision_function(x_grid)
    elif ad_type == "loda":
        ad = Loda(mink=100, maxk=200)
        ad.fit(x)
        scores = -ad.decision_function(x)
        Z = -ad.decision_function(x_grid)

    logger.debug("scores:\n%s" % str(list(scores)))
    top_anoms = np.argsort(-scores)[np.arange(10)]

    if args.plot:
        # plot_samples_and_lines(x, lines=None, line_colors=None, line_legends=None,
        #                        top_anoms=top_anoms, pdfpath="temp/%s_%soutlier.pdf" % (ad_type, sample_type))
        Z = Z.reshape(xx.shape)
        pdfpath = "temp/ad_%scontours_%s.pdf" % (sample_type, ad_type)
Example #5
        if clustering_model_type == 'SVM':
            model = svm.OneClassSVM(nu=model_param[0],
                                    kernel="rbf",
                                    gamma="auto")
            model.fit(X)
            score = model.decision_function(Z)
            score = [s[0] for s in score]
        elif clustering_model_type == 'IF':
            model = IsolationForest(max_samples=n_samples,
                                    contamination=model_param[0],
                                    random_state=rng)
            model.fit(X)
            score = model.decision_function(Z)
        elif clustering_model_type == 'LOF':
            model = LocalOutlierFactor(n_neighbors=model_param[1],
                                       contamination=model_param[0])
            model.fit_predict(X)
            score = model._decision_function(Z)

        # Save Z
        Z_with_word = pd.DataFrame(list(zip(Z.index, score)))
        Z_with_word = Z_with_word.sort_values(by=1, ascending=False)
        name = (
            dist + '/clustering' + '-' + ('50' if w2v_param == 0 else '200') +
            '-' + str(count_threshold) + '-' + a_type + '-' +
            clustering_model_type + '-' + str(model_param[0]) +
            str('' if len(model_param) == 1 else '-' + str(model_param[1])))
        print('save file. name is "' + name + '"')
        Z_with_word.to_csv(name, header=None, index=False)
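The [s[0] for s in score] comprehension in the SVM branch flattens the (n_samples, 1) column that older scikit-learn versions returned from OneClassSVM.decision_function; np.ravel handles both that shape and the current 1-D return. A short sketch (the data here is synthetic and only illustrative):

import numpy as np
from sklearn import svm

X = np.random.RandomState(0).randn(100, 3)   # training vectors
Z = np.random.RandomState(1).randn(10, 3)    # vectors to rank

model = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma="auto")
model.fit(X)
score = np.ravel(model.decision_function(Z))  # always 1-D, old or new sklearn

# rank most-normal first, as the snippet does before saving to CSV
order = np.argsort(-score)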
def find_anomalies_with_shingles(dataset,
                                 data,
                                 window_size=5,
                                 skip_size=None,
                                 ad_type="ifor",
                                 normalize_trend=False,
                                 n_top=10,
                                 outliers_fraction=0.1,
                                 log_transform=False):
    """ Finds anomalous regions in time series using standard unsupervised detectors

    First the time series is chopped up into windows ('shingles').
    Then, a standard anomaly detector is run.
    """
    x = w = None
    n = 0
    ts_data = data

    if log_transform:
        # log-transform now since the values are positive (in context of
        # many real-world datasets like airline); otherwise, values become
        # negative after de-trending
        ts_data = log_transform_series(ts_data, eps=1.0)

    if normalize_trend:
        # remove trend from series
        ts_data = difference_series(ts_data)

    ts = TSeries(ts_data, y=None)
    for x_, _, w in ts.get_shingles(window_size,
                                    skip_size=skip_size,
                                    batch_size=-1):
        x = np.reshape(x_, newshape=(x_.shape[0], -1))
        n = x.shape[0]
        logger.debug("Total instances: %d" % n)
        # logger.debug("Windows:\n%s" % str(w))

    if False:
        feature_ranges = get_sample_feature_ranges(x)
        logger.debug("feature_ranges:\n%s" % str(feature_ranges))

    scores = None
    if ad_type == "ocsvm":
        ad = svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)
        ad.fit(x)
        scores = -ad.decision_function(x).reshape((n, ))
    elif ad_type == "ifor":
        ad = IsolationForest(max_samples=min(256, x.shape[0]),
                             contamination=outliers_fraction,
                             random_state=None)
        ad.fit(x)
        scores = -ad.decision_function(x)
    elif ad_type == "lof":
        ad = LocalOutlierFactor(n_neighbors=35,
                                contamination=outliers_fraction)
        ad.fit(x)
        scores = -ad._decision_function(x)
    elif ad_type == "autoenc":
        n_hiddens = max(1, window_size // 2)
        ad = AutoencoderAnomalyDetector(
            n_inputs=x.shape[1],
            n_neurons=[300, n_hiddens, 300],
            normalize_scale=True,
            activations=[tf.nn.tanh, tf.nn.tanh, tf.nn.tanh, None])
        ad.fit(x)
        scores = -ad.decision_function(x)

    top_anoms = np.argsort(-scores)[0:n_top]
    logger.debug("top scores (%s):\n%s\n%s" %
                 (ad_type, str(top_anoms), str(scores[top_anoms])))

    pdfpath = "temp/timeseries/timeseries_shingles_%s_w%d%s_%s.pdf" % \
              (dataset, window_size, "" if not log_transform else "_log", ad_type)
    dp = DataPlotter(pdfpath=pdfpath, rows=2, cols=1)

    # plot the timeseries anomalies with the detrended series
    pl = dp.get_next_plot()
    pl.set_xlim([0, ts.samples.shape[0]])
    pl.plot(np.arange(0, ts.samples.shape[0]), ts.samples, 'b-', linewidth=0.5)

    for i in top_anoms:
        if w[i] + window_size <= len(ts.samples):
            pl.plot(np.arange(w[i], w[i] + window_size),
                    ts.samples[w[i]:(w[i] + window_size)], 'r-')

    if normalize_trend:
        # plot the original series with anomalous windows
        pl = dp.get_next_plot()
        pl.set_xlim([0, data.shape[0]])
        pl.plot(np.arange(0, data.shape[0]), data, 'b-', linewidth=0.5)

        for i in top_anoms:
            if w[i] + window_size <= len(data):
                pl.plot(np.arange(w[i], w[i] + window_size),
                        data[w[i]:(w[i] + window_size)], 'r-')

    dp.close()
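log_transform_series and difference_series come from the surrounding codebase and are not shown; a plausible minimal version of both, assuming the standard log-shift and first-difference treatment that the comments describe:

import numpy as np

def log_transform_series(ts_data, eps=1.0):
    """Assumed behavior: elementwise log after shifting by eps so that
    strictly positive data (e.g. passenger counts) stays finite."""
    return np.log(np.asarray(ts_data, dtype=float) + eps)

def difference_series(ts_data):
    """Assumed behavior: first-order differencing to remove trend."""
    ts_data = np.asarray(ts_data, dtype=float)
    return ts_data[1:] - ts_data[:-1]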