def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = IsolationForest(contamination=0.1).fit(X_train)
    clf2 = IsolationForest().fit(X_train)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf1.decision_function([[2., 2.]]) + clf1.offset_)
    assert_array_equal(clf2.score_samples([[2., 2.]]),
                       clf2.decision_function([[2., 2.]]) + clf2.offset_)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf2.score_samples([[2., 2.]]))
def test_iforest_works(contamination):
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng, contamination=contamination)
    clf.fit(X)
    decision_func = -clf.decision_function(X)
    pred = clf.predict(X)
    # assert detect outliers:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_array_equal(pred, 6 * [1] + 2 * [-1])
def outlier_removal(df, col, method, params):
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        raise ValueError("Unsupported outlier removal method: %s" % method)
    do_outlier_removal.fit(np.array(df[col]))
    if method == 'Isolation Forest':
        outlier_scores = do_outlier_removal.decision_function(np.array(df[col]))
        df[('meta', 'Outlier Scores - ' + method + str(params))] = outlier_scores
        is_outlier = do_outlier_removal.predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
    if method == 'Local Outlier Factor':
        is_outlier = do_outlier_removal.fit_predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
        df[('meta', 'Outlier Factor - ' + method + str(params))] = do_outlier_removal.negative_outlier_factor_
    return df, do_outlier_removal
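A minimal usage sketch for the helper above (everything here is illustrative: `frame` is assumed to have two-level columns so the `('meta', ...)` assignments work, and the imports are the ones the helper itself needs):

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Hypothetical DataFrame with MultiIndex columns
frame = pd.DataFrame(np.random.randn(100, 2),
                     columns=pd.MultiIndex.from_tuples([('data', 'x'), ('data', 'y')]))
frame, fitted = outlier_removal(frame, [('data', 'x'), ('data', 'y')],
                                'Isolation Forest', {'contamination': 0.1})

# The new score/label columns all live under the 'meta' level
outlier_cols = [c for c in frame.columns if c[0] == 'meta']
print(frame[outlier_cols].head())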
def test_iforest_performance():
    """Test Isolation Forest performs well"""

    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = - clf.decision_function(X_test)

    # check that the anomaly scores separate outliers from inliers (high ROC AUC)
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
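The plotting fragment below starts mid-example; here is a minimal setup sketch so it runs on its own (the names `rng` and `X_train` and this particular training set are assumptions, mirroring the two-cluster toy data used in the other examples):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
# Two clusters of regular training observations around (2, 2) and (-2, -2)
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]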
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([b1, b2, c],
           ["training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper left")
Example #6
X_test = np.r_[X + 2, X - 2]  # stack row-wise, shape (40, 2)
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))  # shape (20, 2)

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(
    X_train
)  # Train an iForest. It is unsupervised, but it still cannot score unseen data without training: fit on the unlabeled set first, then call predict
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(),
                                yy.ravel()])  # stack column-wise into shape (2500, 2) and evaluate the decision function on the grid
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)  # plot the decision boundary; different regions get different colors

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
c = plt.scatter(X_outliers[:, 0],
                X_outliers[:, 1],
                c='red',
                s=20,
                edgecolor='k')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
def Eval(clargs):    
    __version__ = '1.0'
    usage = """train_flows [options] normaldatafile"""
    parser = OptionParser(usage=usage, version=__version__)

    parser.add_option("-x", "--vectorizerfile", action="store", type="string", \
                      default='/tmp/vectorizers.pkl', help="")
    parser.add_option("-v", "--verbose", action="store_true", default=False, \
                      help="enable verbose output")
    parser.add_option("-o", "--maliciousdatafile", action="store", type="string", \
                      default=None, help="An optional file of malicious http logs")
    parser.add_option("-m", "--maxfeaturesperbag", action="store", type="int", \
                      default=100, help="maximum number of features per bag")
    parser.add_option("-g", "--ngramsize", action="store", type="int", \
                      default=7, help="ngram size")

    parser.add_option("-f", "--features", action="store", type="string", \
                      default="01000100111111111111", help="An optional file for choosing which features to be extracted")
    parser.add_option("-t", "--maxtrainingfeatures", action="store", type="int", \
                      default=50000, help="maximum number of rows to train with per class")
    parser.add_option("-n", "--numtrees", action="store", type="int", \
                      default=200, help="number of trees in isolation forest")
    parser.add_option("-s", "--numsamples", action="store", type="int", \
                      default=8192, help="number of samples in each tree")


    Start=time.time()
    (opts, args) = parser.parse_args(clargs)

    if len(args) != 2:
        parser.error('Incorrect number of arguments')

    ftu=[]
    features = opts.features

    for i, j in enumerate(features):
      if opts.verbose: print(j, all_fields[i])
      if j == '1':
        ftu.append(all_fields[i])

    if opts.verbose: print(ftu)
    #ftu = ['method', 'user_agent', 'status_code']


    # load the http data in to a data frame
    print('Loading HTTP data')
    df = load_brofile(args[0], fields_to_use)
    trainDf = load_brofile(args[1], fields_to_use)


    total_rows = len(df.index)
    if opts.verbose: print('Total number of rows: %d' % total_rows)
    if opts.maliciousdatafile is not None:
      print('Reading malicious training data')
      df1 = load_brofile(opts.maliciousdatafile, fields_to_use)
      if opts.verbose: print('Read malicious data with %s rows ' % len(df1.index))
      #if (len(df1.index) > opts.maxtrainingfeatures):
      #  if opts.verbose: print('Too many malicious samples for training, downsampling to %d' % opts.maxtrainingfeatures)
      #  df1 = df1.sample(n=opts.maxtrainingfeatures)

      #set the classes of the dataframes and then stitch them together in to one big dataframe
      df['class'] = 0
      df1['class'] = 1
      classedDf = pd.concat([df,df1], ignore_index=True)
    else:
      #we weren't passed a file containing class-1 data, so we should generate some of our own.
      noiseDf = create_noise_contrast(df, numSamples)
      if opts.verbose: print('Added %s rows of generated malicious data'%numSamples)
      df['class'] = 0
      noiseDf['class'] = 1
      classedDf = pd.concat([df,noiseDf], ignore_index=True)

    #the class label of the training file is not used, so just set it to 0
    trainDf['class'] = 0


    #splitting into training and evaluation sets
    classedDf['is_train']=False
    trainDf['is_train']=True

    enhancedDf = enhance_flow(pd.concat([trainDf,classedDf], ignore_index=True), ftu)
    # construct some vectorizers based on the data in the DF. We need to vectorize future log files the exact same way so we
    # will be saving these vectorizers to a file.

    vectorizers = build_vectorizers(enhancedDf, ftu, max_features=opts.maxfeaturesperbag, ngram_size=opts.ngramsize, verbose=opts.verbose)

    #use the vectorizers to featureize our DF into a numeric feature dataframe
    featureMatrix = featureize(enhancedDf, ftu, vectorizers, verbose=opts.verbose)

    #add the class column back in (it wasn't featurized by itself)
    featureMatrix['class'] = enhancedDf['class']
    featureMatrix['is_train'] = enhancedDf['is_train']


    #split out the train and test df's into separate objects
    train, test = featureMatrix[featureMatrix['is_train']==True], featureMatrix[featureMatrix['is_train']==False]

    #drop the is_train column, we don't need it anymore
    train = train.drop('is_train', axis=1)
    test = test.drop('is_train', axis=1)


    #print('Calculating features')


    Trees=opts.numtrees
    Samples=opts.numsamples
    clf = IsolationForest(n_estimators=Trees, max_samples=Samples)

    
    clf.fit(train.drop('class', axis=1))

    testnoclass = test.drop('class', axis=1)

    print('Predicting')

    test.is_copy = False

    test['prediction'] = clf.decision_function(testnoclass) + 0.5

    print('Analyzing')
    #get the class-1 (outlier/anomaly) rows from the feature matrix, and drop the prediction so we can investigate them

    ##From Here
    Left=0.001 
    Right=0.01
    
    fpr, tpr, thresholds = roc_curve(test['class'], test['prediction'], pos_label=0)
    
    F=interpolate.interp1d(fpr, tpr, assume_sorted=True)
    x=np.logspace(np.log10(Left), np.log10(Right))
    y=F(x)
    roc_auc=auc(x, y)

    plt.figure()
    plt.xscale('log')

    plt.plot(fpr, tpr, color='b')
    plt.plot(x,y, color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')


    plt.plot(plt.xlim(), plt.ylim(), ls="--", c=".3")
    plt.savefig("fig3.png")
    plt.clf()
    plt.close('all')


    print('Area Under the Curve = %.6f' %(roc_auc))



    Min, Sec= divmod( int(time.time() - Start), 60 )
    #print Min, Sec

    target= open('Results.txt', 'a')
    target.write(str(Trees)+' ')
    target.write(str(Samples)+' ')
    target.write(str(Min)+' ')
    target.write(str(Sec)+' ')
    target.write(str(roc_auc))
    target.write("\n")
    target.write(str(features))
    target.write("\n")
    target.write("\n")
    target.close()

    
    print("Minutes: %d, Seconds: %d" % (int(Min), int(Sec)) )
    return roc_auc 
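The comment before build_vectorizers says the fitted vectorizers must be reusable on future log files, and Eval defines a -x/--vectorizerfile option, but the snippet never writes them out. A hedged sketch of that persistence step (the pickle-based helpers are assumptions, not part of the original code):

import pickle

def save_vectorizers(vectorizers, path):
    # Serialize the fitted vectorizers so future log files can be
    # featurized exactly the same way.
    with open(path, 'wb') as fp:
        pickle.dump(vectorizers, fp)

def load_vectorizers(path):
    with open(path, 'rb') as fp:
        return pickle.load(fp)

# e.g. inside Eval(), right after build_vectorizers(...):
#     save_vectorizers(vectorizers, opts.vectorizerfile)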
Example #8
class IForest(BaseDetector):
    """Wrapper of scikit-learn Isolation Forest with more functionalities.

    The IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum and
    minimum values of the selected feature.
    See :cite:`liu2008isolation,liu2012isolation` for details.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produce shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    :param n_estimators: The number of base estimators in the ensemble.
    :type n_estimators: int, optional (default=100)

    :param max_samples: The number of samples to draw from X to train
        each base estimator.

            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
            - If "auto", then `max_samples=min(256, n_samples)`.
    :type max_samples: int or float, optional (default="auto")

    :param contamination: The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
    :type contamination: float in (0., 0.5), optional (default=0.1)

    :param max_features: The number of features to draw from X to
        train each base estimator.

            - If int, then draw `max_features` features.
            - If float, then draw `max_features * X.shape[1]` features.
    :type max_features: int or float, optional (default=1.0)

    :param bootstrap: If True, individual trees are fit on random subsets of
        the training data sampled with replacement. If False, sampling without
        replacement is performed.
    :type bootstrap: bool, optional (default=False)

    :param n_jobs: The number of jobs to run in parallel for both `fit` and
        `predict`. If -1, then the number of jobs is set to the number of cores
    :type n_jobs: int, optional (default=1)

    :param random_state: If int, random_state is the seed used by the random
        number generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`.
    :type random_state: int, RandomState instance or None, optional
        (default=None)

    :param verbose: Controls the verbosity of the tree building process.
    :type verbose: int, optional (default=0)

    :var estimators\_: The collection of fitted sub-estimators.
    :vartype estimators\_: list

    :var estimators_samples\_: The subset of drawn samples (i.e., the
        in-bag samples) for each base estimator.
    :vartype estimators_samples\_: list of arrays

    :var max_samples\_: The actual number of samples.
    :vartype max_samples\_: int
    """
    def __init__(self,
                 n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(IForest, self).__init__(contamination=contamination)
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y=None):
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                         max_samples=self.max_samples,
                                         contamination=self.contamination,
                                         max_features=self.max_features,
                                         bootstrap=self.bootstrap,
                                         n_jobs=self.n_jobs,
                                         random_state=self.random_state,
                                         verbose=self.verbose)
        self.detector_.fit(X=X, y=None, sample_weight=None)

        # invert decision_scores_; outliers come with higher outlier scores
        self.decision_scores_ = self.detector_.decision_function(X) * -1
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # invert decision_scores_; outliers come with higher outlier scores
        return self.detector_.decision_function(X) * -1

    @property
    def estimators_(self):
        """The collection of fitted sub-estimators.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_

    @property
    def estimators_samples_(self):
        """The subset of drawn samples (i.e., the in-bag samples) for
        each base estimator.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_samples_

    @property
    def max_samples_(self):
        """The actual number of samples.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.max_samples_
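A short usage sketch for the wrapper above (the training data is made up; per the code, decision_scores_ are the negated sklearn scores so higher means more abnormal, and labels_ comes from _process_decision_scores() in the base class, assumed here to be 0 for inliers and 1 for outliers):

import numpy as np

rng = np.random.RandomState(42)
X_train = np.r_[0.3 * rng.randn(100, 2) + 2,        # inlier cluster
                rng.uniform(-4, 4, size=(10, 2))]   # a few scattered points

clf = IForest(n_estimators=100, contamination=0.1, random_state=42)
clf.fit(X_train)

train_scores = clf.decision_scores_   # higher = more abnormal
train_labels = clf.labels_            # 0 = inlier, 1 = outlier (assumed)
new_scores = clf.decision_function(np.array([[0.0, 0.0], [5.0, 5.0]]))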
Example #9
class IForest(BaseDetector):
    """Wrapper of scikit-learn Isolation Forest with more functionalities.

    The IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum and
    minimum values of the selected feature.
    See :cite:`liu2008isolation,liu2012isolation` for details.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produce shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default="auto")
        The number of samples to draw from X to train each base estimator.

            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
            - If "auto", then `max_samples=min(256, n_samples)`.

        If max_samples is larger than the number of samples provided,
        all samples will be used for all trees (no sampling).

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the threshold
        on the decision function.

    max_features : int or float, optional (default=1.0)
        The number of features to draw from X to train each base estimator.

            - If int, then draw `max_features` features.
            - If float, then draw `max_features * X.shape[1]` features.

    bootstrap : bool, optional (default=False)
        If True, individual trees are fit on random subsets of the training
        data sampled with replacement. If False, sampling without replacement
        is performed.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    behaviour : str, default='old'
        Behaviour of the ``decision_function`` which can be either 'old' or
        'new'. Passing ``behaviour='new'`` makes the ``decision_function``
        change to match other anomaly detection algorithm API which will be
        the default behaviour in the future. As explained in details in the
        ``offset_`` attribute documentation, the ``decision_function`` becomes
        dependent on the contamination parameter, in such a way that 0 becomes
        its natural threshold to detect outliers.

        .. versionadded:: 0.7.0
           ``behaviour`` is added in 0.7.0 for back-compatibility purpose.

        .. deprecated:: 0.20
           ``behaviour='old'`` is deprecated in sklearn 0.20 and will not be
           possible in 0.22.

        .. deprecated:: 0.22
           ``behaviour`` parameter will be deprecated in sklearn 0.22 and
           removed in 0.24.

        .. warning::
            Only applicable for scikit-learn 0.20 and above.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    Attributes
    ----------
    estimators_ : list of DecisionTreeClassifier
        The collection of fitted sub-estimators.

    estimators_samples_ : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator.

    max_samples_ : integer
        The actual number of samples

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """
    def __init__(self,
                 n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 behaviour='old',
                 random_state=None,
                 verbose=0):
        super(IForest, self).__init__(contamination=contamination)
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.behaviour = behaviour
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        # In sklearn 0.20+ new behaviour is added (arg behaviour={'new','old'})
        # to IsolationForest that shifts the location of the anomaly scores
        # noinspection PyProtectedMember
        sklearn_version = _get_sklearn_version()
        if sklearn_version in (20, 21):
            self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                             max_samples=self.max_samples,
                                             contamination=self.contamination,
                                             max_features=self.max_features,
                                             bootstrap=self.bootstrap,
                                             n_jobs=self.n_jobs,
                                             behaviour=self.behaviour,
                                             random_state=self.random_state,
                                             verbose=self.verbose)

        # Do not pass behaviour argument when sklearn version is < 0.20 or >0.21
        else:  # pragma: no cover
            self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                             max_samples=self.max_samples,
                                             contamination=self.contamination,
                                             max_features=self.max_features,
                                             bootstrap=self.bootstrap,
                                             n_jobs=self.n_jobs,
                                             random_state=self.random_state,
                                             verbose=self.verbose)

        self.detector_.fit(X=X, y=None, sample_weight=None)

        # invert decision_scores_; outliers come with higher outlier scores.
        self.decision_scores_ = invert_order(
            self.detector_.decision_function(X))
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # invert outlier scores; outliers come with higher outlier scores
        return invert_order(self.detector_.decision_function(X))

    @property
    def estimators_(self):
        """The collection of fitted sub-estimators.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_

    @property
    def estimators_samples_(self):
        """The subset of drawn samples (i.e., the in-bag samples) for
        each base estimator.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_samples_

    @property
    def max_samples_(self):
        """The actual number of samples.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.max_samples_

    @property
    def feature_importances_(self):
        """The impurity-based feature importance. The higher, the more
        important the feature. The importance of a feature is computed as the
        (normalized) total reduction of the criterion brought by that feature.
        It is also known as the Gini importance.

        .. warning::
            Impurity-based feature importance can be misleading for
            high-cardinality features (many unique values). See
            https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html
            as an alternative.

        Returns
        -------
        feature_importances_ : ndarray of shape (n_features,)
            The values of this array sum to 1, unless all trees are single node
            trees consisting of only the root node, in which case it will be an
            array of zeros.
        """
        check_is_fitted(self)
        all_importances = Parallel(n_jobs=self.n_jobs)(
            delayed(getattr)(tree, "feature_importances_")
            for tree in self.detector_.estimators_
            if tree.tree_.node_count > 1)

        if not all_importances:
            return np.zeros(self.n_features_in_, dtype=np.float64)

        all_importances = np.mean(all_importances, axis=0, dtype=np.float64)
        return all_importances / np.sum(all_importances)
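The threshold_ and labels_ attributes described in the docstring are derived from contamination by _process_decision_scores() in the base class. A minimal sketch of that idea using a simple percentile rule (this is an illustration, not the wrapper's actual implementation):

import numpy as np

def process_decision_scores(decision_scores, contamination):
    # Label the top `contamination` fraction of scores as outliers (1).
    threshold = np.percentile(decision_scores, 100 * (1 - contamination))
    labels = (decision_scores > threshold).astype(int)
    return threshold, labels

scores = np.array([0.05, 0.10, 0.15, 0.20, 0.90])
threshold, labels = process_decision_scores(scores, contamination=0.2)
print(threshold, labels)   # only the 0.90 sample is labelled 1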
# Generate sample data
X_train, y_train, X_test, y_test = \
    generate_data(n_train=n_train,
                  n_test=n_test,
                  n_features=2,
                  contamination=contamination,
                  random_state=42)

# train IsolationForest
clf_name = 'IF'
clf = IsolationForest(random_state=0)
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.predict(X_train)  # binary labels (1: inliers, -1: outliers)
y_train_scores = clf.decision_function(X_train)  # raw scores (the lower, the more abnormal)

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (1 or -1)
y_test_scores = clf.decision_function(X_test)  # raw scores (the lower, the more abnormal)

# Step 2: Determine the cut point
import matplotlib.pyplot as plt
plt.hist(y_test_scores, bins='auto')
plt.title("Histogram with IF Anomaly Scores")
plt.show()

test_scores = pd.DataFrame({'Scores': y_test_scores, 'Labels': y_test_pred})
pd.DataFrame({
    'Outliers': test_scores.groupby('Labels').get_group(-1).Scores,
    'Inliers': test_scores.groupby('Labels').get_group(1).Scores
})
Example #11
class IForest(RandomSplitForest):
    def __init__(self,
                 n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 replace_frac=0.2,
                 random_state=None,
                 verbose=0):
        RandomSplitForest.__init__(self, n_estimators=n_estimators,
                                   max_samples=max_samples,
                                   max_features=max_features,
                                   bootstrap=bootstrap,
                                   n_jobs=n_jobs,
                                   random_state=random_state,
                                   verbose=verbose)
        self.contamination = contamination
        # The fraction of trees replaced when new window of data arrives
        self.replace_frac = replace_frac
        self.ifor = None
        self.estimators_features_ = None
        self.buffer = None
        self.updated = False

    def fit(self, X, y=None, sample_weight=None):
        self.ifor = IsolationForest(n_estimators=self.n_estimators,
                                    max_samples=self.max_samples,
                                    contamination=self.contamination,
                                    max_features=self.max_features,
                                    bootstrap=self.bootstrap,
                                    n_jobs=self.n_jobs,
                                    random_state=self.random_state,
                                    verbose=self.verbose)
        self.ifor.fit(X, y, sample_weight)
        self.estimators_ = self.ifor.estimators_
        self.estimators_features_ = self.ifor.estimators_features_
        self.updated = False

    def _fit(self, X, y, max_samples, max_depth, sample_weight=None):
        raise NotImplementedError("method _fit() not supported")

    def decision_function(self, X):
        if self.updated:
            logger.debug("WARN: The underlying isolation forest was updated and " +
                         "using calling decision_function() on it will likely return inconsistent results.")
        return self.ifor.decision_function(X)

    def supports_streaming(self):
        return True

    def add_samples(self, X, current=True):
        if current:
            raise ValueError("IForest does not support adding to current instance set.")
        if self.buffer is None:
            self.buffer = X
        else:
            self.buffer = np.vstack([self.buffer, X])

    def update_trees_by_replacement(self, X=None, replace_trees=None):
        if X is None:
            X = self.buffer
        if X is None:
            logger.warning("No new data for update")
            return None

        if replace_trees is not None:
            replace_set = set(replace_trees)
            n_new_trees = len(replace_set)
            if n_new_trees > len(self.estimators_):
                raise ValueError("Replacement set is larger than allowed")
            old_tree_indexes_replaced = replace_trees
            old_tree_indexes_retained = np.array([i for i in range(len(self.estimators_)) if i not in replace_set], dtype=int)
        else:
            n_new_trees = int(self.replace_frac * len(self.estimators_))
            old_tree_indexes_replaced = np.arange(0, n_new_trees, dtype=int)
            old_tree_indexes_retained = np.arange(n_new_trees, len(self.estimators_))

        if n_new_trees > 0:
            new_ifor = IsolationForest(n_estimators=n_new_trees,
                                       max_samples=self.max_samples,
                                       contamination=self.contamination,
                                       max_features=self.max_features,
                                       bootstrap=self.bootstrap,
                                       n_jobs=self.n_jobs,
                                       random_state=self.random_state,
                                       verbose=self.verbose)
            new_ifor.fit(X, y=None, sample_weight=None)

            # retain estimators and features
            self.estimators_ = [self.estimators_[i] for i in old_tree_indexes_retained]
            self.estimators_features_ = [self.estimators_features_[i] for i in old_tree_indexes_retained]
            # append the new trees at the end of the list of older trees
            for estimator, features in zip(new_ifor.estimators_, new_ifor.estimators_features_):
                self.estimators_.append(estimator)
                self.estimators_features_.append(features)

            # Now, update the underlying isolation forest
            # NOTE: This might make the model inconsistent
            self.ifor.estimators_ = self.estimators_
            self.ifor.estimators_features_ = self.estimators_features_

            new_estimators = new_ifor.estimators_
        else:
            new_estimators = None

        self.updated = True
        self.buffer = None

        if False:
            logger.debug("IForest update_trees_by_replacement(): n_new_trees: %d, samples: %s" %
                         (n_new_trees, str(X.shape)))

        # we return lists in order to support feature groups in multiview forest (see IForestMultiview)
        return [old_tree_indexes_replaced], [old_tree_indexes_retained], [new_estimators]

    def update_model_from_stream_buffer(self, replace_trees=None):
        return self.update_trees_by_replacement(self.buffer, replace_trees=replace_trees)
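A short usage sketch of the streaming update above (shapes are illustrative and the RandomSplitForest base class is assumed to be importable; replace_frac=0.2 means 20% of the oldest trees get replaced on each update):

import numpy as np

rng = np.random.RandomState(0)
X0 = rng.randn(500, 3)            # initial window
X1 = rng.randn(200, 3) + 0.5      # next window arriving from the stream

forest = IForest(n_estimators=100, replace_frac=0.2, random_state=0)
forest.fit(X0)

# Buffer the new window, then swap out 20% of the oldest trees for
# trees trained on it.
forest.add_samples(X1, current=False)
replaced, retained, new_trees = forest.update_trees_by_replacement()
scores = forest.decision_function(X1)   # note: the model was updated, so these scores may be inconsistent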
Example #12
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
c = plt.scatter(X_outliers[:, 0],
                X_outliers[:, 1],
                c='red',
                s=20,
                edgecolor='k')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
Example #13
    index += 1

print(x_train.shape)
X_train = x_train

rng = np.random.RandomState(42)
isofortrain = IsolationForest(n_estimators=1000,
                              max_samples='auto',
                              contamination=.20,
                              max_features=1,
                              random_state=rng,
                              n_jobs=-1)

isofortrain.fit(X_train)
anomalytrain = isofortrain.decision_function(X_train)
predicttrain = isofortrain.predict(X_train)

len_predictrain = len(predicttrain)
print("len_predictrain", len_predictrain)

num_iforest_diff = 0

for i in predicttrain:
    if i == -1:
        num_iforest_diff += 1

print("num_iforest_diff", num_iforest_diff)

same = 0
index = 0
print(x_value.shape)

# Print shapes
print(x_value.shape)
print(y_value.shape)

#Algorithms used: Isolation Forest and Local Outlier Factor are common anomaly detection methods
random_isolation = IsolationForest(max_samples=len(x_value),
                                   contamination=outlier_value,
                                   random_state=3)
local_outlier = LocalOutlierFactor(n_neighbors=12, contamination=outlier_value)

n_outlier = len(fraudal_count)
#fit and predict
random_isolation.fit(x_value)
score_prediction = random_isolation.decision_function(x_value)
y_predict_isf = random_isolation.predict(x_value)

y_predict_lof = local_outlier.fit_predict(x_value)
score_prediction = local_outlier.negative_outlier_factor_

#Change the value to 0 for valid and 1 for fraudulent cases.
y_predict_isf[y_predict_isf == 1] = 0
y_predict_isf[y_predict_isf == -1] = 1
y_predict_lof[y_predict_lof == 1] = 0
y_predict_lof[y_predict_lof == -1] = 1

n_error_isf = (y_predict_isf != y_value).sum()
n_error_lof = (y_predict_lof != y_value).sum()
print("Error value for Isolation forest ", n_error_isf)
print("Error value for local outlier function ", n_error_lof)
Example #15
    #ocsvm = OneClassSVM(kernel='linear', degree=2, gamma='auto', nu=0.5)
    ocsvm = OneClassSVM(gamma='auto', nu=0.01)

    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
    s_X_iforest = iforest.decision_function(X_train)
    print('LocalOutlierFactor processing...')
    lof.fit(X_train)
    s_X_lof = lof.decision_function(X_train)

    print('OneClassSVM processing...')
    ocsvm.fit(X_train)
    s_X_ocsvm = ocsvm.decision_function(X_train).reshape(1, -1)[0]
    
    s_unif_iforest = iforest.decision_function(unif)
    s_unif_lof = lof.decision_function(unif)
    s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]
    plt.subplot(121)
    
    print("t ist: " ,t)
    print("t_max ist : " , t_max)
Example #16
    def detect(self):
        '''
        Outlier detection with an isolation forest.
        '''
        # Get the preprocessed data
        data = self.preprocess()

        # Outlier detection
        # Create the IsolationForest
        ilf = IsolationForest(
            n_estimators=self.n_estimators,
            n_jobs=-1,  # use all CPU cores
            verbose=self.verbose,
            contamination=self.contamination,  # proportion of outliers
        )

        # Control flow for saving / loading the model
        if self.isSaveModel and self.isLoadModel:
            # isSaveModel = True & isLoadModel = True
            # Train, save the model locally, then continue with prediction
            # Train
            print('Model training...')
            ilf.fit(data)
            # Save the model locally
            print('Saving model to `%s`...' % self.modelname)
            with open(self.modelname, 'wb') as fp:
                pickle.dump(ilf, fp)
        elif self.isSaveModel:
            # isSaveModel = True & isLoadModel = False
            # Train and save the model locally, but do not predict
            # Train
            print('Model training...')
            ilf.fit(data)
            # Save the model locally
            print('Saving model to `%s`...' % self.modelname)
            with open(self.modelname, 'wb') as fp:
                pickle.dump(ilf, fp)
            print('Skipping prediction.')
            return
        elif self.isLoadModel:
            # isSaveModel = False & isLoadModel = True
            # Load the local model, then continue with prediction
            # Load the local model
            print('Loading model from `%s`...' % self.modelname)
            with open(self.modelname, 'rb') as fp:
                ilf = pickle.load(fp)
        else:
            # isSaveModel = False & isLoadModel = False
            # Train without saving the model, then continue with prediction
            # Train
            print('Model training...')
            ilf.fit(data)

        # Predict
        print('Predicting outliers...')
        shape = data.shape[0]
        all_pred = []
        all_score = []
        for i in range(int(shape / self.batch) + 1):
            start = i * self.batch
            end = (i + 1) * self.batch
            batch_test = data[start:end]
            # Predict
            # Return value: +1 means a normal sample, -1 means an anomaly
            pred = ilf.predict(batch_test)
            # Anomaly score for each sample; the lower the score, the more likely an anomaly
            score = ilf.decision_function(batch_test)
            all_pred.extend(pred)
            all_score.extend(score)

        data['timestamp'] = self.origin_data['timestamp']
        data['is_outlier'] = all_pred
        data['outlier_score'] = all_score

        # Convert the output column values
        data['timestamp'] = data['timestamp'].astype('int64')
        data.loc[data.is_outlier == 1, 'is_outlier'] = 0
        data.loc[data.is_outlier == -1, 'is_outlier'] = 1

        print('Writing `%s`...' % self.output_filename)
        data.to_csv(self.output_filename,
                    columns=['timestamp', 'outlier_score', 'is_outlier'],
                    header=True,
                    index=0)
Example #17
pp = PdfPages(plotfolder + 'scatterplots.pdf')
for j, features in enumerate(feature_pairs):
    X, Y = features[0], features[1]
    print(j, 'of', len(feature_pairs))
    pair_features = np.array([INFO[features[0]], INFO[features[1]]]).T
    forest = IsolationForest(
        n_estimators=100,  #max_samples=1000,
        random_state=0,
        contamination=num_outlier / 343546.0  # number of nodes
    )
    fig = scatter_plot(INFO[X], INFO[Y], INFO['IDs'], discription[Y],
                       discription[X],
                       discription[Y] + ' vs ' + discription[X],
                       compare_value[X])
    forest.fit(pair_features)
    scores = forest.decision_function(pair_features[outlier_ids, :])
    rank_list = sorted([(outliers[i], -s) for (i, s) in enumerate(scores)],
                       key=lambda x: x[1],
                       reverse=True)
    rank_matrix.append(rank_list)
pp.close()

print(rank_matrix)

# Runs properly up to this point; why is generate_graph returning nothing?
scaled_matrix, normal_matrix = ranklist.generate_graph(P_val, num_outlier,
                                                       rank_matrix)
plots = plotSpot(budget, scaled_matrix, "SpellOut")
frequencies = generate_frequency_list(plots, scaled_matrix)
for i, plot in enumerate(plots):
    fig = scatter_outliers(plot, INFO['IDs'], frequencies)
    n_samples_train = n_samples // 2

    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    print('--- Fitting the IsolationForest estimator...')
    model = IsolationForest(n_jobs=-1, random_state=random_state)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

    scoring = -model.decision_function(X_test)  # negated so that the higher, the more abnormal

    print("--- Preparing the plot elements...")
    if with_decision_function_histograms:
        fig, ax = plt.subplots(3, sharex=True, sharey=True)
        bins = np.linspace(-0.5, 0.5, 200)
        ax[0].hist(scoring, bins, color='black')
        ax[0].set_title('Decision function for %s dataset' % dat)
        ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data')
        ax[1].legend(loc="lower right")
        ax[2].hist(scoring[y_test == 1], bins, color='r', label='outliers')
        ax[2].legend(loc="lower right")

    # Show ROC Curves
    predict_time = time() - tstart
    fpr, tpr, thresholds = roc_curve(y_test, scoring)
Example #19
def test_behaviour_param():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = IsolationForest(behaviour='old').fit(X_train)
    clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
    assert_array_equal(clf1.decision_function([[2., 2.]]),
                       clf2.decision_function([[2., 2.]]))
Example #20
#print(marks.head(10))
#print(marks)

### for only Maths ###

#model=IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.2), max_features=1.0)
#model.fit(marks[['Mathematics']])

# Prediction
#marks['anomalies_scores_math']=model.decision_function(marks[['Mathematics']])
#marks['anomaly_math']=model.predict(marks[['Mathematics']])

# here, 1 for good data and -1 for bad data
#print(marks)

## for both Eng and Maths ##

model = IsolationForest(n_estimators=100,
                        max_samples='auto',
                        contamination=float(0.2),
                        max_features=1.0)
model.fit(marks[['English', 'Mathematics']])

# Prediction
marks['anomalies_scores_both'] = model.decision_function(
    marks[['English', 'Mathematics']])
marks['anomaly_for_both'] = model.predict(marks[['English', 'Mathematics']])

# here, 1 for good data and -1 for bad data
print(marks)
Example #21
    def insights_model(self, entry_info, repo_id):

        logging.info("Discovering insights for task with entry info: {}\n".format(entry_info))

        """ Collect data """
        base_url = 'http://{}:{}/api/unstable/repo-groups/9999/repos/{}/'.format(
            self.config['api_host'], self.config['api_port'], repo_id)
        
        # Dataframe to hold all endpoint results
        # Subtract configurable amount of time
        begin_date = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - datetime.timedelta(days=self.training_days)
        index = pd.date_range(begin_date, periods=self.training_days, freq='D')
        df = pd.DataFrame(index=index)
        
        # Hit and discover insights for every endpoint we care about
        for endpoint, field in self.metrics.items():
            # Hit endpoint
            url = base_url + endpoint
            logging.info("Hitting endpoint: " + url + "\n")
            try:
                data = requests.get(url=url).json()
            except:
                data = json.loads(json.dumps(requests.get(url=url).text))

            if len(data) == 0:
                logging.info("Endpoint with url: {} returned an empty response. Moving on to next endpoint.\n".format(url))
                continue
                
            if 'date' not in data[0]:
                logging.info("Endpoint {} is not a timeseries, moving to next endpoint.\n".format(endpoint))
                continue
            
            metric_df = pd.DataFrame.from_records(data)
            metric_df.index = pd.to_datetime(metric_df['date'], utc=True).dt.date
            df = df.join(metric_df[field]).fillna(0)
            df.rename(columns={field: "{} - {}".format(endpoint, field)}, inplace=True)

        """ End collect endpoint data """

        # If none of the endpoints returned data
        if df.size == 0:
            logging.info("None of the provided endpoints provided data for this repository. Anomaly detection is 'done'.\n")
            self.register_task_completion(entry_info, repo_id, "insights")
            return

        """ Deletion of old insights """

        # Delete previous insights not in the anomaly_days param
        min_date = datetime.datetime.now() - datetime.timedelta(days=self.anomaly_days)
        logging.info("MIN DATE: {}\n".format(min_date))
        logging.info("Deleting out of date records ...\n")
        delete_record_SQL = s.sql.text("""
            DELETE 
                FROM
                    repo_insights_records
                WHERE
                    repo_id = :repo_id
                    AND ri_date < :min_date
        """)
        result = self.db.execute(delete_record_SQL, repo_id=repo_id, min_date=min_date)

        logging.info("Deleting out of date data points ...\n")
        delete_points_SQL = s.sql.text("""
            DELETE 
                FROM
                    repo_insights
                USING (
                    SELECT ri_metric, ri_field 
                    FROM (
                        SELECT * 
                        FROM repo_insights
                        WHERE ri_fresh = TRUE
                        AND repo_id = :repo_id
                        AND ri_date < :min_date
                    ) old_insights
                ) to_delete
                WHERE repo_insights.ri_metric = to_delete.ri_metric
                AND repo_insights.ri_field = to_delete.ri_field
        """)
        result = self.db.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date)

        # get table values to check for dupes later on
        insight_table_values = self.get_table_values(['*'], ['repo_insights_records'], where_clause="WHERE repo_id = {}".format(repo_id))

        to_model_columns = df.columns[0:len(self.metrics)+1]

        model = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(self.contamination), \
                        max_features=1.0, bootstrap=False, n_jobs=-1, random_state=32, verbose=0)
        model.fit(df[to_model_columns])

        def classify_anomalies(df,metric):
            df = df.sort_values(by='date_col', ascending=False)
            
            # Shift metric values by one date to find the percentage change between current and previous data points
            df['shift'] = df[metric].shift(-1)
            df['percentage_change'] = ((df[metric] - df['shift']) / df[metric]) * 100
            
            # Categorise anomalies as 0 - no anomaly, 1 - low anomaly , 2 - high anomaly
            df['anomaly_class'].loc[df['anomaly_class'] == 1] = 0
            df['anomaly_class'].loc[(df['anomaly_class'] == -1) & (df[metric] != 0) & (df[metric] != 1)] = 2
            max_anomaly_score = df['score'].loc[df['anomaly_class'] == 2].max()
            medium_percentile = df['score'].quantile(0.24)
            df['anomaly_class'].loc[(df['score'] > max_anomaly_score) & (df['score'] <= medium_percentile)] = 1
            return df

        for i, metric in enumerate(to_model_columns):

            # Fit the model to the data returned from the endpoints
            model.fit(df.iloc[:,i:i+1])
            pred = model.predict(df.iloc[:,i:i+1])

            # Create df and adopt previous index from when we called the endpoints
            anomaly_df = pd.DataFrame()
            anomaly_df['date_col'] = df.index
            anomaly_df.index = df.index
            
            # Find decision function to find the score and classify anomalies
            anomaly_df['score'] = model.decision_function(df.iloc[:,i:i+1])
            anomaly_df[metric] = df.iloc[:,i:i+1]
            anomaly_df['anomaly_class'] = pred

            # Get the indexes of outliers in order to compare the metrics with use case anomalies if required
            outliers = anomaly_df.loc[anomaly_df['anomaly_class'] == -1]
            outlier_index = list(outliers.index)
            anomaly_df = classify_anomalies(anomaly_df,metric)

            # Filter the anomaly_df by days we want to detect anomalies
            begin_detection_date = datetime.datetime.now() - datetime.timedelta(days=self.anomaly_days)
            detection_tuples = anomaly_df.index > begin_detection_date
            anomaly_df = anomaly_df.loc[detection_tuples]

            # Make a copy of the df for logging of individual tuples in the repo_insights table
            anomaly_df_copy = anomaly_df.copy()

            # Calculate mean
            mean = anomaly_df[metric].mean()

            # Make columns numeric for argmax to function properly
            for col in anomaly_df.columns:
                anomaly_df[col] = pd.to_numeric(anomaly_df[col])

            # Split into endpoint and field name
            split = metric.split(" - ")

            most_recent_anomaly_date = None
            most_recent_anomaly = None

            insight_count = 0

            while True:

                if anomaly_df.loc[anomaly_df['anomaly_class'] == 2].empty:
                    logging.info("No more anomalies to be found for metric: {}\n".format(metric))
                    break

                next_recent_anomaly_date = anomaly_df.loc[anomaly_df['anomaly_class'] == 2]['anomaly_class'].idxmax()
                logging.info("Next most recent date: \n{}\n".format(next_recent_anomaly_date))
                next_recent_anomaly = anomaly_df.loc[anomaly_df.index == next_recent_anomaly_date]
                logging.info("Next most recent anomaly: \n{}\n{}\n".format(next_recent_anomaly.columns.values, 
                    next_recent_anomaly.values))

                if insight_count == 0:
                    most_recent_anomaly_date = next_recent_anomaly_date
                    most_recent_anomaly = next_recent_anomaly

                # Format numpy 64 date into timestamp
                date64 = next_recent_anomaly.index.values[0]
                ts = (date64 - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
                ts = datetime.datetime.utcfromtimestamp(ts)

                insight_exists = ((insight_table_values['ri_date'] == ts) & \
                    (insight_table_values['ri_metric'] == split[0]) & (insight_table_values['ri_field'] == split[1])).any()

                if not insight_exists:

                    # Insert record in records table and send record to slack bot
                    record = {
                        'repo_id': repo_id,
                        'ri_metric': split[0],
                        'ri_field': split[1],
                        'ri_value': next_recent_anomaly.iloc[0][metric],
                        'ri_date': ts,
                        'ri_score': next_recent_anomaly.iloc[0]['score'],
                        'ri_detection_method': 'Isolation Forest',
                        "tool_source": self.tool_source,
                        "tool_version": self.tool_version,
                        "data_source": self.data_source
                    }
                    result = self.db.execute(self.repo_insights_records_table.insert().values(record))
                    logging.info("Primary key inserted into the repo_insights_records table: {}\n".format(
                        result.inserted_primary_key))
                    self.results_counter += 1

                    # Send insight to Jonah for slack bot
                    self.send_insight(record, abs(next_recent_anomaly.iloc[0][metric] - mean))

                    insight_count += 1
                else:
                    logging.info("Duplicate insight found, skipping insertion. "
                        "Continuing iteration of anomalies...\n")

                anomaly_df = anomaly_df[anomaly_df.index < next_recent_anomaly_date]


            # If no insights for this metric were found, then move onto next metric
            # (since there is no need to insert the endpoint results below)
            if insight_count == 0:
                continue

            # Begin inserting to table to build frontend charts
            for tuple in anomaly_df_copy.itertuples():
                try:
                    # Format numpy 64 date into timestamp
                    date64 = tuple.Index
                    ts = (date64 - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
                    ts = datetime.datetime.utcfromtimestamp(ts)

                    data_point = {
                        'repo_id': repo_id,
                        'ri_metric': split[0],
                        'ri_field': split[1],
                        'ri_value': tuple._3,
                        'ri_date': ts,
                        'ri_fresh': 0 if date64 < most_recent_anomaly_date else 1,
                        'ri_score': most_recent_anomaly.iloc[0]['score'],
                        'ri_detection_method': 'Isolation Forest',
                        "tool_source": self.tool_source,
                        "tool_version": self.tool_version,
                        "data_source": self.data_source
                    }
                    result = self.db.execute(self.repo_insights_table.insert().values(data_point))
                    logging.info("Primary key inserted into the repo_insights table: {}\n".format(
                        result.inserted_primary_key))

                    logging.info("Inserted data point for metric: {}, date: {}, value: {}\n".format(metric, ts, tuple._3))
                except Exception as e:
                    logging.info("error occurred while storing datapoint: {}\n".format(repr(e)))
                    break

        self.register_task_completion(entry_info, repo_id, "insights")
    def detect_anomalies(self, data, **params):
        iso_forest = IsolationForest(verbose=1)
        iso_forest.set_params(**params)
        iso_forest.fit(data)
        # The anomaly score: the lower, the more abnormal.
        return iso_forest.decision_function(data)
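# A minimal, self-contained sketch of how such raw decision_function scores can be
# turned into anomaly flags. The synthetic data and the quantile-based cut-off below
# are illustrative assumptions, not part of the helper above.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
data = np.r_[0.3 * rng.randn(100, 2), rng.uniform(low=-4, high=4, size=(5, 2))]

# Fit and score; lower decision_function values mean more abnormal samples.
scores = IsolationForest(random_state=0).fit(data).decision_function(data)

# Flag the lowest 5% of scores as anomalies (the 5% cut-off is an arbitrary example).
threshold = np.quantile(scores, 0.05)
print(np.where(scores < threshold)[0])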
Example #23
        strong_outlier = False
        try:
            if frequency_tree[0][0] > (
                    4 * float(frequency_tree[1][0])
            ):  # if the highest frequency is more than 4 times the second one, remove it from the training data
                frequency_tree = frequency_tree[1:]
                col = [red]  # classify the highest value as Hot
                strong_outlier = True
        except:
            pass

        # Create the isolation forest model, then train and score.
        clf = IsolationForest(random_state=0,
                              bootstrap=False).fit(frequency_tree)
        outlier_score = clf.decision_function(
            frequency_tree
        )  # predict is also available as a method (outputs 1 or -1)
        outlier_classification = clf.predict(
            frequency_tree)  # -1 is an outlier and 1 is an inlier

        # label the results of the classification
        x_pos = [i for i, _ in enumerate(mutation)]
        # if there is no negative value, blue and green
        outcome_score = sum(
            1 for number in outlier_score
            if number < 0)  # check whether there is a negative value
        outcome_classification = sum(
            1 for number in outlier_classification if number < 0
        )  # check whether the classification agrees, since an outlier score occasionally flips to a negative number

        # color the data according to the outlier score, using the following criteria
bootstrap: boolean parameter, default False; if True, samples are drawn with replacement when building each iTree;
'''

# Set the number of training samples and the outlier fraction
n_samples = 10000
outliers_fraction = 0.25
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)

# // denotes integer division
rng = np.random.RandomState(123)
X = 0.3 * rng.randn(n_inliers // 2, 2)

# Build the normal samples and the outlier samples
X_train = np.r_[X + 2, X - 2]
outliers = rng.uniform(low=-6, high=6, size=(n_outliers, 2))

# Merge the normal samples with the outlier samples
X_train = np.r_[X_train, outliers]

clf = IsolationForest(contamination=outliers_fraction, random_state=2018, n_jobs=-1, behaviour="new")
# predict / fit_predict return whether each sample is an inlier: 1 means inlier, -1 means outlier
y_pred_train = clf.fit_predict(X_train)
pred = np.array(['normal' if i == 1 else 'anomaly' for i in y_pred_train])

# The further the score is below 0, the more likely the sample is an anomaly
scores_pred = clf.decision_function(X_train)
dict_ = {'anomaly_score':scores_pred, 'y_pred':y_pred_train, 'result':pred}
scores = pd.DataFrame(dict_)
print(scores.sample(5))
Example #25
             barmode='group',
             height=400)

fig.update_yaxes(title_text="Model Metrics")
fig.update_layout(title_text="Model Performance")
fig.show()
# -

# ## Feature Selection, resampling and data transformation

# +
#Anomaly Detection
from sklearn.ensemble import IsolationForest
iforest = IsolationForest(n_estimators=100, contamination=0.01)
pred = iforest.fit_predict(X_train_prepared)
score = iforest.decision_function(X_train_prepared)
from numpy import where
anom_index = where(pred == -1)
values = X.iloc[anom_index]

for i in values.index:
    X_train_prepared = X_train_prepared.drop(i)
    y_train = y_train.drop(i)
# -

ctr = len(values)
print("Number of observations dropped = {}".format(ctr))

# +
# Modelling with balanced target
Example #26
w = 30
m = 10
st = time.time()
training_paa = paa.ts_to_PAA(w, m, training_ts_list)
testing_paa = paa.ts_to_PAA(w, m, testing_ts_list)
print("PAA time = {}".format(time.time() - st))
#print (training_paa)
print(training_paa.shape)
t1 = time.time()
IF1 = IsolationForest(max_samples=256, n_estimators=100, contamination=0.01)
IF1.fit(training_paa)
#cont = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5]
cont = [0.01, 0.02, 0.04, 0.08, 0.1, 0.2, 0.4, 0.5]
for val in cont:

    anomaly_score = IF1.decision_function(testing_paa)
    anomaly_score = [0 for x in range(w)] + [z for z in anomaly_score]
    predict_score = predict(anomaly_score, val)
    predict_score = [z for z in predict_score]
    plot_graphs(training_ts_list, testing_ts_list, anomaly_score,
                predict_score)

    plt.subplot(411)
    plt.title('Training Signal')
    plt.xlabel('Instance Number')
    plt.ylabel('Value')
    plt.plot(range(len(training_ts_list[0])), training_ts_list[0], color='b')
    plt.plot(range(len(training_ts_list[1])), training_ts_list[1], color='r')
    plt.plot(range(len(training_ts_list[2])), training_ts_list[2], color='g')

    plt.subplot(412)
Example #27
class IForest(BaseDetector):
    """Wrapper of scikit-learn Isolation Forest with more functionalities.

    The IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum and
    minimum values of the selected feature.
    See :cite:`liu2008isolation,liu2012isolation` for details.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produce shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default="auto")
        The number of samples to draw from X to train each base estimator.

            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
            - If "auto", then `max_samples=min(256, n_samples)`.

        If max_samples is larger than the number of samples provided,
        all samples will be used for all trees (no sampling).

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the threshold
        on the decision function.

    max_features : int or float, optional (default=1.0)
        The number of features to draw from X to train each base estimator.

            - If int, then draw `max_features` features.
            - If float, then draw `max_features * X.shape[1]` features.

    bootstrap : boolean, optional (default=False)
        If True, individual trees are fit on random subsets of the training
        data sampled with replacement. If False, sampling without replacement
        is performed.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    Attributes
    ----------
    estimators_ : list of DecisionTreeClassifier
        The collection of fitted sub-estimators.

    estimators_samples_ : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator.

    max_samples_ : integer
        The actual number of samples

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(IForest, self).__init__(contamination=contamination)
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                         max_samples=self.max_samples,
                                         contamination=self.contamination,
                                         max_features=self.max_features,
                                         bootstrap=self.bootstrap,
                                         n_jobs=self.n_jobs,
                                         random_state=self.random_state,
                                         verbose=self.verbose)
        self.detector_.fit(X=X,
                           y=None,
                           sample_weight=None)

        # invert decision_scores_; outliers come with higher outlier scores.
        self.decision_scores_ = invert_order(
            self.detector_.decision_function(X))
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # invert outlier scores; outliers come with higher outlier scores
        return invert_order(self.detector_.decision_function(X))

    @property
    def estimators_(self):
        """The collection of fitted sub-estimators.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_

    @property
    def estimators_samples_(self):
        """The subset of drawn samples (i.e., the in-bag samples) for
        each base estimator.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_samples_

    @property
    def max_samples_(self):
        """The actual number of samples.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.max_samples_
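# A short sketch of the conventions described in the docstring above, using plain
# scikit-learn: the wrapper inverts decision_function so that higher means more
# abnormal (assuming invert_order simply negates the scores, which is an assumption
# here), and a contamination-style threshold turns the scores into 0/1 labels.
import numpy as np
from sklearn.ensemble import IsolationForest

contamination = 0.1
X = np.r_[0.3 * np.random.RandomState(42).randn(100, 2), [[6.0, 6.0]]]

clf = IsolationForest(random_state=42).fit(X)
decision_scores = -clf.decision_function(X)  # inverted: higher = more abnormal

# threshold_-style cut-off: the top `contamination` fraction of scores become outliers.
threshold = np.percentile(decision_scores, 100 * (1 - contamination))
labels = (decision_scores > threshold).astype(int)  # 1 = outlier, 0 = inlier

# The appended point at (6, 6) should be among the flagged outliers.
print(labels[-1], labels.sum())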
states = ['failed', 'successful']  #only select successful or failed projects
kickstarters = new_dataset[new_dataset.state.isin(states)]

kickstarters = kickstarters.dropna()  #drop empty rows
kickstarters = kickstarters.reset_index(drop=True)

############ Detecting the anomalies in the dataset ##############
model = IsolationForest(n_estimators=100,
                        max_samples='auto',
                        contamination=float(0.01),
                        max_features=1.0,
                        random_state=0)
model.fit(kickstarters[['goal']])

#identify anomalies
kickstarters['score'] = model.decision_function(kickstarters[['goal']])
kickstarters['anomaly'] = model.predict(kickstarters[['goal']])
kickstarters.head(20)

#index of observations that are anomalies
anomaly = kickstarters.loc[kickstarters['anomaly'] == -1]
anomaly_index = list(anomaly.index)

#only keep projects that are not anomalies
kickstarters = kickstarters[~kickstarters.index.isin(anomaly_index)]

############# Calculated fields ##################################

#1 - length of the name of the project
kickstarters['name_length'] = kickstarters['name'].str.len()
Example #29
def test_ioforest(stamp):
    '''
  with open('maintenanceapp/static/net.csv','rb') as file:
      reader = csv.DictReader(file)
      test = [test for test in reader]
  '''
    '''

  with open('test1.csv','rb') as file1:
      reader1 = csv.DictReader(file1)
      test1 = [item for item in reader1]
  '''

    client = InfluxDBClient('localhost', port=8086, database='telegraf')
    #stamp = time.time()
    num1 = int(stamp)
    num2 = num1 - 300
    str1 = 'select "bytes_recv" , "bytes_sent" from net where time >= ' + str(
        num2) + 's and time <= ' + str(num1) + 's and bytes_recv != 0'
    temp = client.query(str1)
    test1 = temp.get_points()

    if len(temp) == 0:
        return jsonify({
            "error code": 416,
            "error message": "Reading data is error"
        }), 416

# thelength = len(test)
    apache = []
    apache2 = []
    origin = []
    apache1 = []
    apache21 = []
    origin1 = []
    time_store = []
    key = 'bytes_recv'
    key2 = 'bytes_sent'
    key3 = 'time'
    i = 0
    '''
  while i < thelength:
      apache.append(test[i][key])
      apache2.append(test[i][key2])
      time_store.append(test[i][key3])

      if i!=0:
        train_req = float(apache[i])- float(apache[i-1])
        train_sec = float(apache2[i]) - float(apache2[i-1])
        origin.append([train_req/1024000,train_sec/1024000])
      i += 1
  '''
    '''
  while j < thelength1:
      apache1.append(test1[j][key])
      apache21.append(test1[j][key2])
      origin1.append([float(apache1[j])/1024,float(apache21[j])])
      j += 1
  '''
    j = 0

    for item in test1:
        apache1.append(item[u'bytes_recv'])
        apache21.append(item[u'bytes_sent'])
        time_store.append(item[u'time'])
        if j != 0:
            test_req = float(apache1[j]) - float(apache1[j - 1])
            test_sec = float(apache21[j]) - float(apache21[j - 1])
            origin1.append([test_req / 10240, test_sec / 10240])
        j += 1

# train = np.array(origin)
    ceshi = np.array(origin1)
    #print(np.shape(ceshi))

    rng = np.random.RandomState(42)
    clf = IsolationForest(max_samples=300, random_state=rng)

    clf.fit(ceshi)
    anomaly_score = clf.decision_function(ceshi)
    #print(anomaly_score)

    bad_domains = []
    out_point1 = []

    threshold = -0.15
    i = 0
    count = 0

    for item in anomaly_score:
        if item < threshold:
            bad_domains.append(time_store[i])
            out_point1.append(origin1[i])
            count += 1
        i += 1

    out_point = np.zeros(shape=(len(out_point1), 2))
    out_point = np.array(out_point1)

    #print(out_point)

    if len(out_point) != 0:

        b2 = plt.scatter(ceshi[:, 0],
                         ceshi[:, 1],
                         c='black',
                         s=20,
                         edgecolor='k')
        b1 = plt.scatter(out_point[:, 0],
                         out_point[:, 1],
                         c='red',
                         s=20,
                         edgecolor='k')
        xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        plt.axis('tight')
        plt.xlim((-500, 500))
        plt.ylim((-500, 500))

        plt.legend([b2, b1], ["test data", "out point"], loc="upper left")
    else:
        b2 = plt.scatter(ceshi[:, 0],
                         ceshi[:, 1],
                         c='black',
                         s=20,
                         edgecolor='k')
        xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        plt.axis('tight')
        plt.xlim((-500, 500))
        plt.ylim((-500, 500))

    filename = 'ioforest' + str(stamp) + ".png"
    #plt.savefig(filename)

    plt.savefig('maintenanceapp/static/' + filename)
    os.system("cd maintenanceapp/static && python test.py")

    return jsonify({
        'count': count,
        'time': bad_domains,
        'filename': filename
    })
class IsoForest(object):
    def __init__(self,
                 dataset,
                 n_estimators=100,
                 max_samples='auto',
                 contamination=0.1,
                 **kwargs):

        # load dataset
        load_dataset(self, dataset)

        # initialize
        self.isoForest = None
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.contamination = contamination
        self.initialize_isoForest(seed=self.data.seed, **kwargs)

        # train and test time
        self.clock = 0
        self.clocked = 0
        self.train_time = 0
        self.test_time = 0

        # Scores and AUC
        self.diag = {}

        self.diag['train'] = {}
        self.diag['val'] = {}
        self.diag['test'] = {}

        self.diag['train']['scores'] = np.zeros((len(self.data._y_train), 1))
        self.diag['val']['scores'] = np.zeros((len(self.data._y_val), 1))
        self.diag['test']['scores'] = np.zeros((len(self.data._y_test), 1))

        self.diag['train']['auc'] = np.zeros(1)
        self.diag['val']['auc'] = np.zeros(1)
        self.diag['test']['auc'] = np.zeros(1)

        self.diag['train']['acc'] = np.zeros(1)
        self.diag['val']['acc'] = np.zeros(1)
        self.diag['test']['acc'] = np.zeros(1)

        # AD results log
        self.ad_log = AD_Log()

        # diagnostics
        self.best_weight_dict = None  # attribute to reuse nnet plot-functions

    def initialize_isoForest(self, seed=0, **kwargs):

        self.isoForest = IsolationForest(n_estimators=self.n_estimators,
                                         max_samples=self.max_samples,
                                         contamination=self.contamination,
                                         n_jobs=-1,
                                         random_state=seed,
                                         **kwargs)

    def load_data(self, data_loader=None, pretrain=False):

        self.data = data_loader()

    def start_clock(self):

        self.clock = time.time()

    def stop_clock(self):

        self.clocked = time.time() - self.clock
        print("Total elapsed time: %g" % self.clocked)

    def train(self):

        if self.data._X_train.ndim > 2:
            X_train_shape = self.data._X_train.shape
            X_train = self.data._X_train.reshape(X_train_shape[0], -1)
        else:
            X_train = self.data._X_train

        print("Starting training...")
        self.start_clock()

        self.isoForest.fit(X_train.astype(np.float32))

        self.stop_clock()
        self.train_time = self.clocked

    def predict(self, which_set='train'):

        assert which_set in ('train', 'test')

        if which_set == 'train':
            X = self.data._X_train
            y = self.data._y_train
        if which_set == 'test':
            X = self.data._X_test
            y = self.data._y_test

        # reshape to 2D if input is tensor
        if X.ndim > 2:
            X_shape = X.shape
            X = X.reshape(X_shape[0], -1)

        print("Starting prediction...")
        self.start_clock()

        scores = (-1.0) * self.isoForest.decision_function(
            X.astype(np.float32))  # compute anomaly score
        y_pred = (self.isoForest.predict(X.astype(np.float32))
                  == -1) * 1  # get prediction

        self.diag[which_set]['scores'][:, 0] = scores.flatten()
        self.diag[which_set]['acc'][0] = 100.0 * sum(y == y_pred) / len(y)

        if sum(y) > 0:
            auc = roc_auc_score(y, scores.flatten())
            self.diag[which_set]['auc'][0] = auc

        self.stop_clock()
        if which_set == 'test':
            self.test_time = self.clocked

    def dump_model(self, filename=None):

        dump_isoForest(self, filename)

    def load_model(self, filename=None):

        assert filename and os.path.exists(filename)

        load_isoForest(self, filename)

    def log_results(self, filename=None):
        """
        log the results relevant for anomaly detection
        """

        self.ad_log['train_auc'] = self.diag['train']['auc'][-1]
        self.ad_log['train_accuracy'] = self.diag['train']['acc'][-1]
        self.ad_log['train_time'] = self.train_time

        self.ad_log['test_auc'] = self.diag['test']['auc'][-1]
        self.ad_log['test_accuracy'] = self.diag['test']['acc'][-1]
        self.ad_log['test_time'] = self.test_time

        self.ad_log.save_to_file(filename=filename)
Example #31
def main():

    train_age = np.random.randint(18, 60, [1000, 1])
    train_salary = np.random.randint(30, 90, [1000, 1])
    #sex = np.random.randint(1,3,[100,1])
    train = np.concatenate((train_age, train_salary), axis=1)

    test_age = np.random.randint(18, 60, [100, 1])
    test_salary = np.random.randint(30, 90, [100, 1])
    #sex = np.random.randint(1,3,[100,1])
    test = np.concatenate((test_age, test_salary), axis=1)

    outliers_age = np.random.randint(1, 10, [100, 1])
    outliers_salary = np.random.randint(10, 20, [100, 1])
    #sex = np.random.randint(1,3,[100,1])
    outliers = np.concatenate((outliers_age, outliers_salary), axis=1)

    outliers1_age = np.random.randint(61, 100, [100, 1])
    outliers1_salary = np.random.randint(100, 200, [100, 1])
    #sex = np.random.randint(1,3,[100,1])
    outliers1 = np.concatenate((outliers1_age, outliers1_salary), axis=1)

    clf = IsolationForest(max_samples=100, contamination=0.01)
    clf.fit(train)

    Z = clf.predict(train)
    z_neg = np.zeros(shape=(1, 2))
    for i in range(0, len(Z)):
        if (Z[i] < 0):
            z_neg = np.row_stack((z_neg, train[i]))
    z_neg = np.delete(z_neg, 0, axis=0)

    Z1 = clf.predict(test)
    z1_neg = np.zeros(shape=(1, 2))
    for i in range(0, len(Z1)):
        if (Z1[i] < 0):
            z1_neg = np.row_stack((z1_neg, test[i]))
    z1_neg = np.delete(z1_neg, 0, axis=0)

    Z2 = clf.predict(outliers)
    z2_neg = np.zeros(shape=(1, 2))
    for i in range(0, len(Z2)):
        if (Z2[i] < 0):
            z2_neg = np.row_stack((z2_neg, outliers[i]))
    z2_neg = np.delete(z2_neg, 0, axis=0)

    Z3 = clf.predict(outliers1)
    z3_neg = np.zeros(shape=(1, 2))
    for i in range(0, len(Z3)):
        if (Z3[i] < 0):
            z3_neg = np.row_stack((z3_neg, outliers1[i]))
    z3_neg = np.delete(z3_neg, 0, axis=0)

    xx, yy = np.meshgrid(np.linspace(1, 100, 50), np.linspace(1, 200, 50))
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.title("IsolationForest")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

    b1 = plt.scatter(z_neg[:, 0], z_neg[:, 1], c='red')
    b2 = plt.scatter(z1_neg[:, 0], z1_neg[:, 1], c='green')
    c = plt.scatter(z2_neg[:, 0], z2_neg[:, 1], c='blue')
    d = plt.scatter(z3_neg[:, 0], z3_neg[:, 1], c='black')
    plt.show()
Example #32
    if i in [1, 2, 3, 8, 9, 10]:
        labels_val[index] = 1

# fit the model

param = {}
for max_features_ in range(1, 12):
    for contamination_ in np.arange(0, 0.2, 0.01):
        clf = IsolationForest(n_estimators=300,
                              contamination=contamination_,
                              max_samples=32,
                              bootstrap=False,
                              max_features=max_features_)
        clf.fit(X_train)

        val_score = clf.decision_function(X_val)

        x, y, threshold = roc_curve(labels_val, -val_score)
        a = auc(x, y)
        print('max_features:%s  contamination:%s  auc:%s' %
              (max_features_, contamination_, a))
        param[a] = (max_features_, contamination_)

best_model = list(param.keys())[np.argmax(list(param.keys()))]
best_param = param[best_model]
print('best:auc:%s   contamination:%s   max_features:%s' %
      (best_model, best_param[1], best_param[0]))

clf = IsolationForest(n_estimators=300,
                      contamination=best_param[1],
                      max_samples=256,
Example #33
def computeDiff_RF(ntrees=1024, sample_size_ratio=.33, alpha0=.1):
    # load data
    f = open('PKL/donnutsDataProblem.pkl', 'rb')
    [Xn, Xnt, Xa, Xb, Xab] = pickle.load(f)
    f.close()
    
    if sample_size_ratio >1:
        sample_size=sample_size_ratio
    else:
        sample_size=int(sample_size_ratio*len(Xn))

    xn=Xn[:,0]
    yn=Xn[:,1]
    xa=Xa[:,0]
    ya=Xa[:,1]

    xb=Xb[:,0]
    yb=Xb[:,1]

    pathlib.Path('./FIG').mkdir(parents=True, exist_ok=True) 
    # plotting the donnuts data
    plt.figure(1)
    plt.plot(xn, yn, 'bo', markersize=10)
    plt.savefig('FIG/clustersDonnuts0.pdf')

    nn=len(Xa)
    plt.figure(2)
    plt.plot(xn, yn, 'bo', xa[0:nn], ya[0:nn], 'rs')
    plt.savefig('FIG/clustersDonnuts1.pdf')

    plt.figure(3)
    plt.plot(xn, yn, 'bo', xa[0:nn], ya[0:nn], 'rs', xb[0:nn], yb[0:nn], 'gd')
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.savefig('FIG/clustersDonnuts2.pdf')

    # Creating Forest on normal data + anomalies labels
    print('building the Diff_RF ...')

    diff_rf = DiFF_TreeEnsemble(sample_size=sample_size, n_trees=ntrees)    # load data
    fit_start = time.time()
    diff_rf.fit(Xn, n_jobs=8)
    fit_stop = time.time()
    fit_time = fit_stop - fit_start
    print(f"fit time {fit_time:3.2f}s")
    n_nodes = sum([t.n_nodes for t in diff_rf.trees])
    print(f"{n_nodes} total nodes in {ntrees} trees")
    
    XT=np.concatenate([Xnt,Xab])
    
    sc_di,sc_ff,sc_diff_rf = diff_rf.anomaly_score(XT,alpha=alpha0)
    sc_diff_rf=np.array(sc_diff_rf)
    sc_ff=np.array(sc_ff)
    sc_di=np.array(sc_di)
    sc_ff=(sc_ff-sc_ff.min())/(sc_ff.max()-sc_ff.min())
    sc_di=(sc_di-sc_di.min())/(sc_di.max()-sc_di.min())
    sc_diff_rf=(sc_diff_rf-sc_diff_rf.min())/(sc_diff_rf.max()-sc_diff_rf.min())

    plt.figure(1000)
    xn=XT[:,0]
    yn=XT[:,1]
    plt.scatter(xn, yn, marker='o', c=sc_ff, cmap='viridis')
    plt.colorbar()
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.title('DiFF_RF (visiting frequency score) Heat Map')
    plt.savefig('FIG/HeatMap_DiFF_RF_freqScore.pdf')
    
    plt.figure(1001)
    xn=XT[:,0]
    yn=XT[:,1]
    plt.scatter(xn, yn, marker='o', c=sc_diff_rf, cmap='viridis')
    plt.colorbar()
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.title('DiFF_RF (collective anomaly score) Heat Map')
    plt.savefig('FIG/HeatMap_DiFF_RF_collectiveScore.pdf')
    
    plt.figure(1002)
    xn=XT[:,0]
    yn=XT[:,1]
    plt.scatter(xn, yn, marker='o', c=(sc_di), cmap='viridis')
    plt.colorbar()
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.title('DiFF_RF (point-wise anomaly score) Heat Map')
    plt.savefig('FIG/HeatMap_DiFF_RF_pointWiseScore.pdf')

    cif = IsolationForest(n_estimators=ntrees, max_samples=sample_size, bootstrap=False, n_jobs=12)
    cif.fit(Xn)
    sc_if = -cif.decision_function(XT)
    sc_if=(sc_if-sc_if.min())/(sc_if.max()-sc_if.min())
    plt.figure(1003)
    xn=XT[:,0]
    yn=XT[:,1]
    plt.scatter(xn, yn, marker='o', c=sc_if, cmap='viridis')
    plt.colorbar()
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.title('Isolation Forest Heat Map')
    plt.savefig('FIG/HeatMap_IF.pdf')
    plt.show()
    
    y_true = np.array([-1] * len(Xnt) + [1] * len(Xab))
    fpr_IF, tpr_IF, thresholds = roc_curve(y_true, sc_if)
    aucIF=auc(fpr_IF, tpr_IF)
    fpr_D, tpr_D, thresholds = roc_curve(y_true, sc_di)
    aucD=auc(fpr_D, tpr_D)
    fpr_F, tpr_F, thresholds = roc_curve(y_true, sc_ff)
    aucF=auc(fpr_F, tpr_F)
    fpr_DF, tpr_DF, thresholds = roc_curve(y_true, sc_diff_rf)
    aucDF=auc(fpr_DF, tpr_DF)
    print("Isolation Forest AUC=", aucIF)
    print("DiFF_RF (point-wise anomaly score) AUC=", aucD)
    print("DiFF_RF (frequency of visit scoring only) AUC=", aucF)
    print("DiFF_RF (collective anomaly score) AUC=", aucDF)
Example #34
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)

X = 0.3 * rng.randn(n_inliers // 2, 2)
X_train = np.r_[X + 2, X - 2]  # normal samples
X_train = np.r_[X_train,
                np.random.uniform(low=-6, high=6,
                                  size=(n_outliers, 2))]  # normal samples plus outlier samples

# fit the model
clf = IsolationForest(max_samples=n_samples,
                      random_state=rng,
                      contamination=outliers_fraction)
clf.fit(X_train)
# y_pred_train = clf.predict(X_train)
scores_pred = clf.decision_function(X_train)
threshold = stats.scoreatpercentile(
    scores_pred, 100 * outliers_fraction)  # threshold derived from the training outlier fraction; used for plotting

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-7, 7, 50), np.linspace(-7, 7, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
# plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
plt.contourf(xx,
             yy,
             Z,
             levels=np.linspace(Z.min(), threshold, 7),
             cmap=plt.cm.Blues_r)  # shade the anomalous region: values from the minimum up to the threshold
Example #35
    lof = LocalOutlierFactor(n_neighbors=20)
    ocsvm = OneClassSVM()

    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
    s_X_iforest = iforest.decision_function(X_test)
    print('LocalOutlierFactor processing...')
    lof = LocalOutlierFactor(n_neighbors=20)
    lof.fit(X_train)
    s_X_lof = lof.decision_function(X_test)
    print('OneClassSVM processing...')
    ocsvm = OneClassSVM()
    ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
    s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]
    s_unif_iforest = iforest.decision_function(unif)
    s_unif_lof = lof.decision_function(unif)
    s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]
    plt.subplot(121)
    auc_iforest, em_iforest, amax_iforest = em(t, t_max,
                                               volume_support,
                                               s_unif_iforest,
Example #36
def lookout(args):
    # Number of plots to choose
    BUDGET = args.budget

    time = -timer()
    # Load dataset
    if args.dataset == 0:
        full_df = pd.read_csv("HTRU_2.csv")
    else:
        full_df = pd.read_csv("HTRU_2_filtered.csv")

    # Isolate outliers and inliers
    # Points to later be drawn in BLACK
    inlier_df = full_df.loc[full_df['Class'] == 0]
    outlier_df = full_df.loc[full_df['Class'] == 1].reset_index().drop(
        ['index'], axis=1)

    # Remove target column, to not get mixed as a feature
    full_df.drop(columns=['Class'], inplace=True)
    inlier_df.drop(columns=['Class'], inplace=True)
    outlier_df.drop(columns=['Class'], inplace=True)

    # Get all available features and combine them 2 by 2
    all_features = list(full_df.columns)
    feature_pairs = list(ncr(all_features, 2))

    # Matrix with scores for all outliers on all feature-pair plots (row = plot, column = outlier)
    scores = None
    # Isolation Forest instance used to train and score outliers
    classifier = IF()
    for feature_pair in feature_pairs:
        # Model for current feature pair
        classifier.fit(full_df[list(feature_pair)])
        scores = np.array([classifier.decision_function(outlier_df[list(feature_pair)]).tolist()]) if scores is None \
            else np.append(scores, [classifier.decision_function(outlier_df[list(feature_pair)]).tolist()], axis=0)

    # In Isolation Forest, negative scores indicate outliers and positive scores indicate inliers; the original range is [-0.5, 0.5].
    # To ensure greedy approximation optimality we need a non-negative range, i.e. scores converted to [0, 1].
    # To do this, we flip the sign (so negatives become positives and outliers get higher scores) and add 0.5.
    transform_range = np.vectorize(lambda x: 0.5 - x)
    scores = transform_range(scores)

    # Plot selection using greedy heuristic approach (see paper for proof of near optimality)
    S = []  # Final plot selection
    while BUDGET > 0:
        # Only pairs that have not been selected already
        candidate_pairs = list(set(feature_pairs) - set(S))
        candidate_pairs_marginal_gains = []
        for candidate_pair in candidate_pairs:
            # Marginal gain of current feature pair
            candidate_pairs_marginal_gains.append(
                get_marginal_gain(S, candidate_pair, feature_pairs, scores,
                                  args))

        # Get max marginal gain, its index and retrieve respective feature pair
        S.append(candidate_pairs[candidate_pairs_marginal_gains.index(
            max(candidate_pairs_marginal_gains))])
        BUDGET = BUDGET - 1

    time = time + timer()

    print("Final selection: {}".format(S))
    print("Execution time: {0:.2f}s".format(time))
    print("Incrimination: {}".format(
        get_incrimination(S, feature_pairs, scores)))

    # Actual Plotting
    # Tuple of (best_outliers, other_outliers) for each feature pair; IDS ONLY! MUST RETRIEVE FROM OUTLIER DATAFRAME
    outlier_points = []
    # For each selected plot, obtain list of outliers that are best explained by that feature pair (to be drawn in RED)
    # Remaining outliers to be drawn in BLUE
    for feature_pair in S:
        feature_pair_row_idx = get_row_indices([feature_pair],
                                               feature_pairs)[0]
        outliers_max_plot_scores = np.max(scores, axis=0)
        feature_pair_plot_scores = scores[feature_pair_row_idx]
        # Returns boolean array checking if float values are close enough to be considered true
        score_comparison = np.isclose(feature_pair_plot_scores,
                                      outliers_max_plot_scores)
        # IDs (in outliers dataframe) of outliers best explained by this feature pair
        best_outliers_ids = list(
            map(lambda x: x[0],
                filter(lambda y: y[1], enumerate(score_comparison.tolist()))))
        # shape property is a fast, safe way to extract number of rows in dataframe (x-shape)
        remaining_outliers_ids = list(
            set(range(outlier_df.shape[0])) - set(best_outliers_ids))
        outlier_points.append((best_outliers_ids, remaining_outliers_ids))

    # Plotting the chosen features
    for feature_pair, outliers_p in zip(S, outlier_points):
        # Adding inliers
        plot_df = inlier_df.copy()
        plot_df = plot_df[list(feature_pair)]
        plot_df['class'] = 'inlier'
        plot_df['point_size'] = 25

        # Other outliers
        other_outliers = outlier_df.iloc[outliers_p[1]]
        other_outliers = other_outliers[list(feature_pair)]
        other_outliers['class'] = 'other'
        other_outliers['point_size'] = 25

        # Explained outliers
        best_outliers = outlier_df.iloc[outliers_p[0]]
        best_outliers = best_outliers[list(feature_pair)]
        best_outliers['class'] = 'best'
        best_outliers['point_size'] = 35

        # Joining all the dataframes
        plot_df = pd.concat([plot_df, best_outliers, other_outliers])

        # Actual Plotting
        f, ax = plt.subplots(figsize=(6.5, 6.5))
        sns.scatterplot(x=feature_pair[0],
                        y=feature_pair[1],
                        hue="class",
                        size="point_size",
                        palette=get_palette(plot_df),
                        linewidth=1,
                        legend='full',
                        alpha=0.7,
                        edgecolor='black',
                        data=plot_df,
                        ax=ax)
        plt.autoscale(True)

        # Saving plots
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        plt.savefig('%s/%s_%s.png' % (args.output_dir, *feature_pair))
Example #37
            X_test = X[n_samples_train:, :]
            y_train = y[:n_samples_train]
            y_test = y[n_samples_train:]

            # # training only on normal data:
            # X_train = X_train[y_train == 0]
            # y_train = y_train[y_train == 0]

            print('IsolationForest processing...')
            model = IsolationForest()
            tstart = time()
            model.fit(X_train)
            fit_time += time() - tstart
            tstart = time()

            scoring = -model.decision_function(X_test)  # the lower, the more normal
            predict_time += time() - tstart
            fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

            if predict_time + fit_time > max_time:
                raise TimeoutError

            f = interp1d(fpr_, tpr_)
            tpr += f(x_axis)
            tpr[0] = 0.

            precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]

            # cluster: old versions of scipy -> interp1d needs sorted x_input
            arg_sorted = recall_.argsort()
            recall_ = recall_[arg_sorted]
Example #38
import numpy as np
import matplotlib.pyplot as plt
#get_ipython().magic(u'matplotlib inline')

# ### 1D

# In[2]:

isolation_forest = IsolationForest()

data = np.concatenate(
    (np.random.normal(size=100), np.random.normal(loc=5., size=100)))
isolation_forest.fit(data.reshape(-1, 1))

xx = np.linspace(-4, 10, 1000)
plt.plot(xx, isolation_forest.decision_function(xx.reshape(-1, 1)))
plt.hist(data, density=True)

# ### 2D

# In[67]:

X = np.random.randn(8000, 2)

# In[68]:

isolation_forest = IsolationForest(n_estimators=15)
isolation_forest.fit(X)

# In[70]:
def find_anomalies_with_shingles(dataset,
                                 data,
                                 window_size=5,
                                 skip_size=None,
                                 ad_type="ifor",
                                 normalize_trend=False,
                                 n_top=10,
                                 outliers_fraction=0.1,
                                 log_transform=False):
    """ Finds anomalous regions in time series using standard unsupervised detectors

    First the time series is chopped up into windows ('shingles').
    Then, a standard anomaly detector is run.
    """
    x = w = None
    n = 0
    ts_data = data

    if log_transform:
        # log-transform now since the values are positive (in context of
        # many real-world datasets line airline); otherwise, values become
        # negative after de-trending
        ts_data = log_transform_series(ts_data, eps=1.0)

    if normalize_trend:
        # remove trend from series
        ts_data = difference_series(ts_data)

    ts = TSeries(ts_data, y=None)
    for x_, _, w in ts.get_shingles(window_size,
                                    skip_size=skip_size,
                                    batch_size=-1):
        x = np.reshape(x_, newshape=(x_.shape[0], -1))
        n = x.shape[0]
        logger.debug("Total instances: %d" % n)
        # logger.debug("Windows:\n%s" % str(w))

    if False:
        feature_ranges = get_sample_feature_ranges(x)
        logger.debug("feature_ranges:\n%s" % str(feature_ranges))

    scores = None
    if ad_type == "ocsvm":
        ad = svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)
        ad.fit(x)
        scores = -ad.decision_function(x).reshape((n, ))
    elif ad_type == "ifor":
        ad = IsolationForest(max_samples=min(256, x.shape[0]),
                             contamination=outliers_fraction,
                             random_state=None)
        ad.fit(x)
        scores = -ad.decision_function(x)
    elif ad_type == "lof":
        ad = LocalOutlierFactor(n_neighbors=35,
                                contamination=outliers_fraction)
        ad.fit(x)
        scores = -ad._decision_function(x)
    elif ad_type == "autoenc":
        n_hiddens = max(1, window_size // 2)
        ad = AutoencoderAnomalyDetector(
            n_inputs=x.shape[1],
            n_neurons=[300, n_hiddens, 300],
            normalize_scale=True,
            activations=[tf.nn.tanh, tf.nn.tanh, tf.nn.tanh, None])
        ad.fit(x)
        scores = -ad.decision_function(x)

    top_anoms = np.argsort(-scores)[0:n_top]
    logger.debug("top scores (%s):\n%s\n%s" %
                 (ad_type, str(top_anoms), str(scores[top_anoms])))

    pdfpath = "temp/timeseries/timeseries_shingles_%s_w%d%s_%s.pdf" % \
              (dataset, window_size, "" if not log_transform else "_log", ad_type)
    dp = DataPlotter(pdfpath=pdfpath, rows=2, cols=1)

    # plot the timeseries anomalies with the detrended series
    pl = dp.get_next_plot()
    pl.set_xlim([0, ts.samples.shape[0]])
    pl.plot(np.arange(0, ts.samples.shape[0]), ts.samples, 'b-', linewidth=0.5)

    for i in top_anoms:
        if w[i] + window_size <= len(ts.samples):
            pl.plot(np.arange(w[i], w[i] + window_size),
                    ts.samples[w[i]:(w[i] + window_size)], 'r-')

    if normalize_trend:
        # plot the original series with anomalous windows
        pl = dp.get_next_plot()
        pl.set_xlim([0, data.shape[0]])
        pl.plot(np.arange(0, data.shape[0]), data, 'b-', linewidth=0.5)

        for i in top_anoms:
            if w[i] + window_size <= len(data):
                pl.plot(np.arange(w[i], w[i] + window_size),
                        data[w[i]:(w[i] + window_size)], 'r-')

    dp.close()
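# A minimal, self-contained sketch of the shingling idea described in the docstring
# above: chop a 1-D series into overlapping windows and score the windows with
# IsolationForest. The window construction is an assumption about what the
# TSeries.get_shingles helper produces; de-trending and plotting are omitted.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
series = np.sin(np.linspace(0, 20, 400)) + 0.05 * rng.randn(400)
series[200:205] += 3.0  # inject an anomalous burst

window_size = 5
# One row per starting position: overlapping windows ("shingles").
shingles = np.array([series[i:i + window_size]
                     for i in range(len(series) - window_size + 1)])

ad = IsolationForest(max_samples=min(256, shingles.shape[0]),
                     contamination=0.1, random_state=0)
ad.fit(shingles)
scores = -ad.decision_function(shingles)  # higher = more anomalous

# Starting indices of the ten most anomalous windows (expected to cluster near 200).
print(np.argsort(-scores)[:10])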
Example #40
    if_auc_std = np.zeros(shape=(n_iterations, ), dtype=np.float32)
    sf_auc_std = np.zeros(shape=(n_iterations, ), dtype=np.float32)
    if_precison_std = np.zeros(shape=(n_iterations, ), dtype=np.float32)
    sf_precision_std = np.zeros(shape=(n_iterations, ), dtype=np.float32)
    if_recall_std = np.zeros(shape=(n_iterations, ), dtype=np.float32)
    sf_recall_std = np.zeros(shape=(n_iterations, ), dtype=np.float32)
    if_f1_std = np.zeros(shape=(n_iterations, ), dtype=np.float32)
    sf_f1_std = np.zeros(shape=(n_iterations, ), dtype=np.float32)

    # run
    for i in range(n_iterations):
        print(f'{dataset}, {i+1} / {n_iterations}')
        IF = IsolationForest()
        IF.fit(X_train, y_train)
        if_pred = IF.decision_function(X_test)
        if_auc[i] = roc_auc_score(y_test, if_pred)

        if_class_pred = np.ones_like(if_pred)
        if_class_pred[if_pred <= 0.0] = -1
        if_precison[i] = precision_score(y_test, if_class_pred)
        if_recall[i] = recall_score(y_test, if_class_pred)
        if_f1[i] = f1_score(y_test, if_class_pred)

        sf = IsolationSimilarityForest(**params)
        sf.fit(X_train, y_train)
        sf_pred = sf.decision_function(X_test)
        sf_auc[i] = roc_auc_score(y_test, sf_pred)

        sf_class_pred = np.ones_like(sf_pred)
        sf_class_pred[sf_pred <= 0.0] = -1