Example 1
    def fit(self, X):
        # NOTE: this relies on a modified IsolationForest whose fit()
        # accepts a max_depth argument and which exposes a
        # _compute_actual_depth_leaf() method; stock scikit-learn
        # provides neither.
        np.random.seed(self.random_state)
        self.n_sample = X.shape[0]

        x_arr, y_arr = [], []
        for i in np.arange(self.lower_bound, self.higher_bound):
            # Draw a bootstrap subsample of size 2**i.
            sample_size = 2**i
            sample = X[np.random.choice(self.n_sample,
                                        sample_size,
                                        replace=True)]
            # Grow effectively unrestricted trees on the subsample.
            clf = IsolationForest(random_state=self.random_state,
                                  max_samples=sample_size,
                                  contamination='auto').fit(
                                      sample, max_depth=100000000)
            # Mean leaf depth of each point, averaged over the forest.
            depths = np.mean(clf._compute_actual_depth_leaf(sample)[0], axis=0)

            # Locate the modal depth with a unit-width histogram;
            # x[:-1] keeps the left bin edges.
            bins = np.arange(int(depths.min()), int(depths.max() + 2))
            y, x = np.histogram(depths, bins=bins)
            y, x = y + 1, x[:-1]
            break_point = np.argmax(y)

            # Collect (log2 of sample size, modal depth) pairs.
            x_arr.append([i])
            y_arr.append(x[break_point])

        # Model modal depth as linear in log2(sample size), no intercept.
        self.reg = LinearRegression(fit_intercept=False).fit(x_arr, y_arr)
        # Final forest on the full data with the user-chosen depth cap.
        self.clf = IsolationForest(random_state=self.random_state,
                                   max_samples=len(X),
                                   contamination='auto').fit(
                                       X, max_depth=self.max_depth)

        return self
Example 2
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LinearRegression


class globOut:
    def __init__(self,
                 lower_bound=10,
                 higher_bound=13,
                 max_depth=8,
                 random_state=0):
        self.lower_bound = lower_bound
        self.higher_bound = higher_bound
        self.max_depth = max_depth
        self.random_state = random_state

    def fit(self, X):
        # NOTE: this relies on a modified IsolationForest whose fit()
        # accepts a max_depth argument and which exposes a
        # _compute_actual_depth_leaf() method; stock scikit-learn
        # provides neither.
        np.random.seed(self.random_state)
        self.n_sample = X.shape[0]

        x_arr, y_arr = [], []
        for i in np.arange(self.lower_bound, self.higher_bound):
            # Draw a bootstrap subsample of size 2**i.
            sample_size = 2**i
            sample = X[np.random.choice(self.n_sample,
                                        sample_size,
                                        replace=True)]
            # Grow effectively unrestricted trees on the subsample.
            clf = IsolationForest(random_state=self.random_state,
                                  max_samples=sample_size,
                                  contamination='auto').fit(
                                      sample, max_depth=100000000)
            # Mean leaf depth of each point, averaged over the forest.
            depths = np.mean(clf._compute_actual_depth_leaf(sample)[0], axis=0)

            # Locate the modal depth with a unit-width histogram;
            # x[:-1] keeps the left bin edges.
            bins = np.arange(int(depths.min()), int(depths.max() + 2))
            y, x = np.histogram(depths, bins=bins)
            y, x = y + 1, x[:-1]
            break_point = np.argmax(y)

            # Collect (log2 of sample size, modal depth) pairs.
            x_arr.append([i])
            y_arr.append(x[break_point])

        # Model modal depth as linear in log2(sample size), no intercept.
        self.reg = LinearRegression(fit_intercept=False).fit(x_arr, y_arr)
        # Final forest on the full data with the user-chosen depth cap.
        self.clf = IsolationForest(random_state=self.random_state,
                                   max_samples=len(X),
                                   contamination='auto').fit(
                                       X, max_depth=self.max_depth)

        return self

    def average_path_length(self, n):
        # Estimate the average path length for a subsample of size n
        # using the regression fitted in fit() (the input is log2(n)).
        n = np.array(n)
        apl = self.reg.predict(np.log2([n]).T)
        apl[apl < 1] = 1  # a path length can never be shorter than 1
        return apl

    def decision_function(self, X):
        # Per-tree depths and the sizes of the leaves each point falls
        # into (again via the patched IsolationForest API).
        depths, leaves = self.clf._compute_actual_depth_leaf(X)

        # Depth of each point plus the estimated average path length of
        # the leaf it landed in, summed over all trees.
        new_depths = np.zeros(X.shape[0])
        for d, l in zip(depths, leaves):
            new_depths += d + self.average_path_length(l)

        # Anomaly score s(x, n) = 2**(-E[h(x)] / c(n)) from the iForest
        # paper, with c(n) replaced by the regression estimate.
        scores = 2**(-new_depths / (len(self.clf.estimators_) *
                                    self.average_path_length([self.n_sample])))
        return scores
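
For reference, the original iForest paper defines the path-length
normalizer c(n) analytically instead of estimating it by regression as
globOut does. A minimal self-contained sketch of that formula and the
resulting anomaly score (c_factor and iforest_score are illustrative
names, not part of the code above):

import numpy as np

EULER_GAMMA = 0.5772156649


def c_factor(n):
    # Average path length of an unsuccessful BST search, c(n) from the
    # iForest paper: c(n) = 2*H(n-1) - 2*(n-1)/n, with H(i) ~ ln(i) + gamma.
    n = np.asarray(n, dtype=float)
    return 2.0 * (np.log(n - 1.0) + EULER_GAMMA) - 2.0 * (n - 1.0) / n


def iforest_score(mean_depth, n):
    # Anomaly score s(x, n) = 2**(-E[h(x)] / c(n)): values close to 1 are
    # anomalous, values around 0.5 or below are normal.
    return 2.0 ** (-np.asarray(mean_depth) / c_factor(n))


# In a sample of 256 points, a point isolated after 5 splits on average
# scores far higher than one that needs 12 splits.
print(iforest_score([5.0, 12.0], 256))  # approximately [0.71, 0.44]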
Example 3
    def __init__(self, n_estimators=100, max_samples=256):
        # Pass parameters by keyword; newer scikit-learn versions
        # reject them as positional arguments.
        self.model = IsolationForest(n_estimators=n_estimators,
                                     max_samples=max_samples)
        self.threshold = 0.6  # Recommended threshold in the iForest paper.
        self.trainedStatus = False
Example 4
import numpy as np
from sklearn.ensemble import IsolationForest


class IFWrapper(object):
    """Wrapper class for the Isolation Forest model.

    Returns the anomaly score of each sample using the IsolationForest
    algorithm.

    IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum
    and minimum values of the selected feature.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a point is equivalent to the
    path length from the root node to a terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of abnormality and our decision function.

    Indeed, random partitioning produces noticeably shorter paths for
    anomalies. Hence, when a forest of random trees collectively produces
    shorter path lengths for particular points, those points are highly
    likely to be anomalies.


    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default=256)
        The number of samples to draw from X to train each base estimator.
            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.

    max_features : int or float, optional (default=1.0)
        The number of features to draw from X to train each base estimator.
            - If int, then draw `max_features` features.
            - If float, then draw `max_features * X.shape[1]` features.

    bootstrap : boolean, optional (default=True)
        Whether samples are drawn with replacement.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.


    Attributes
    ----------
    estimators_ : list of DecisionTreeClassifier
        The collection of fitted sub-estimators.

    estimators_samples_ : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator.

    References
    ----------
    .. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
           Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
    .. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based
           anomaly detection." ACM Transactions on Knowledge Discovery from
           Data (TKDD) 6.1 (2012): 3.

    """

    def __init__(self, n_estimators=100, max_samples=256):
        # Pass parameters by keyword; newer scikit-learn versions
        # reject them as positional arguments.
        self.model = IsolationForest(n_estimators=n_estimators,
                                     max_samples=max_samples)
        self.threshold = 0.6  # Recommended threshold in the iForest paper.
        self.trainedStatus = False
    
    def train(self, data):
        """ Trains the model with data.
        
        Parameters
        ----------
        data : array-like or sparse matrix, shape=(n_samples, n_features)
            The input samples used for training.
            Use ``dtype=np.float32`` for maximum efficiency. Sparse matrices
            are also supported; use a sparse ``csc_matrix`` for maximum
            efficiency.

        Returns
        -------
        None
        """
        self.model.fit(data)
        self.trainingData = data
        self.trainedStatus = True
        
    def getAnomScore(self, data):
        """ Returns the anomaly score of data.

        Parameters
        ----------
        data : array-like or sparse matrix, shape=(n_samples, n_features),
            or a single point. The input samples. Use ``dtype=np.float32``
            for maximum efficiency. Sparse matrices are also supported; use
            a sparse ``csc_matrix`` for maximum efficiency.

        Returns
        -------
        scores : array of shape (n_samples,)
            The anomaly score of the input samples.
            The lower, the more normal.
        """
        data = np.asarray(data)
        if data.ndim == 1:  # a single point: promote it to one row
            data = data.reshape(1, -1)
        # In older scikit-learn versions predict() returned this score
        # directly; current versions return +/-1 labels instead, so take
        # the negated score_samples(), which is the anomaly score of the
        # original paper (higher means more anomalous).
        return -self.model.score_samples(data)
        
    def setThreshold(self, data, percentile):
        """ Sets the anomaly threshold for the model.

        Parameters
        ----------
        data : array-like or sparse matrix, shape=(n_samples, n_features)
            The input samples. Use ``dtype=np.float32`` for maximum
            efficiency. Sparse matrices are also supported; use a sparse
            ``csc_matrix`` for maximum efficiency.

        percentile : float
            The percentile of the scores to use as the new threshold.

        Returns
        -------
        None
        """
        scores = []
        for point in data:
            scores.append(self.getAnomScore(point))
        self.threshold = np.percentile(scores, percentile)
        
    def getTopPercent(self, data, N=1.0):
        """ Returns the top N percent of anomalies. Default is the top
        1 percent.

        Parameters
        ----------
        data : array-like or sparse matrix, shape=(n_samples, n_features)
            The input samples. Use ``dtype=np.float32`` for maximum
            efficiency. Sparse matrices are also supported; use a sparse
            ``csc_matrix`` for maximum efficiency.

        N : float
            The percentage of top-scoring points to return.

        Returns
        -------
        scoreIndex : tuple of index locations of the top N percent of
            anomaly scores
        """
        scores = []
        for point in data:
            point = point.reshape(1, -1)
            scores.append(self.getAnomScore(point))
        # Flatten to a 1-D array so the comparison below broadcasts
        # (comparing a Python list against a float would fail).
        scores = np.asarray(scores).ravel()
        thresh = np.percentile(scores, 100.0 - N)
        return np.where(scores >= thresh)
    
    def getThreshold(self):
        """ Returns the current threshold for the model.
        
        Parameters
        ----------
        None
        
        Returns
        -------
        threshold : the model's current threshold
        """
        return self.threshold
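
A minimal usage sketch of the wrapper above on synthetic data, assuming
the keyword-argument and score_samples fixes and a recent scikit-learn
(the dataset and numbers are purely illustrative):

import numpy as np

rng = np.random.RandomState(42)
# 300 inliers around the origin plus 10 obvious outliers.
X = np.vstack([rng.normal(0, 1, size=(300, 2)),
               rng.uniform(6, 8, size=(10, 2))])

wrapper = IFWrapper(n_estimators=100, max_samples=256)
wrapper.train(X)

scores = wrapper.getAnomScore(X)           # higher = more anomalous
wrapper.setThreshold(X, percentile=95.0)   # threshold at the 95th percentile
top_idx = wrapper.getTopPercent(X, N=5.0)  # indices of the top 5% of scores
print(wrapper.getThreshold(), top_idx[0])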