Example #1
import sys
from collections import deque

from sklearn.metrics import cohen_kappa_score
from skmultiflow.drift_detection.adwin import ADWIN


class AdaptiveTree(object):
    def __init__(self,
                 tree,
                 kappa_window,
                 warning_delta,
                 drift_delta,
                 tree_pool_id=-1):
        self.tree_pool_id = tree_pool_id
        self.tree = tree
        self.bg_adaptive_tree = None
        self.is_candidate = False
        self.warning_detector = ADWIN(warning_delta)
        self.drift_detector = ADWIN(drift_delta)
        self.predicted_labels = deque(maxlen=kappa_window)
        self.kappa = -sys.maxsize
        self.kappa_window = kappa_window

    def update_kappa(self, actual_labels):
        if len(self.predicted_labels) < self.kappa_window:
            self.kappa = -sys.maxsize
        else:
            self.kappa = cohen_kappa_score(actual_labels, self.predicted_labels)
        return self.kappa

    def reset(self):
        self.bg_adaptive_tree = None
        self.is_candidate = False
        self.warning_detector.reset()
        self.drift_detector.reset()
        self.predicted_labels.clear()
        self.kappa = -sys.maxsize
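
A minimal usage sketch (hypothetical: it assumes skmultiflow's HoeffdingTreeClassifier as the wrapped learner, but any incremental classifier works):

from skmultiflow.trees import HoeffdingTreeClassifier

at = AdaptiveTree(tree=HoeffdingTreeClassifier(),
                  kappa_window=4,
                  warning_delta=0.001,
                  drift_delta=0.0001)

# In practice the labels would come from at.tree.predict(x); here they are
# appended directly to keep the sketch short.
actual = [0, 1, 1, 0]
for label in actual:
    at.predicted_labels.append(label)

print(at.update_kappa(actual))  # 1.0 once the window is full (perfect agreement)
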
Example #2
import numpy as np

from skmultiflow.core import BaseSKMObject, ClassifierMixin
from skmultiflow.drift_detection.adwin import ADWIN

# NOTE: IsolationTreeEnsemble and c() (the average path length function) are
# assumed to come from the accompanying iForestASD module; the import path
# below is illustrative.
from iforest import IsolationTreeEnsemble, c


class SADWINIsolationForestStream(BaseSKMObject, ClassifierMixin):
    """
    This class implements the Anomaly Detection Approach Based on Isolation Forest
    Algorithm for Streaming Data Using Sliding Window (Ding & Fei, 2013) [3].

    Each sample gets an anomaly score computed with the isolation-based approach [2].
    The concept of Isolation Forest [1] consists in isolating observations by
    randomly selecting a feature and then randomly selecting a split value between
    the minimum and maximum values of the selected feature.

    The model is updated if a drift is detected, based on an input drift threshold.
    The drift detection approach, proposed in [3], works as follows: if the averaged
    anomaly score between two successive sliding windows is higher than the drift
    threshold (u), the previous model is completely discarded and a new isolation
    forest is built on the latest sliding window of the stream.

    Parameters
    ----------
    n_estimators: int, optional (default=25)
        Number of trees in the ensemble ('t' in the original paper).

    window_size: int, optional (default=100)
        The window size of the stream (ψ, 'Psi' in the original paper).

    anomaly_threshold: double, optional (default=0.5)
        The threshold for declaring anomalies. Any instance with a prediction
        probability above this threshold is declared an anomaly.

    drift_threshold: double, optional (default=0.5)
        The threshold (u) for detecting drift and updating the model. If the
        averaged anomaly score between two successive sliding windows is higher
        than u, the previous model is completely discarded and a new isolation
        forest is built on the latest sliding window of the stream. This
        parameter is supposed to be set by a domain expert and depends on the
        data set.

    Attributes
    ----------
    ensemble : IsolationTreeEnsemble
        The current model for IsolationForestStream.

    samples_seen : int
        Number of samples seen since the last update.

    anomaly_rate : float
        Rate of anomalies in the previous sliding window (AnomalyRate in the
        original iForestASD paper).

    prec_window & window : numpy.ndarray of shape (window_size, n_features)
        The previous and current windows of data.

    cpt : int
        Counter; the ensemble is fitted while it is lower than n_estimators.

    References
    ----------

    [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation Forest."
        Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.

    [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based anomaly
        detection." ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012).

    [3] Ding, Zhiguo. (2013) An Anomaly Detection Approach Based on Isolation Forest Algorithm for Streaming Data Using Sliding Window. 12-17. 10.3182/20130902-3-CN-3020.00044. 

    """
    def __init__(
            self,
            window_size=100,
            n_estimators=25,
            anomaly_threshold=0.5,
            drift_threshold=0.5,
            random_state=None,
            version="AnomalyRate",
            #Parameters for partial model update
            n_estimators_updated=0.5,
            updated_randomly=True,
            #Parameters for NDKSWIN
            alpha=0.01,
            data=None,
            n_dimensions=1,
            n_tested_samples=0.1,
            fixed_checked_dimension=False,
            fixed_checked_sample=False):

        super().__init__()

        self.n_estimators = n_estimators
        self.ensemble = None
        self.random_state = random_state
        self.window_size = window_size
        self.samples_seen = 0
        self.anomaly_rate = 0.20
        self.anomaly_threshold = anomaly_threshold
        self.drift_threshold = drift_threshold
        self.window = None
        self.prec_window = None
        self.cpt = 0
        self.version = version

        # Track model updates per window: 0 for "not updated", 1 for "updated".
        self.model_update = []
        self.model_update_windows = []
        # Initialisation, to know the concerned version of IForestASD
        self.model_update.append(version)
        # Initialisation, to know the number of samples seen in the window
        self.model_update_windows.append("samples_seen_" + version)

        # Validate before use: the fraction of new trees to compute when
        # updating on a new window must lie in (0, 1].
        if n_estimators_updated <= 0.0 or n_estimators_updated > 1.0:
            raise ValueError("n_estimators_updated must be > 0 and <= 1")
        self.n_estimators_updated = int(self.n_estimators * n_estimators_updated)

        # How to choose the trees to keep on a partial update: True to pick them
        # randomly, False to pick the first
        # (n_estimators - int(n_estimators * n_estimators_updated)) trees.
        self.updated_randomly = updated_randomly

        # Parameters for NDKSWIN
        self.alpha = alpha
        self.n_dimensions = n_dimensions
        self.n_tested_samples = n_tested_samples
        self.fixed_checked_dimension = fixed_checked_dimension
        self.fixed_checked_sample = fixed_checked_sample
        self.first_time_fit = True

        # TODO Maurras 27112020: Find a way to optimize the use of ADWIN()
        self.adwin = ADWIN()
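    # Worked example of the partial-update sizing above (illustrative values):
    # with n_estimators=25 and n_estimators_updated=0.5, an update would recompute
    # int(25 * 0.5) = 12 trees and keep the remaining 13, chosen at random when
    # updated_randomly is True.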

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially (incrementally) fit the model.
          Parameters
          ----------
          X : numpy.ndarray of shape (n_samples, n_features)
              The features to train the model.
          y: numpy.ndarray of shape (n_samples)
              An array-like with the class labels of all samples in X.
          classes: None
              Not used by this method.
          sample_weight: None
              Not used by this method.
          Returns
          -------
          self
          """

        ## get the number of observations
        number_instances, _ = X.shape

        if self.samples_seen == 0:
            ## ToDo ? Give a sample of self.window_size in attribute of iForest
            self.ensemble = IsolationTreeEnsemble(self.window_size,
                                                  self.n_estimators,
                                                  self.random_state)

        for i in range(number_instances):
            self._partial_fit(X[i], y[i])

        return self

    def _partial_fit(self, X, y):
        """ Trains the model on sample X and corresponding target y.
        Private function where the actual training is carried out.

        Reshape X and add it to the current window if it isn't full. When the
        window is full, move it to prec_window and start a new one. At every
        full window, fit the ensemble if it has not been fit yet; otherwise
        check the anomaly scores of prec_window and update the model if a
        drift is detected (via ADWIN for the "SADWIN" version).

        Parameters
        ----------
        X: numpy.ndarray of shape (1, n_features)
            Instance attributes.
        y: int
            Class label for sample X. Not used in this implementation, which
            is unsupervised.
        """
        X = np.reshape(X, (1, len(X)))

        if self.samples_seen % self.window_size == 0:
            ## Update the two windows (previous and current)
            self.prec_window = self.window
            self.window = X
        else:
            self.window = np.concatenate((self.window, X))

        if self.samples_seen % self.window_size == 0 and self.samples_seen != 0:

            if self.first_time_fit:  # It is the first full window: fit the ensemble
                self.ensemble.fit(self.prec_window)
                self.first_time_fit = False

            elif self.version == "SADWIN":
                # TODO MAJ Maurras 04112020: Modify the way to detect the concept
                # drift using the ADWIN() function available in scikit-multiflow.
                prec_window_scores = self.ensemble.anomaly_score(self.prec_window)
                # Feed the anomaly scores of the previous window to ADWIN one by
                # one; stop as soon as a change is detected.
                drift_detected = False
                for score in prec_window_scores:
                    self.adwin.add_element(score[0])
                    if self.adwin.detected_change():
                        drift_detected = True
                        break
                if drift_detected:
                    self.model_update.append(1)
                    self.model_update_windows.append(self.samples_seen)
                    self.update_model(self.prec_window)
                    self.adwin.reset()
                else:
                    self.model_update.append(0)
                    self.model_update_windows.append(self.samples_seen)

        self.samples_seen += 1

    def update_model(self, window):
        """ Update the model: re-initialize the attributes and the ensemble, then
        fit a new isolation forest on the given window. Called when a drift has
        been detected (e.g. when the anomaly rate in the previous sliding window
        is higher than self.drift_threshold).

        Parameters
        ----------
        window: numpy.ndarray of shape (self.window_size, n_features)
        """
        ## ToDo ? Give a sample of self.window_size in attribute of iForest
        # MAJ Maurras 03112020: No, leave it like that. Must give the whole
        # window to construct the forest of itrees.
        self.is_learning_phase_on = True
        self.ensemble = IsolationTreeEnsemble(self.window_size, self.n_estimators,
                                              self.random_state)
        self.ensemble.fit(window)
        print("")
        print("The model was updated by training a new iForest with the version: "
              + self.version)

    def anomaly_scores_rate(self, window):
        """
        Given a 2D matrix of observations, compute the anomaly score of every
        instance and return the anomaly rate of the given window.

        Parameters
        ----------
        window: numpy.ndarray of shape (self.window_size, n_features)
        """
        # Anomaly score s(x, n) = 2 ** (-E(h(x)) / c(n)) from [1], where
        # path_length gives the expected path length E(h(x)) and c(n) is the
        # average path length of an unsuccessful BST search.
        score_tab = 2.0**(-1.0 * self.ensemble.path_length(window) /
                          c(len(window)))
        # The anomaly rate is the fraction of instances scoring above the threshold.
        score = 0
        for x in score_tab:
            if x > self.anomaly_threshold:
                score += 1
        return score / len(score_tab)
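    # Worked example of the rate above (illustrative numbers): with
    # anomaly_threshold=0.5 and scores [0.3, 0.7, 0.9, 0.4], two of the four
    # instances exceed the threshold, so the window's anomaly rate is 2/4 = 0.5;
    # per the iForestASD rule, the model would be retrained only if this rate
    # exceeded drift_threshold (u).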

    # MAJ: 21112020
    # By: Maurras
    # Added a new function to classify instances (anomaly or normal).

    def predict_simple(self, X):
        """
        Given a window, predict the class (1 or 0) of each instance by using
        predict_from_instances_scores on the current model.
        """
        ## return the predictions of all instances
        prediction = self.ensemble.predict_from_instances_scores(
            self.ensemble.anomaly_score(X), self.anomaly_threshold)
        return prediction

    def predict(self, X):
        """
        Given an instance, predict whether it is an anomaly (1 or 0) based on
        the last sample of the window, using predict_proba if the model has
        been fit; otherwise return [-1].
        """
        if self.samples_seen <= self.window_size:
            return [-1]  ## Not enough samples seen yet to make a prediction

        X = np.reshape(X, (1, len(X[0])))
        ## Append the instance to the sliding window
        self.prec_window = np.concatenate((self.prec_window, X))

        prediction = self.ensemble.predict_from_anomaly_scores(
            self.predict_proba(self.prec_window),
            self.anomaly_threshold)  ## returns 0 or 1

        return [prediction]

    def predict_proba(self, X):
        """
        Calculate the anomaly score of the last instance in the window if the
        model has been fit; otherwise return [-1].

        Parameters
        ----------
        X: numpy.ndarray of shape (self.window_size, n_features)
        """
        if self.samples_seen <= self.window_size:
            return [-1]
        # anomaly_score returns an array with the score of every instance in
        # the window; taking [-1] gives the score of the last instance (X).
        return self.ensemble.anomaly_score(self.prec_window)[-1]
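
A minimal usage sketch for the stream class (assumptions: IsolationTreeEnsemble and its scoring methods come from the accompanying iForestASD module; the data is random and purely illustrative):

import numpy as np

rng = np.random.RandomState(42)
X = rng.rand(300, 2)          # three windows of 100 samples each
y = np.zeros(300, dtype=int)  # ignored: the model is unsupervised

model = SADWINIsolationForestStream(window_size=100, n_estimators=25,
                                    version="SADWIN", random_state=42)
model.partial_fit(X, y)       # fits on the first full window, then monitors drift
print(model.predict(X[:1]))   # [-1] until more than window_size samples are seen,
                              # afterwards a 0/1 anomaly prediction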
Example #3

# A drift-monitoring loop from a model-evaluation script. Assumptions:
# results_dict holds per-period 'y_true', 'Predictions' and 'Date' series;
# xgboost_model, start_test_date, list_drift and the flags are defined earlier
# in the original script.
import datetime
from datetime import timedelta

import pandas as pd
from skmultiflow.drift_detection.adwin import ADWIN

adwin = ADWIN()
temp_drifts = []

df_results = pd.DataFrame({
    'y_true': results_dict['y_true'][-1],
    'y_pred': results_dict['Predictions'][-1]
})
df_results['Correct'] = (df_results['y_true'] == df_results['y_pred'])

# Feed per-sample correctness to ADWIN; record the date of every detected change.
for i in range(df_results.shape[0]):
    adwin.add_element(df_results['Correct'].iloc[i])
    if adwin.detected_change():
        print('Change detected ADWIN in data: ' +
              str(df_results['Correct'].iloc[i]) + ' - at date: ' +
              str(results_dict['Date'][-1].iloc[i]))
        temp_drifts.append(results_dict['Date'][-1].iloc[i])
        adwin.reset()

if not temp_drifts:
    print('No Drift Detected - Predict next three months')
    start_test_date = start_test_date + pd.DateOffset(months=3)
    training_flag = False
    update_flag = False

if temp_drifts:
    print('Drift detected - Choice on model')
    list_drift.append(temp_drifts[0])
    # Retrain on a two-year window ending at the first detected drift.
    start_train_date = temp_drifts[0] - pd.DateOffset(years=2)
    start_test_date = start_train_date + pd.DateOffset(years=2)

    if (temp_drifts[0] - datetime.date(
            xgboost_model.results['Training'][-1])) > timedelta(days=365):