Example #1
0
class IForest:
    """Isolation-forest outlier detector working on FDataGrid objects (scikit-fda).

    One interface over three back ends, selected from `functional` and from
    the dimension of the codomain of the data given to `fit`:
        - functional=False : scikit-learn `IsolationForest`, fed with the
          discretised curves (univariate functional data only)
        - functional=True, univariate data : `FIF.FIForest`
        - functional=True, multivariate data : `MFIF.MFIForest`

    Arguments :
        - contamination : contamination level of the dataset
        - functional : whether to use a functional implementation of the algorithm
        - params : extra keyword arguments forwarded to the underlying model
    Methods :
        - fit : fit the outlier detection model to a training data
        - predict : predict the labels for a testing data
        - score_samples : returns the anomaly scores for a testing data
        - eval_performances : computes classification metrics to evaluate the model in a testing data
        - plot_detection : plots the results of the outlier detection
        - plot_scores : plots the sorted anomaly scores together with the threshold

    Scores and labels are cached per testing set. The cache is keyed on object
    identity (`is`) rather than `==`: depending on the scikit-fda version,
    `==` between FDataGrid objects may be element-wise, which has no single
    truth value.
    """

    def __init__(self, contamination, functional: bool = False, **params):
        self.params = params
        self.functional = functional
        self.contamination = contamination
        self.is_scored = False
        # Testing sets for which self.scores / self.y_pred are currently valid.
        self._scores_for = None
        self._labels_for = None
        if not functional:
            # In a non functional context, we use IsolationForest from scikit-learn
            self.model = IsolationForest(**self.params, contamination=self.contamination)

    def _model_input(self, fd: "FDataGrid"):
        """Return the array handed to the underlying model for `fd`.

        Univariate data : `fd.data_matrix[..., 0]`.
        Multivariate data : `fd.data_matrix` with its last two axes swapped
        (functional back ends only).

        Raises :
            - ValueError : multivariate data in a non functional context
        """
        if fd.dim_codomain == 1:
            return fd.data_matrix[..., 0]
        if not self.functional:
            # multivariate functional data cannot be fed to this model
            raise ValueError("Functional Data must be univariate")
        return np.transpose(fd.data_matrix, axes=(0, 2, 1))

    def fit(self, fd_train: "FDataGrid"):
        """Fit the model on a training set.

        Arguments :
            - fd_train : FDataGrid
        Raises :
            - ValueError : multivariate data in a non functional context
        """
        X = self._model_input(fd_train)
        if not self.functional:
            self.model.fit(X)
        elif fd_train.dim_codomain == 1:
            # univariate functional data
            self.model = FIF.FIForest(X, time=fd_train.sample_points[0],
                                      innerproduct='auto', **self.params)
        else:
            # multivariate functional data
            self.model = MFIF.MFIForest(X, time=fd_train.sample_points[0],
                                        innerproduct='auto1', **self.params)
        # Anything cached so far was produced by the previous model.
        self.is_scored = False
        self._scores_for = None
        self._labels_for = None

    def predict(self, fd_test: "FDataGrid"):
        """Predicts the labels in a given testing set
        Arguments :
            - fd_test : FDataGrid
        Returns :
            - y_pred : predicted labels. Either 1 or -1 if the sample is considered as
                an inlier (1) or outlier (-1)
        Raises :
            - ValueError : multivariate data in a non functional context"""

        if self.functional:
            # The functional back ends label the samples from their scores, so
            # reuse (or compute and cache) the scores of this testing set.
            scores = self.score_samples(fd_test)
            self.y_pred = self.model.predict_label(scores, contamination=self.contamination)
        else:
            self.y_pred = self.model.predict(self._model_input(fd_test))
        self.fd_test = fd_test
        self._labels_for = fd_test
        return self.y_pred

    def score_samples(self, fd_test: "FDataGrid", return_threshold: bool = False):
        """Returns the anomaly scores of samples in a testing set
        Arguments :
            - fd_test : FDataGrid
            - return_threshold : whether to also return the value of the threshold based on
                the contamination level, i.e. the 100 * (1 - contamination) percentile of the scores
        Returns :
            - scores : np.array of the anomaly scores of each sample in fd_test
            - threshold : only if return_threshold is true
        Raises :
            - ValueError : multivariate data in a non functional context"""

        if not (self.is_scored and self._scores_for is fd_test):
            X = self._model_input(fd_test)
            if self.functional:
                self.scores = self.model.compute_paths(X)
            else:
                # Sign flipped with respect to the scikit-learn model output.
                self.scores = -self.model.score_samples(X)
            self.is_scored = True
            self._scores_for = fd_test
        self.fd_test = fd_test

        if return_threshold:
            return self.scores, np.percentile(self.scores, 100 * (1 - self.contamination))
        return self.scores

    def eval_performances(self, fd_test: "FDataGrid", y_test: np.ndarray):
        """Evaluate the performances of the model in a given testing set.

        Scores and labels already computed for this very `fd_test` object are
        reused; otherwise they are computed first. The result is whatever the
        module-level function `_evaluate(scores, y_pred, y_test)` returns.
        """

        self.score_samples(fd_test)
        if self._labels_for is not fd_test:
            self.predict(fd_test)
        return _evaluate(self.scores, self.y_pred, y_test)

    def plot_detection(self, fd_test: "FDataGrid", plot_interaction: bool = False):
        """Plot the curves of a testing set coloured by their predicted label.

        Arguments :
            - fd_test : FDataGrid
            - plot_interaction : call `plot_interaction` of the curve analysis
                instead of `plot_grids`
        """
        if self._labels_for is not fd_test:
            # predictions haven't been done in this testing set
            self.predict(fd_test)
        curve_analysis = fda_feature.CurveAnalysis(fd_test)
        # Recode the labels: -1 (outlier) -> 0, 1 (inlier) stays 1, so that
        # they index target_names.
        targets = np.array(self.y_pred, dtype='int')
        targets[targets == -1] = 0
        if plot_interaction:
            plot = curve_analysis.plot_interaction
        else:
            plot = curve_analysis.plot_grids
        plot(targets=targets, target_names=["outlier", "inlier"])

    def plot_scores(self, fd_test: "FDataGrid", targets=None, target_names=None):
        """Scatter plot of the sorted anomaly scores with the threshold line.

        Arguments :
            - fd_test : FDataGrid
            - targets : optional class index of each sample (integers starting at 0),
                used to colour the points
            - target_names : optional legend label of each class; defaults to the
                class indices when targets is given
        """

        # score the samples and get the threshold value
        scores, self.threshold = self.score_samples(fd_test, return_threshold=True)
        scores = np.asarray(scores)
        n_samples = len(scores)
        order = np.argsort(scores)
        # ranks[i] is the position of sample i once the scores are sorted
        ranks = np.argsort(order)

        if targets is not None:
            targets = np.ravel(np.asarray(targets))
            if target_names is None:
                target_names = [str(k) for k in range(int(targets.max()) + 1)]
            n_targets = len(target_names)
            col_map = [cm.jet(i) for i in np.linspace(0, 1, n_targets)]
            for k in range(n_targets):
                members = np.flatnonzero(targets == k)
                plt.scatter(ranks[members], scores[members], color=col_map[k])
                # empty line used only to get one legend entry per class
                plt.plot([], [], color=col_map[k], label=target_names[k])
        else:
            plt.scatter(range(n_samples), scores[order], color="grey")

        plt.hlines(y=self.threshold, xmin=0, xmax=n_samples, linestyle='dashed', label="threshold")
        plt.legend(loc='best')
        plt.title("Anomaly scores of the Test curves with outliers' colored")
        plt.xlabel("Index of sorted curves")
        plt.ylabel("Scores")
        plt.show()