Beispiel #1
0
def find_outliers_sliding_window(self,
                                 threshold=3, in_place=False, verbose=True,
                                 periods=[[]], excluded_periods=[[]],
                                 component='NE', window_len=15, automatic=True):
    ###############################################################################
    """
    Find outliers using sliding windows

    The series restricted to periods / excluded_periods is smoothed with a
    window of window_len samples. A sample is flagged when the absolute
    residual (series minus smoothed series) of a selected component exceeds
    threshold times the median absolute first difference of that component,
    taken over the whole of self.data. Flagged samples are removed and the
    detection is repeated until a pass finds nothing or 6 passes have run.

    Columns 1, 2 and 3 of self.data are treated as North, East and Up, and
    column 0 as the date used to map flagged samples back onto self.data.

    :param threshold        : multiplier applied to the median absolute first difference (default 3)
    :param in_place         : if True, set self.outliers and return self; otherwise return a copy
                              carrying the outliers (default False)
    :param verbose          : print progress messages (default True)
    :param periods          : passed to extract_periods (default [[]]); never modified here
    :param excluded_periods : passed to exclude_periods (default [[]]); never modified here
    :param component        : string holding any of 'N', 'E', 'U' (default 'NE')
    :param window_len       : smoothing window length; a series with no more than window_len
                              samples is not searched and keeps its current outliers (default 15)
    :param automatic        : not used by this implementation, kept for interface compatibility
    :return                 : self (in_place=True) or a new time series with .outliers set
    """

    if self.data.shape[0] > window_len:

        # passes are numbered 0..itermax, i.e. at most itermax + 1 passes
        itermax = 5

        loutliers_dates = []

        # Detection scale per component. It is derived from the original,
        # unedited series, so it is the same for every pass and is computed
        # only once.
        diff_data = np.diff(self.data[:, 1:4], n=1, axis=0)
        medians = np.median(np.abs(diff_data), axis=0)

        new_ts = self.extract_periods(periods).exclude_periods(
            excluded_periods)

        for i in range(itermax + 1):

            smooth = new_ts.smooth(window_len=window_len)
            residuals = np.abs(new_ts.data[:, 1:4] - smooth.data[:, 1:4])

            # row indices (in new_ts) exceeding the threshold, per component
            flagged = []
            for column, letter in enumerate('NEU'):
                if letter in component:
                    flagged += np.where(
                        residuals[:, column] > threshold *
                        medians[column])[0].tolist()

            loutliers = list(set(flagged))

            if verbose:
                print((
                    "-- Outliers detection pass #%02d : %03d new outliers detected"
                    % (i, len(loutliers))))

            # Row numbers change as samples are removed, so outliers are
            # remembered by date and converted to indices at the end.
            loutliers_dates += new_ts.data[loutliers, 0].tolist()

            # Nothing new was found: further passes would be identical.
            if not loutliers:
                break

            new_ts.outliers = loutliers
            new_ts = new_ts.remove_outliers()

        if verbose: print("-- ", len(loutliers_dates), " outliers found")

        loutliers_index = get_index_from_dates(loutliers_dates,
                                               self.data,
                                               tol=0.25)

    else:
        # too short to be smoothed: keep whatever is already flagged
        loutliers_index = self.outliers

    if in_place:
        self.outliers = loutliers_index
        return (self)

    # copy only when a new object is actually returned
    new_Gts = self.copy()
    new_Gts.outliers = loutliers_index
    return (new_Gts)
def find_outliers_around_date(self,
                              date,
                              conf_level=95,
                              n=3,
                              lcomponent='NE',
                              verbose=True):
    ###################################################################
    """
    Find an outlier around a given date and flag it in self.outliers

    The time series is detrended, its already flagged outliers are removed
    and the samples around date are extracted. For every requested component
    the sample farthest from the median is the suspect; an F-ratio test
    compares the scatter with and without it. If the best score reaches
    conf_level, the index of that sample in self.data is appended to
    self.outliers.

    self is always modified in place and always returned; when no
    significant outlier is found self.outliers is left unchanged.

    :param date       : given date
    :param conf_level : confidence level for F_ratio test of outlier significance (default 95%)
    :param n          : number of dates either sides of date (default n=3)
    :param lcomponent : components 'N','E','U','NE','NEU' (default 'NE')
    :param verbose    : print progress messages (default True)
    :return           : self
    """

    # import
    import numpy as np

    if verbose:
        print((
            "-- Searching outlier around date %10.5lf on components %s with confidence level %6.1lf and %02d samples"
            % (date, lcomponent, conf_level, 2 * n)))

    H_component = {1: 'North', 2: 'East', 3: 'Up'}

    # data columns to test, in North, East, Up order
    li = [
        column for column, letter in ((1, 'N'), (2, 'E'), (3, 'U'))
        if letter in lcomponent
    ]

    # No valid component requested: there is nothing to test.
    if not li:
        if verbose: print("-- No significant outlier found")
        return (self)

    tmp_gts = self.detrend().remove_outliers().extract_ndates_around_date(
        date, n)
    nn = tmp_gts.data.shape[0]

    # With fewer than two samples no scatter can be compared.
    if nn < 2:
        if verbose: print("-- No significant outlier found")
        return (self)

    score = {}
    llindex = {}

    # F_ratio test
    ###############################################
    def f_ratio(chi_square_1, p1, chi_square_2, p2, n):
        ###############################################
        """
        returns result of a F_ratio test

        chi_square_1 / p1 : chi-square and parameter count of the simpler model
        chi_square_2 / p2 : chi-square and parameter count of the richer model
        n                 : number of samples
        returns the F distribution CDF evaluated at the F ratio
        """
        from scipy.stats import f as f_distribution

        F = ((chi_square_1 - chi_square_2) / (p2 - p1)) / (chi_square_2 /
                                                           (n - p2))
        return (f_distribution.cdf(F, p2 - p1, n - p2))

    # find outlier
    for i in li:

        deviation = np.abs(tmp_gts.data[:, i] - np.median(tmp_gts.data[:, i]))
        # Row of the largest deviation as a plain int (first one on ties),
        # so it can be used as an index, stored in .outliers and formatted.
        index = int(np.argmax(deviation))

        if verbose:
            print(("-- suspected outlier at date %10.4lf on component %s " %
                   (tmp_gts.data[index, 0], H_component[i])))

        tmp_gts_no_outlier = tmp_gts.copy()
        tmp_gts_no_outlier.outliers = [index]
        tmp_gts_no_outlier.remove_outliers(in_place=True)

        chi_square_1 = nn * np.std(tmp_gts.data[:, i])**2
        chi_square_2 = (nn - 1) * np.std(tmp_gts_no_outlier.data[:, i])**2

        # NOTE(review): the sample count given to the test is 2 * n, not the
        # number of samples actually extracted (nn) - confirm this is intended.
        score[i] = f_ratio(chi_square_1, 1, chi_square_2, 2, 2 * n) * 100.0
        if verbose:
            print(("-- probability of outlier (F_ratio) %5.2lf%% " %
                   (score[i])))
        llindex[i] = index

    # make decision

    if np.max(list(score.values())) < conf_level:
        if verbose: print("-- No significant outlier found")
        return (self)

    # choose the outlier as the maximum probability (first component on ties)
    component = max(li, key=score.__getitem__)
    outlier_date = tmp_gts.data[llindex[component], 0]

    # return the index in the original time series
    if verbose:
        print("=> Getting index for date ", outlier_date)

    # imported here so that the early exits above do not need it
    from pyacs.gts.Gts import get_index_from_dates

    returned_index = get_index_from_dates([outlier_date],
                                          self.data,
                                          tol=0.25)
    self.outliers += returned_index

    return (self)
Beispiel #3
0
def find_outliers_vondrak(self, threshold=10, fc=2., in_place=False, verbose=True,
                          periods=[[]], excluded_periods=[[]], component='NE'):
    ###############################################################################
    """
    Find outliers using a Vondrak filter

    The series restricted to periods / excluded_periods is detrended and
    filtered with vondrak(fc). A sample is flagged when the absolute residual
    (detrended series minus filtered series) of a selected component exceeds
    threshold times the median absolute residual of that component.

    Columns 1, 2 and 3 of the data are treated as North, East and Up, and
    column 0 as the date used to map flagged samples back onto self.data.

    :param threshold        : multiplier applied to the median absolute residual (default 10)
    :param fc               : passed as first argument to vondrak() (default 2.)
    :param in_place         : if True, set self.outliers and return self; otherwise return a copy
                              carrying the outliers (default False)
    :param verbose          : print progress messages; also forwarded to vondrak() (default True)
    :param periods          : passed to extract_periods (default [[]]); never modified here
    :param excluded_periods : passed to exclude_periods (default [[]]); never modified here
    :param component        : string holding any of 'N', 'E', 'U'; also forwarded to vondrak()
                              (default 'NE')
    :return                 : self (in_place=True) or a new time series with .outliers set.
                              When nothing is detected the current self.outliers is kept.
    """

    # keep selected period
    tmp_ts = self.extract_periods(periods).exclude_periods(
        excluded_periods).detrend()

    # calculates vondrak filter
    vondrak_ts = tmp_ts.vondrak(fc, component=component, verbose=verbose)

    # absolute residuals for North, East, Up (columns 0, 1, 2 of the result)
    residuals = np.abs(tmp_ts.data[:, 1:4] - vondrak_ts.data[:, 1:4])

    # get the median
    medians = np.median(residuals, axis=0)

    # get the outliers: row indices (in tmp_ts) exceeding the threshold
    flagged = []
    for column, letter in enumerate('NEU'):
        if letter in component:
            flagged += np.where(
                residuals[:, column] > threshold *
                medians[column])[0].tolist()

    loutliers = list(set(flagged))

    if verbose:
        print((
            "-- Outliers detection using Vondrak filter with fc=%.2lf : %03d new outliers detected"
            % (fc, len(loutliers))))

    # get outliers index in original time series
    if loutliers:
        # tmp_ts rows differ from self.data rows, so go through the dates
        loutliers_dates = tmp_ts.data[loutliers, 0].tolist()
        loutliers_index = get_index_from_dates(loutliers_dates,
                                               self.data,
                                               tol=0.25)
    else:
        loutliers_index = self.outliers

    # return
    if in_place:
        self.outliers = loutliers_index
        return (self)

    # copy only when a new object is actually returned
    new_Gts = self.copy()
    new_Gts.outliers = loutliers_index
    return (new_Gts)