def find_outliers_sliding_window(self,
                                 threshold=3, in_place=False, verbose=True,
                                 periods=[[]], excluded_periods=[[]],
                                 component='NE', window_len=15, automatic=True):
    """
    Find outliers using sliding windows

    The series restricted by ``periods`` / ``excluded_periods`` is smoothed
    with ``smooth(window_len=window_len)``. A sample is flagged when, on one
    of the selected components, the absolute residual (data minus smoothed
    data) exceeds ``threshold`` times the median absolute first difference of
    that component in ``self.data``. Flagged samples are removed, the series
    is smoothed again and the test is repeated, for at most 6 passes or until
    a pass flags nothing. The dates (column 0) of all flagged samples are then
    mapped back onto ``self.data`` with ``get_index_from_dates`` (tol=0.25).

    When the series has ``window_len`` samples or fewer, no detection is run
    and the current content of ``self.outliers`` is kept.

    :param threshold        : multiplier applied to the median absolute first difference (default 3)
    :param in_place         : if True, store the result in self.outliers and return self;
                              otherwise return a copy of self carrying the result (default False)
    :param verbose          : print one line per pass and a final count (default True)
    :param periods          : passed unchanged to extract_periods
    :param excluded_periods : passed unchanged to exclude_periods
    :param component        : string; the letters 'N', 'E', 'U' select data columns 1, 2, 3 (default 'NE')
    :param window_len       : window length handed to smooth (default 15)
    :param automatic        : accepted for compatibility; not used by this implementation
    :return                 : self if in_place, else a new object obtained from self.copy()
    """

    # highest value of the 0-based pass counter: at most itermax + 1 passes
    itermax = 5

    def residuals_of(ts):
        # residual series = ts minus its sliding-window smoothed version
        smoothed = ts.smooth(window_len=window_len)
        residual = ts.copy()
        residual.data[:, 1:4] = ts.data[:, 1:4] - smoothed.data[:, 1:4]
        return residual

    if self.data.shape[0] > window_len:
        loutliers_dates = []

        # The detection scale only depends on self.data, so it is computed
        # once instead of being recomputed identically at every pass.
        medians = np.median(
            np.abs(np.diff(self.data[:, 1:4], n=1, axis=0)), axis=0)

        # (data column, scale) for each requested component
        selected = [(column, medians[column - 1])
                    for column, letter in ((1, 'N'), (2, 'E'), (3, 'U'))
                    if letter in component]

        new_ts = self.extract_periods(periods).exclude_periods(
            excluded_periods)
        residual_ts = residuals_of(new_ts)

        for i in range(itermax + 1):
            flagged = set()
            for column, scale in selected:
                flagged.update(
                    np.where(np.abs(residual_ts.data[:, column]) >
                             threshold * scale)[0].tolist())
            # sorted so that the order of the result is deterministic
            loutliers = sorted(flagged)

            if verbose:
                print((
                    "-- Outliers detection pass #%02d : %03d new outliers detected"
                    % (i, len(loutliers))))

            # nothing flagged: the detection has converged
            if not loutliers:
                break

            # indices are relative to new_ts, which shrinks at every pass,
            # so outliers are remembered by date rather than by index
            loutliers_dates += new_ts.data[loutliers, 0].tolist()

            # last allowed pass: no need to prepare another residual series
            if i == itermax:
                break

            new_ts.outliers = loutliers
            new_ts = new_ts.remove_outliers()
            residual_ts = residuals_of(new_ts)

        if verbose:
            print("-- ", len(loutliers_dates), " outliers found")

        loutliers_index = get_index_from_dates(loutliers_dates,
                                               self.data,
                                               tol=0.25)
    else:
        # series too short for the window: keep the outliers already flagged
        # (copied so that the returned object never shares the list with self)
        loutliers_index = list(self.outliers)

    if in_place:
        self.outliers = loutliers_index
        return self

    new_Gts = self.copy()
    new_Gts.outliers = loutliers_index
    return new_Gts
def find_outliers_around_date(self,
                              date,
                              conf_level=95,
                              n=3,
                              lcomponent='NE',
                              verbose=True):
    """
    Find an outlier around a given date

    The series ``self.detrend().remove_outliers()`` is restricted with
    ``extract_ndates_around_date(date, n)``. For each requested component the
    sample farthest from the component median is the suspected outlier, and an
    F-ratio test compares the scatter with and without that sample. If the
    highest score reaches ``conf_level``, the date of the corresponding
    suspect is converted to an index of ``self.data`` with
    ``get_index_from_dates`` (tol=0.25) and appended to ``self.outliers``.

    :param date       : given date
    :param conf_level : confidence level for F_ratio test of outlier significance (default 95%)
    :param n          : number of dates either sides of date (default n=3)
    :param lcomponent : components 'N','E','U','NE','NEU' (default 'NE')
    :param verbose    : print progress messages (default True)
    :return           : self; self.outliers is extended in place when an outlier is accepted
    :raises ValueError: if lcomponent contains none of 'N', 'E', 'U'
    """

    # data column for each component letter, in testing order
    li = [column
          for column, letter in ((1, 'N'), (2, 'E'), (3, 'U'))
          if letter in lcomponent]

    # Fail early with an explicit message instead of a late ValueError from
    # np.max() on an empty score list.
    if not li:
        raise ValueError(
            "lcomponent must contain at least one of 'N', 'E', 'U'; got %r"
            % (lcomponent, ))

    # import
    import numpy as np
    from scipy.stats import f as f_distribution
    from pyacs.gts.Gts import get_index_from_dates

    if verbose:
        print((
            "-- Searching outlier around date %10.5lf on components %s with confidence level %6.1lf and %02d samples"
            % (date, lcomponent, conf_level, 2 * n)))

    tmp_gts = self.detrend().remove_outliers().extract_ndates_around_date(
        date, n)
    nn = tmp_gts.data.shape[0]

    score = {}
    llindex = {}

    # F_ratio test
    def f_ratio(chi_square_1, p1, chi_square_2, p2, n):
        """
        returns result of a F_ratio test
        """
        F = ((chi_square_1 - chi_square_2) / (p2 - p1)) / (chi_square_2 /
                                                           (n - p2))
        return f_distribution.cdf(F, p2 - p1, n - p2)

    H_component = {1: 'North', 2: 'East', 3: 'Up'}

    # find outlier
    for i in li:
        # the suspect is the sample farthest from the component median
        deviation = np.abs(tmp_gts.data[:, i] - np.median(tmp_gts.data[:, i]))
        # NOTE(review): index is the tuple returned by np.where; if several
        # samples tie for the maximum it holds more than one position and the
        # %-formatting below presumably fails - confirm whether ties can occur.
        index = np.where(deviation == np.max(deviation))

        if verbose:
            print(("-- suspected outlier at date %10.4lf on component %s " %
                   (tmp_gts.data[index, 0][0], H_component[i])))

        tmp_gts_no_outlier = tmp_gts.copy()
        tmp_gts_no_outlier.outliers = [index]
        tmp_gts_no_outlier.remove_outliers(in_place=True)

        # np.std uses ddof=0 by default, so count * std**2 is the sum of
        # squared deviations from the mean
        chi_square_1 = nn * np.std(tmp_gts.data[:, i])**2
        chi_square_2 = (nn - 1) * np.std(tmp_gts_no_outlier.data[:, i])**2

        # NOTE(review): the sample count given to the test is the nominal
        # 2 * n, not the actual nn - confirm this is intended.
        score[i] = f_ratio(chi_square_1, 1, chi_square_2, 2, 2 * n) * 100.0

        # now silenced by verbose=False like every other message
        if verbose:
            print(("-- probability of outlier (F_ratio) %5.2lf%% " %
                   (score[i])))

        llindex[i] = index

    # make decision
    if np.max(list(score.values())) < conf_level:
        if verbose:
            print("-- No significant outlier found")
        return self

    # choose the outlier as the maximum probability (first one wins on ties)
    component = max(li, key=lambda column: score[column])
    date = tmp_gts.data[llindex[component], 0]

    # return the index in the original time series
    if verbose:
        print("=> Getting index for date ", date)
    returned_index = get_index_from_dates([date], self.data, tol=0.25)

    self.outliers += returned_index
    return self
def find_outliers_vondrak(self, threshold=10, fc=2., in_place=False, verbose=True,
                          periods=[[]], excluded_periods=[[]], component='NE'):
    """
    Find outliers using a Vondrak filter

    The series restricted by ``periods`` / ``excluded_periods`` is detrended
    and filtered with ``vondrak(fc, component=component)``. A sample is
    flagged when, on one of the selected components, the absolute residual
    (detrended data minus filtered data) exceeds ``threshold`` times the
    median absolute residual of that component. The dates (column 0) of the
    flagged samples are mapped back onto ``self.data`` with
    ``get_index_from_dates`` (tol=0.25).

    When nothing is flagged, the current content of ``self.outliers`` is kept.

    :param threshold        : multiplier applied to the median absolute residual (default 10)
    :param fc               : first argument handed to vondrak (default 2.)
    :param in_place         : if True, store the result in self.outliers and return self;
                              otherwise return a copy of self carrying the result (default False)
    :param verbose          : print the number of detected outliers; also forwarded to vondrak (default True)
    :param periods          : passed unchanged to extract_periods
    :param excluded_periods : passed unchanged to exclude_periods
    :param component        : string; the letters 'N', 'E', 'U' select data columns 1, 2, 3 (default 'NE')
    :return                 : self if in_place, else a new object obtained from self.copy()
    """

    # keep selected period
    tmp_ts = self.extract_periods(periods).exclude_periods(
        excluded_periods).detrend()

    # calculates vondrak filter
    vondrak_ts = tmp_ts.vondrak(fc, component=component, verbose=verbose)

    # absolute residuals; array columns 0, 1, 2 hold data columns 1, 2, 3
    abs_residuals = np.abs(tmp_ts.data[:, 1:4] - vondrak_ts.data[:, 1:4])

    # get the median
    medians = np.median(abs_residuals, axis=0)

    # get the outliers
    flagged = set()
    for position, letter in enumerate('NEU'):
        if letter in component:
            flagged.update(
                np.where(abs_residuals[:, position] >
                         threshold * medians[position])[0].tolist())
    # sorted so that the order of the result is deterministic
    loutliers = sorted(flagged)

    if verbose:
        print((
            "-- Outliers detection using Vondrak filter with fc=%.2lf : %03d new outliers detected"
            % (fc, len(loutliers))))

    if loutliers:
        # indices are relative to tmp_ts: go through dates to get the
        # outliers index in the original time series
        loutliers_dates = tmp_ts.data[loutliers, 0].tolist()
        loutliers_index = get_index_from_dates(loutliers_dates,
                                               self.data,
                                               tol=0.25)
    else:
        # nothing new: keep the outliers already flagged (copied so that the
        # returned object never shares the list with self)
        loutliers_index = list(self.outliers)

    # return
    if in_place:
        self.outliers = loutliers_index
        return self

    new_Gts = self.copy()
    new_Gts.outliers = loutliers_index
    return new_Gts