def detect_anomalies(ary, freq, p):
    """Use Seasonal Decompose and ESD on residual to detect anomalies.

    Args:
        ary: Series of observations handed to ``sm.tsa.seasonal_decompose``.
        freq: Seasonal frequency; at least ``freq * 2`` observations are
            required, otherwise an empty result is returned.
        p: Significance level forwarded to ``pyasl.generalizedESD``.

    Returns:
        A tuple ``(idx_anoms, n_resid)``: ``idx_anoms`` is a list of plain
        ``int`` positions (relative to the un-cleaned residual, and therefore
        to ``ary``) flagged as anomalies, and ``n_resid`` is the number of
        non-NaN residual values that were tested.
    """
    min_events = freq * 2
    if len(ary) < min_events:
        # logger.warn is a deprecated alias; logger.warning is the supported name.
        logger.warning(
            'Only {} events for frequency of {}. Must be at least {} events.'
            .format(len(ary), freq, min_events))
        logger.warning('Generate empty response.')
        return [], 0

    # Seasonal decompose; only the residual component is used below.
    # NOTE(review): newer statsmodels releases renamed `freq` to `period` --
    # confirm against the pinned statsmodels version before upgrading.
    model = sm.tsa.seasonal_decompose(ary, freq=freq)
    resid = np.asarray(model.resid, dtype=float)

    # ESD does not handle NaNs, so test only the valid residuals but remember
    # where each of them came from. Keeping the original positions (instead of
    # adding a leading-NaN count) stays correct even if a NaN shows up
    # somewhere other than the edges of the residual.
    valid_idx = np.flatnonzero(~np.isnan(resid))
    resid_cleaned = resid[valid_idx]

    # Test for at most 20 outliers, but never ask for more outliers than the
    # sample can support (the test needs points left over after removal).
    max_outliers = min(20, len(resid_cleaned) - 2)
    if max_outliers < 1:
        return [], len(resid_cleaned)

    anomalies = pyasl.generalizedESD(resid_cleaned, max_outliers, p)

    # Map back to positions in the full residual and convert the numpy
    # integers to standard ints so the result is JSON serializable.
    idx_anoms = [int(valid_idx[i]) for i in anomalies[1]]
    return idx_anoms, len(resid_cleaned)
def sanity_ESDexample(self):
    """
    Check the example for the generalized ESD.

    Runs `generalizedESD` on the data set given at
    http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm,
    prints the number and indices of the outliers together with the
    R and Lambda values, and plots the data with the outliers marked.
    """
    import numpy as np
    import matplotlib.pylab as plt
    from PyAstronomy import pyasl

    # Convert the data to a float array. A list comprehension is used because
    # np.array(map(...)) does not produce a numeric array on Python 3, where
    # map returns a lazy iterator.
    x = np.array([float(value) for value in (
        "-0.25 0.68 0.94 1.15 1.20 1.26 1.26 1.34 1.38 1.43 1.49 1.49 "
        "1.55 1.56 1.58 1.65 1.69 1.70 1.76 1.77 1.81 1.91 1.94 1.96 "
        "1.99 2.06 2.09 2.10 2.14 2.15 2.23 2.24 2.26 2.35 2.37 2.40 "
        "2.47 2.54 2.62 2.64 2.90 2.92 2.92 2.93 3.21 3.26 3.30 3.59 "
        "3.68 4.30 4.64 5.34 5.42 6.01").split()])

    # Apply the generalized ESD
    r = pyasl.generalizedESD(x, 10, 0.05, fullOutput=True)

    print("Number of outliers: ", r[0])
    print("Indices of outliers: ", r[1])
    print(" R Lambda")
    for i, (r_value, lambda_value) in enumerate(zip(r[2], r[3])):
        print("%2d %8.5f %8.5f" % ((i + 1), r_value, lambda_value))

    # Plot the "data"
    plt.plot(x, 'b.')
    # and mark the outliers.
    for i in range(r[0]):
        plt.plot(r[1][i], x[r[1][i]], 'rp')
def outliers_finder(llist):
    """Return the elements of *llist* that the generalized ESD test flags.

    Up to a quarter of the values (at least one) are tested as potential
    outliers at a significance level of 0.45.

    Args:
        llist: Sequence of numeric values.

    Returns:
        A list with the flagged elements of ``llist``; empty when nothing is
        flagged or when fewer than three values are supplied.
    """
    # With fewer than three points there is nothing left to compare a
    # candidate against, so report no outliers instead of failing inside ESD.
    if len(llist) < 3:
        return []

    # Floor division: the outlier count must be an integer (a plain `/`
    # produces a float on Python 3).
    number_of_outliers = len(llist) // 4 if len(llist) >= 4 else 1
    r = pyasl.generalizedESD(numpy.array(llist), number_of_outliers, 0.45,
                             fullOutput=True)
    return [llist[i] for i in r[1]]
def res_gesd(res, maxOLs = 15, alpha = 0.05):
    '''
    Flag outliers in loess-curve residuals with the generalized extreme
    studentized deviate (GESD) test.

    Parameters
    ----------
    res: list
        time series of residuals from loess curve fit
    maxOLs: int
        max number of outliers to identify using GESD
    alpha: decimal
        significance level for identifying outlier

    Returns
    -------
    list
        Binary series with same length as input TS. A value of 1 indicates the
        corresponding value in TS is an outlier
    '''
    # second element of the GESD result holds the outlier positions
    gesd_result = pyasl.generalizedESD(res, maxOLs = maxOLs, alpha = alpha,
                                       fullOutput=False)
    flagged_positions = set(gesd_result[1])

    # one flag per input position: 1 for an outlier, 0 otherwise
    return [1 if position in flagged_positions else 0
            for position in range(len(res))]
def gesdFusion(sensor, maxOLs=6):
    """Blank out GESD outliers in every column, then fuse with the mean.

    Each column of ``sensor`` is tested with ``pyasl.generalizedESD`` for up
    to ``maxOLs`` outliers; flagged entries are overwritten with NaN in place
    before the data is passed to ``basicFusion(sensor, 'mean')``.

    Args:
        sensor: Column-keyed collection of readings (iterating it yields the
            column keys). It is modified in place.
        maxOLs: Maximum number of outliers to test for per column.

    Returns:
        Whatever ``basicFusion(sensor, 'mean')`` returns.
    """
    for column in sensor:
        gesd = pyasl.generalizedESD(sensor[column], maxOLs)
        # NOTE(review): the indices in gesd[1] are used directly as keys into
        # sensor[column] -- confirm the column is indexed 0..n-1.
        for outlier in gesd[1]:
            # np.nan is the canonical spelling; the np.NaN alias was removed
            # in NumPy 2.0.
            sensor[column][outlier] = np.nan
    return basicFusion(sensor, 'mean')
def analyze(self, feature_name):
    """Print a generalized-ESD outlier report for one feature.

    Tests ``self.df_variable[feature_name]`` for up to 100 outliers at a
    significance level of 0.1 and prints the outlier count, the outlier
    indices and the value found at each of those indices.
    """
    feature_values = self.df_variable[feature_name]

    # run outlier analysis, limited to the first 100 outlier data points
    esd_result = pyasl.generalizedESD(feature_values, 100, 0.1, fullOutput=True)
    outlier_count = esd_result[0]
    outlier_indices = esd_result[1]

    print('Outlier Analysis for : ' + feature_name)
    print("Number of outliers: ", outlier_count)
    print("Indices of outliers: ", outlier_indices)

    # outlier data points
    # NOTE(review): the ESD indices are used as lookup keys into the feature
    # column -- confirm the column is indexed 0..n-1.
    for outlier_index in outlier_indices:
        print("%8.5f " % feature_values[outlier_index])
def detect_anom_local(self, x, plot=False):
    """Detect anomalies in ``x`` from the residual of an STL decomposition.

    The seasonal and trend components returned by ``stl`` are subtracted from
    ``x`` and the remaining residual is tested with the generalized ESD test
    for up to ``int(len(x) * self.esd_rate)`` outliers.

    Args:
        x: Array-like series exposing ``.shape``.
        plot: When exactly ``True``, the series and the detected anomalies are
            passed to ``plot_verticle``.

    Returns:
        The outlier indices reported by ``pyasl.generalizedESD``.
    """
    # STL decomposition
    decomp = stl(x, np=self.period, no=self.no, nt=self.nt)
    residual = x - decomp['seasonal'] - decomp['trend']

    # pick anomalies from what is left after removing season and trend
    max_outliers = int(x.shape[0] * self.esd_rate)
    anom_ind = pyasl.generalizedESD(residual, max_outliers)[1]

    if plot is True:
        # the anomaly values are only needed for the plot
        anom_val = np.array([x[k] for k in anom_ind])
        plot_verticle([x], anom_ind, anom_val)
    return anom_ind
def outlier_detector(data):
    """Pick an outlier detector for ``data`` based on its kurtosis.

    When the (Pearson) kurtosis is below 3 the ``hampel`` filter is used, to
    minimize the masking effect. Otherwise the Rosner filter (generalized
    Extreme Studentized Deviate test, at most 10 outliers, alpha 0.05) is
    applied to minimize the swamping effect.

    Returns:
        A 2-tuple. For the hampel branch it is the pair returned by
        ``hampel(data)``; for the ESD branch it is
        ``(number_of_outliers, outlier_indices)``.
    """
    if kurtosis(data, fisher=False) < 3:
        # minimize masking effect
        hampel_vals, hampel_idx = hampel(data)
        return (hampel_vals, hampel_idx)

    # kurtosis >= 3 (or undefined): minimize swamping effect with the ESD test
    esd_result = pyasl.generalizedESD(
        data, maxOLs=10, alpha=0.05, fullOutput=True)
    return (esd_result[0], esd_result[1])
def agencyAnomalyByStartday(df_collection_SD, days):
    """Run the generalized ESD test on every agency column, per start day.

    Args:
        df_collection_SD: Sequence of decomposition results; the last element
            of each entry is the residual data frame with one column per
            agency.
        days: Start-day labels, indexed in parallel with
            ``df_collection_SD``.

    Returns:
        A list of ``(day, anomalies)`` tuples where ``anomalies`` maps each
        agency to the sorted list of outlier indices found in its residuals
        (missing values are treated as 0; at most 12 outliers, alpha 0.1).
    """
    results = []
    for position, decomposition in enumerate(df_collection_SD):
        residual_df = decomposition[-1]  # residual frame is stored last
        anomalies = {}
        for agency in residual_df.columns:
            residual_values = residual_df[agency].fillna(0).values
            esd_result = generalizedESD(residual_values, 12, alpha=0.1,
                                        fullOutput=False)
            anomalies[agency] = sorted(esd_result[1])
        results.append((days[position], anomalies))
    return results
def detect_anom_local(self, x, plot=False):
    """Detect anomalies in ``x`` from the sparse part of a robust PCA.

    ``x`` is folded into a ``(self.period, len(x) // self.period)`` matrix,
    split by ``robust_pca`` into two components ``L`` and ``S``, and the
    flattened ``S`` component is tested with the generalized ESD test for up
    to ``int(len(x) * self.esd_rate)`` outliers.

    Args:
        x: One-dimensional array whose length is a multiple of
            ``self.period``.
        plot: When exactly ``True``, ``x``, ``L`` and ``S`` are passed to
            ``plot_verticle`` together with the detected anomalies.

    Returns:
        The outlier indices reported by ``pyasl.generalizedESD``.

    Raises:
        AssertionError: If ``len(x)`` is not a multiple of ``self.period``.
    """
    n_points = x.shape[0]
    assert n_points % self.period == 0
    # Floor division keeps the reshape dimension an integer; a plain `/`
    # produces a float on Python 3, which reshape rejects.
    X = x.reshape([self.period, n_points // self.period])

    # rpca
    lamb_base = max(x.shape) ** -0.5
    L, S = robust_pca(X, lamb=lamb_base * self.lamb_rate)
    L = L.reshape([n_points])
    S = S.reshape([n_points])

    # select anomaly
    ret = pyasl.generalizedESD(S, int(n_points * self.esd_rate))
    anom_ind = ret[1]
    anom_val = np.array([x[k] for k in anom_ind])

    if plot is True:
        plot_verticle([x, L, S], anom_ind, anom_val)
    return anom_ind
def detect_anomalies(ary, freq, p):
    """Use Seasonal Decompose and ESD on residual to detect anomalies.

    Args:
        ary: Series of observations handed to ``sm.tsa.seasonal_decompose``.
        freq: Seasonal frequency; at least ``freq * 2`` observations are
            required, otherwise an empty result is returned.
        p: Significance level forwarded to ``pyasl.generalizedESD``.

    Returns:
        A tuple ``(idx_anoms, n_resid)``: ``idx_anoms`` is a list of plain
        ``int`` positions (relative to the un-cleaned residual, and therefore
        to ``ary``) flagged as anomalies, and ``n_resid`` is the number of
        non-NaN residual values that were tested.
    """
    min_events = freq * 2
    if len(ary) < min_events:
        # logger.warn is a deprecated alias; logger.warning is the supported name.
        logger.warning(
            'Only {} events for frequency of {}. Must be at least {} events.'
            .format(len(ary), freq, min_events))
        logger.warning('Generate empty response.')
        return [], 0

    # Seasonal decompose; only the residual component is used below.
    # NOTE(review): newer statsmodels releases renamed `freq` to `period` --
    # confirm against the pinned statsmodels version before upgrading.
    model = sm.tsa.seasonal_decompose(ary, freq=freq)
    resid = np.asarray(model.resid, dtype=float)

    # ESD does not handle NaNs, so test only the valid residuals but remember
    # where each of them came from. Keeping the original positions (instead of
    # adding a leading-NaN count) stays correct even if a NaN shows up
    # somewhere other than the edges of the residual.
    valid_idx = np.flatnonzero(~np.isnan(resid))
    resid_cleaned = resid[valid_idx]

    # Test for at most 20 outliers, but never ask for more outliers than the
    # sample can support (the test needs points left over after removal).
    max_outliers = min(20, len(resid_cleaned) - 2)
    if max_outliers < 1:
        return [], len(resid_cleaned)

    anomalies = pyasl.generalizedESD(resid_cleaned, max_outliers, p)

    # Map back to positions in the full residual and convert the numpy
    # integers to standard ints so the result is JSON serializable.
    idx_anoms = [int(valid_idx[i]) for i in anomalies[1]]
    return idx_anoms, len(resid_cleaned)
def getOutliersESD(data, column):
    """Return the generalized-ESD outlier indices for one column of ``data``.

    Missing values in the column are replaced by the column mean before the
    test, which looks for at most 100 outliers at the default significance
    level of ``pyasl.generalizedESD``.

    Args:
        data: Table-like object supporting ``data[column]``.
        column: Name of the column to test.

    Returns:
        The list of outlier indices (second element of the ESD result).
    """
    raw_series = data[column]
    filled_series = raw_series.fillna(value=raw_series.mean())

    # wrap the filled series in a single-column frame keyed by its own name
    single_column = pd.DataFrame(filled_series, columns=[filled_series.name])

    esd_result = pyasl.generalizedESD(single_column[column], 100,
                                      fullOutput=True)
    return esd_result[1]
# Re-form the collection: every normalized frame is seasonally decomposed and
# stored as a (seasonal, trend, residual) triple.
collection_SD = []
for df in collection_normalized:
    seasonal, trend, residual = seasonal_decompose_df(df)
    decomposition = (seasonal, trend, residual)
    collection_SD.append(decomposition)

#---- Apply G-ESD ----#
# One (day, {agency: sorted outlier indices}) entry per start day, computed on
# the residual frame (last element of each decomposition triple).
ESD_output_by_start_day = []
for day_index, start_time_df in enumerate(collection_SD):
    df = start_time_df[-1]  # residual frame
    output = {}
    for agency in df.columns:
        residual_values = df[agency].fillna(0).values
        esd_out = generalizedESD(residual_values, 12, alpha=0.1,
                                 fullOutput=False)
        output[agency] = sorted(esd_out[1])
    ESD_output_by_start_day.append((days[day_index], output))

# Pivot the per-day results: one [agency, {day: sorted outlier indices}] entry
# for each agency.
ESD_output_by_agency = []
for agency in agency_names:
    anomaly_dict = {}
    for index, ESD in enumerate(ESD_output_by_start_day):
        per_agency_anomalies = ESD[1]
        anomaly_dict[days[index]] = sorted(per_agency_anomalies[agency])
    agency_data = [agency, anomaly_dict]
    ESD_output_by_agency.append(agency_data)
def rmOutliers(myfile, chrName, method, pvalue, miniSize, path):
    """Filter length outliers out of a tab-separated SV table and write the rest.

    The log-transformed ``length`` column is screened twice: Tukey's fences
    give the number of candidate outliers, which is then used as the maximum
    outlier count for the generalized ESD test. Rows at least ``miniSize``
    long are written to ``<path>/<chrName>.<method>.rmOutliers.txt``; when the
    least extreme ESD outlier lies above the lower Tukey fence, rows at or
    above that outlier's length are dropped as well.

    Args:
        myfile: Tab-separated file with the columns chr, start, end, length
            (no header line).
        chrName: Chromosome to process, or ``'all'`` for every row.
        method: Label used only in the output file name.
        pvalue: Significance level for the ESD test (converted with ``float``).
        miniSize: Minimum length to keep (converted with ``int``).
        path: Existing output directory.

    Raises:
        SystemExit: With code 1 if ``path`` is not a directory or no row
            matches ``chrName``.
    """
    pvalue = float(pvalue)
    miniSize = int(miniSize)
    path = os.path.abspath(path)
    if not os.path.isdir(path):
        print(
            '\nSomething is wrong with your output directory! Please check!\n')
        sys.exit(1)

    fd = pd.read_csv(myfile, sep='\t', names=['chr', 'start', 'end', 'length'])
    if chrName != 'all':
        sub = fd[fd['chr'] == chrName].reset_index(drop=True)
    else:
        sub = fd
    if len(sub) < 1:
        print(
            '\nOpps! It seems that there is no SV in the input chromosome or the input chrName does not match with the chr in the file!\n'
        )
        sys.exit(1)

    lengths = sub['length']
    x = np.array([log(float(value)) for value in lengths])

    # Tukey's fences in log space: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
    Q3 = np.percentile(x, 75)
    Q1 = np.percentile(x, 25)
    upBound = 2.5 * Q3 - 1.5 * Q1
    lowBound = 2.5 * Q1 - 1.5 * Q3
    tukey_count = int(np.count_nonzero((x < lowBound) | (x > upBound)))

    # Apply the generalized ESD, testing for at least two outliers but never
    # for more than the sample can support.
    max_outliers = min(max(tukey_count, 2), len(x) - 2)
    if max_outliers >= 1:
        r = pyasl.generalizedESD(x, max_outliers, pvalue, fullOutput=True)
    else:
        r = (0, [])

    print("The checked chromosome is: ", chrName)
    print("Number of outliers found by Tukey's method: ", tukey_count)
    print("Number of outliers found by generalized ESD: ", r[0])

    keep = lengths >= miniSize
    if r[0] > 0:
        # The last ESD index belongs to the least extreme outlier.
        last_outlier_log = x[r[1][-1]]
        smallest = exp(last_outlier_log)
        print("The samllest outlier is ", smallest)
        # Compare in log space: lowBound is a log-length, so testing the raw
        # length against it would mix units.
        if last_outlier_log > lowBound:
            keep = keep & (lengths < smallest)

    out_name = os.path.join(path, '%s.%s.rmOutliers.txt' % (chrName, method))
    with open(out_name, 'w') as fh:
        for row in sub[keep].itertuples(index=False):
            fh.write('%s\t%s\t%s\t%s\n' % tuple(row))
def check_flights(trvl_dest, dept, rtn):
    """Scrape Google Flights fares and text the cheapest one for a destination.

    Starting from the ``dept``/``rtn`` dates, the search page is loaded 60
    times, moving both dates four days forward after each load. Every result
    card contributes a destination, price, stop count and travel time. The
    fares for ``trvl_dest`` are then screened with the generalized ESD test
    (results are printed) and every fare equal to the minimum price is sent
    as a text message through Gmail's SMTP server.

    Args:
        trvl_dest: Destination name exactly as it appears on the result cards.
        dept: First departure date, formatted ``YYYY-MM-DD``.
        rtn: First return date, formatted ``YYYY-MM-DD``.
    """
    # replace the path for the web driver
    # might need to download a more updated version of the chromedriver
    chrome_path = 'C:\\Users\\stark\\Downloads\\chromedriver'
    browser = webdriver.Chrome(chrome_path)

    # initialize lists to store retrieved information
    dept_date, rtn_date, destination, price, stops, flttime = [], [], [], [], [], []

    start_sat_date = datetime.strptime(dept, '%Y-%m-%d')
    end_sat_date = datetime.strptime(rtn, '%Y-%m-%d')

    try:
        for _ in range(60):
            sat_start = start_sat_date.strftime('%Y-%m-%d')
            sat_end = end_sat_date.strftime('%Y-%m-%d')

            satz = "https://www.google.com/flights#f=0&flt=/m/0d6lp.r/m/02j9z." + sat_start + "*r/m/02j9z./m/0d6lp." + sat_end + ";c:USD;e:1;sd:1;t:e"
            browser.get(satz)

            # have the program sleep for a random time
            sleep(np.random.randint(3, 7))

            # call BeautifulSoup to extract the data from the html webpage
            soupit = BeautifulSoup(browser.page_source, "html5lib")
            cardz = soupit.select('div[class*=tsAU4e ]')

            for card in cardz:
                # destination, price, number of stops and flight duration
                destination.append(card.select('h3[class*="W6bZuc YMlIz"]')[0].text)
                price.append(card.select('div[class*=MJg7fb]')[0].text)
                stops.append(card.select('span[class*=nx0jzf]')[0].text)
                flttime.append(card.select('span[class*=Xq1DAb]')[0].text)
                # the departure and return dates searched for
                dept_date.append(sat_start)
                rtn_date.append(sat_end)

            # update the departure and return dates
            start_sat_date = start_sat_date + timedelta(days=4)
            end_sat_date = end_sat_date + timedelta(days=4)
    finally:
        # always release the browser, even when scraping fails half-way
        browser.quit()

    # strip ',', 'Great value' and the '$'-sign from the prices
    prices2 = [
        int(raw.replace(',', '').replace('Great value', '').replace('$', ''))
        for raw in price
    ]

    # Exactly one stop count per card so the column stays aligned with the
    # other lists (flattening all regex matches drops or duplicates rows).
    # NOTE(review): descriptions without a digit are recorded as '0' on the
    # assumption that they denote non-stop flights -- confirm.
    re_stops = r'[0-9]{1}'
    stop_counts = []
    for description in stops:
        found = re.search(re_stops, description)
        stop_counts.append(found.group() if found else '0')

    df_flts = pd.DataFrame(
        list(zip(dept_date, rtn_date, destination, prices2, stop_counts, flttime)),
        columns=['DepartureDate', 'ReturnDate', 'Destination', 'PriceUSD',
                 'Stops', 'TravelTime'])

    # sub-select dataframe for destination of interest (a boolean mask avoids
    # building a query string out of the destination name)
    temp = df_flts[df_flts['Destination'] == trvl_dest]

    # generalizedESD for outlier determination
    r = pyasl.generalizedESD(temp['PriceUSD'], 3, 0.025, fullOutput=True)
    print('Total Outliers:', r[0])

    # ESD reports positions, while `temp` keeps the row labels of df_flts, so
    # the lookup has to be positional.
    out_dates = [temp['DepartureDate'].iloc[i] for i in sorted(r[1])]
    print(out_dates)

    print(' R Lambda')
    for i in range(len(r[2])):
        print('%2d %8.5f %8.5f' % ((i + 1), r[2][i], r[3][i]))

    # sub-select the rows carrying the minimum price (numeric comparison)
    my_min = min(temp['PriceUSD'])
    df_lowest = temp[temp['PriceUSD'] == my_min]

    # one line per instance of the cheapest flight
    my_strings = []
    for i in range(len(df_lowest)):
        ugh = "To " + trvl_dest + " on " + df_lowest['DepartureDate'].iloc[i] + " for $" + str(df_lowest['PriceUSD'].iloc[i])
        my_strings.append(ugh)

    # concatenate strings together to send as text message
    new = '\n'.join(my_strings)
    message = "ALERT!!!" + "\n" + new

    # Establish a secure session with gmail's outgoing SMTP server using your
    # gmail account; the context manager closes the connection afterwards.
    with smtplib.SMTP("smtp.gmail.com", 587) as server:
        server.starttls()
        # log into server with credentials
        server.login('Your Email', "Your Password")
        # Send text message through SMS gateway of destination number
        server.sendmail('Flight Updater', '##########@mms.att.net', message)
index=df_outlier.column_yearmonth) df_outlier = df_outlier.fillna(0) #baseline model df_zscore = Zscore(df_outlier) #for using grubb model num_outlier = len(df_zscore) if num_outlier == 0: continue elif num_outlier == 1: num_outlier = 2 else: num_outlier = len(df_zscore) + 1 out_Grubb = pyasl.generalizedESD(df_outlier, num_outlier, alpha=0.05) out_Grubb[1].sort() while 0 in out_Grubb[1]: out_Grubb[1].remove(0) while 1 in out_Grubb[1]: out_Grubb[1].remove(1) #taking data only where there is an outlier if len(out_Grubb[1]) == 0: continue else: #collecting data till first outlier detected Grubb_list = out_Grubb[1] first_outlier = df_outlier[0:Grubb_list[0]]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PyAstronomy import pyasl

# read the dataframe; 'All Subject Areas' is the name of the worksheet in the
# template (the keyword is `sheet_name`; the old `sheetname` spelling was
# removed from pandas)
df = pd.read_excel('nameOfYourFile.xlsx', sheet_name='All Subject Areas')

# make a dataframe based on topic--'Subject Area' is the column name in the template
dfSubject = df[df['Subject Area'] == 'A subject area']

# make an array of the FWCIs--'FWCI' is the column name in the template
# (`.values` replaces `.as_matrix()`, which was removed from pandas)
dfSubjectFwci = dfSubject[['FWCI']].values

# make an array of the publications--'Scholarly Output' is the column name in the template
dfSubjectPubs = dfSubject[['Scholarly Output']].values

# use pyasl to identify outliers in FWCI; the single-column array is flattened
# so the reported indices are row positions
r = pyasl.generalizedESD(dfSubjectFwci.ravel(), 10, 0.05, fullOutput=True)
print('Number of outliers: ', r[0])
print('indices of outliers: ', r[1])

# get the median of the publications
np.median(dfSubjectPubs)
for i in lengthsi: if i < loweri or i > upperi: branches += 1 for line in T.get_nonterminals(): if i == line.branch_length: tmp = list() for l in line.get_terminals(): tmp.append(l.name) if len(tmp) < args.tipsMax: internals = internals + tmp else: branches -= 1 # gESD if args.method == "gesd": gesd = pyasl.generalizedESD(lengths, m, t) for j in gesd[1]: toPrune.append(tips[j]) if args.internal: internals = list() gesdi = pyasl.generalizedESD(lengthsi, mi, ti) for j in gesdi[1]: i = lengthsi[j] branches += 1 for line in T.get_nonterminals(): if i == line.branch_length: tmp = list() for l in line.get_terminals(): tmp.append(l.name) if len(tmp) < args.tipsMax: internals = internals + tmp