Beispiel #1
0
def detect_anomalies(ary, freq, p):
    """Detect anomalies by running generalized ESD on seasonal-decompose residuals.

    Parameters
    ----------
    ary : sequence of numbers
        Observations handed to ``sm.tsa.seasonal_decompose``.
    freq : int
        Seasonal period; at least ``2 * freq`` observations are required.
    p : float
        Significance level passed to ``pyasl.generalizedESD``.

    Returns
    -------
    tuple
        ``(indices, n_tested)``: the anomaly positions (plain ``int``, offset
        by the number of leading NaN residuals) and the number of non-NaN
        residuals that were tested.  ``([], 0)`` when there is too little data.
    """
    required = freq * 2
    if len(ary) < required:
        # Logger.warn is a deprecated alias (removed in Python 3.13).
        logger.warning('Only {} events for frequency of {}. Must be at least {} events.'.format(len(ary), freq, required))
        logger.warning('Generate empty response.')
        return [], 0

    # Seasonal decompose; only the residual component is used.
    # NOTE(review): newer statsmodels releases spell this keyword `period` --
    # confirm against the pinned statsmodels version.
    resid = sm.tsa.seasonal_decompose(ary, freq=freq).resid

    # Count leading NaNs: they are dropped before ESD, so the indices that
    # ESD reports have to be shifted back by this amount.
    dropped = 0
    for val in resid:
        if not np.isnan(val):
            break
        dropped += 1

    # Remove NaNs, as ESD doesn't handle them well
    resid_cleaned = [val for val in resid if not np.isnan(val)]

    # Use ESD to detect anomalies (at most 20 of them)
    anomalies = pyasl.generalizedESD(resid_cleaned, 20, p)

    # Shift to indices of the original series; int() accepts NumPy integers
    # as well as plain ints and yields JSON-serializable values.
    idx_anoms = [int(idx) + dropped for idx in anomalies[1]]

    return idx_anoms, len(resid_cleaned)
Beispiel #2
0
 def sanity_ESDexample(self):
   """
     Check the example for the generalized ESD.

     Runs `pyasl.generalizedESD` on the NIST handbook data set, prints the
     number of outliers, their indices and the R/Lambda table, and plots
     the data with the detected outliers marked.
   """
   import numpy as np
   import matplotlib.pylab as plt
   from PyAstronomy import pyasl

   # Data given at:
   # http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm
   raw = ("-0.25 0.68 0.94 1.15 1.20 1.26 1.26 1.34 1.38 1.43 1.49 1.49 "
          "1.55 1.56 1.58 1.65 1.69 1.70 1.76 1.77 1.81 1.91 1.94 1.96 "
          "1.99 2.06 2.09 2.10 2.14 2.15 2.23 2.24 2.26 2.35 2.37 2.40 "
          "2.47 2.54 2.62 2.64 2.90 2.92 2.92 2.93 3.21 3.26 3.30 3.59 "
          "3.68 4.30 4.64 5.34 5.42 6.01")
   # A list comprehension (rather than `map`) gives a float array on both
   # Python 2 and Python 3, where `map` returns a lazy iterator.
   x = np.array([float(token) for token in raw.split()])

   # Apply the generalized ESD
   r = pyasl.generalizedESD(x, 10, 0.05, fullOutput=True)

   # Single-argument print calls produce the same text on Python 2 and 3.
   print("Number of outliers:  %s" % (r[0],))
   print("Indices of outliers:  %s" % (r[1],))
   print("        R      Lambda")
   for rank, (r_val, lam) in enumerate(zip(r[2], r[3]), 1):
     print("%2d  %8.5f  %8.5f" % (rank, r_val, lam))

   # Plot the "data"
   plt.plot(x, 'b.')
   # and mark the outliers.
   for idx in r[1][:r[0]]:
     plt.plot(idx, x[idx], 'rp')
def outliers_finder(llist):
    """Return the values of `llist` flagged as outliers by generalized ESD.

    Up to a quarter of the values (at least one) may be flagged; the test is
    run at significance level 0.45.  An empty input has no outliers and
    yields an empty list.
    """
    if len(llist) == 0:
        return []
    # Floor division: `/` yields a float on Python 3, which is not a valid
    # outlier count for generalizedESD.
    number_of_outliers = len(llist) // 4 if len(llist) >= 4 else 1
    r = pyasl.generalizedESD(numpy.array(llist),
                             number_of_outliers,
                             0.45,
                             fullOutput=True)
    # r[1] holds the positions of the outliers within the input.
    return [llist[i] for i in r[1]]
Beispiel #4
0
def res_gesd(res, maxOLs = 15, alpha = 0.05):
    '''
    Flag outliers in a residual series with the generalized extreme
    studentized deviate (GESD) test.

    Parameters
    ----------
    res: list
        time series of residuals from a loess curve fit

    maxOLs: int
        upper bound on the number of outliers GESD may report

    alpha: decimal
        significance level of the test

    Returns
    -------
    list
        One entry per element of `res`: 1 where GESD reported the position
        as an outlier, 0 everywhere else.
    '''
    # second element of the result is the list of outlier positions
    _, flagged = pyasl.generalizedESD(res, maxOLs = maxOLs, alpha = alpha,
                                      fullOutput=False)
    hits = {int(pos) for pos in flagged}

    return [1 if pos in hits else 0 for pos in range(len(res))]
Beispiel #5
0
def gesdFusion(sensor, maxOLs=6):
    """Blank out generalized-ESD outliers in every column, then fuse by mean.

    For each column of `sensor`, up to `maxOLs` outliers are located with
    `pyasl.generalizedESD` and overwritten with NaN.  `sensor` is modified
    in place.  Returns whatever ``basicFusion(sensor, 'mean')`` returns.
    """
    for column in sensor:
        _, outlier_positions = pyasl.generalizedESD(sensor[column], maxOLs)

        for outlier in outlier_positions:
            # np.nan: the `np.NaN` alias was removed in NumPy 2.0.
            # NOTE(review): if `sensor` is a pandas DataFrame this is a
            # chained, label-based assignment -- confirm the container type.
            sensor[column][outlier] = np.nan

    return basicFusion(sensor, 'mean')
    def analyze(self, feature_name):
        """Print a generalized-ESD outlier report for one feature.

        Looks for up to 100 outliers in ``self.df_variable[feature_name]`` at
        significance level 0.1 and prints their count, their indices and the
        value found at each index.
        """
        column = self.df_variable[feature_name]
        report = pyasl.generalizedESD(column, 100, 0.1, fullOutput=True)
        n_found, positions = report[0], report[1]

        print('Outlier Analysis for : ' + feature_name)
        print("Number of outliers: ", n_found)
        print("Indices of outliers: ", positions)

        # one line per outlier data point
        for pos in positions:
            print("%8.5f  " % column[pos])
Beispiel #7
0
    def detect_anom_local(self, x, plot=False):
        """Return the indices of anomalies found in the STL residual of `x`.

        `x` is decomposed with ``stl`` (using ``self.period``, ``self.no`` and
        ``self.nt``); the seasonal and trend parts are subtracted and the
        generalized ESD test is run on what is left, allowing at most
        ``int(len(x) * self.esd_rate)`` outliers.  When `plot` is exactly
        ``True`` the series and its anomalies are passed to ``plot_verticle``.
        """
        # STL decomposition
        decomp = stl(x, np=self.period, no=self.no, nt=self.nt)
        residual = x - decomp['seasonal'] - decomp['trend']

        # pick anomaly
        max_outliers = int(x.shape[0] * self.esd_rate)
        anom_ind = pyasl.generalizedESD(residual, max_outliers)[1]

        if plot is True:
            # the anomaly values are only needed for the plot
            anom_val = np.array([x[k] for k in anom_ind])
            plot_verticle([x], anom_ind, anom_val)
        return anom_ind
Beispiel #8
0
def outlier_detector(data):
    """Choose an outlier detector from the kurtosis of `data` and run it.

    With a Pearson kurtosis below 3 the Hampel filter is used and its
    two results are returned as a tuple.  Otherwise the generalized ESD
    (Rosner) test runs with at most 10 outliers at alpha 0.05 and the tuple
    holds the first two entries of its result (outlier count, indices).
    """
    if kurtosis(data, fisher=False) < 3:
        # low kurtosis: minimize the masking effect
        first, second = hampel(data)
        return (first, second)

    # kurtosis >= 3: minimize the swamping effect with the Rosner filter,
    # i.e. the (Extreme Studentized Deviate) ESD test
    esd = pyasl.generalizedESD(data, maxOLs=10, alpha=0.05, fullOutput=True)
    return (esd[0], esd[1])
def agencyAnomalyByStartday(df_collection_SD, days):
    """Run generalized ESD on every agency column of every residual frame.

    The last element of each entry in `df_collection_SD` is taken as the
    residual frame.  Its NaNs are replaced by 0 and each column is tested
    for up to 12 outliers at alpha 0.1.

    Returns a list with one ``(days[i], {agency: sorted outlier indices})``
    tuple per entry of `df_collection_SD`.
    """
    results = []
    for position, components in enumerate(df_collection_SD):
        residual_df = components[-1]

        per_agency = {
            agency: sorted(
                generalizedESD(residual_df[agency].fillna(0).values,
                               12,
                               alpha=0.1,
                               fullOutput=False)[1])
            for agency in residual_df.columns
        }

        results.append((days[position], per_agency))

    return results
Beispiel #10
0
    def detect_anom_local(self, x, plot=False):
        """Return the indices of anomalies found in the sparse RPCA part of `x`.

        `x` (whose length must be a multiple of ``self.period``) is reshaped
        to ``period`` rows, split by ``robust_pca`` into a low-rank part L and
        a sparse part S, and the generalized ESD test is run on the flattened
        S, allowing at most ``int(len(x) * self.esd_rate)`` outliers.  When
        `plot` is exactly ``True`` the series, L, S and the anomalies are
        passed to ``plot_verticle``.
        """
        n_points = x.shape[0]
        assert n_points % self.period == 0

        # Floor division: `/` gives a float on Python 3, which reshape rejects.
        X = x.reshape([self.period, n_points // self.period])

        # rpca
        lamb_base = max(x.shape) ** -0.5
        L, S = robust_pca(X, lamb=lamb_base * self.lamb_rate)
        L = L.reshape([n_points])
        S = S.reshape([n_points])

        # select anomaly
        ret = pyasl.generalizedESD(S, int(n_points * self.esd_rate))
        anom_ind = ret[1]

        if plot is True:
            # the anomaly values are only needed for the plot
            anom_val = np.array([x[k] for k in anom_ind])
            plot_verticle([x, L, S], anom_ind, anom_val)

        return anom_ind
Beispiel #11
0
def detect_anomalies(ary, freq, p):
    """Detect anomalies by running generalized ESD on seasonal-decompose residuals.

    Parameters
    ----------
    ary : sequence of numbers
        Observations handed to ``sm.tsa.seasonal_decompose``.
    freq : int
        Seasonal period; at least ``2 * freq`` observations are required.
    p : float
        Significance level passed to ``pyasl.generalizedESD``.

    Returns
    -------
    tuple
        ``(indices, n_tested)``: the anomaly positions (plain ``int``, offset
        by the number of leading NaN residuals) and the number of non-NaN
        residuals that were tested.  ``([], 0)`` when there is too little data.
    """
    required = freq * 2
    if len(ary) < required:
        # Logger.warn is a deprecated alias (removed in Python 3.13).
        logger.warning(
            'Only {} events for frequency of {}. Must be at least {} events.'.
            format(len(ary), freq, required))
        logger.warning('Generate empty response.')
        return [], 0

    # Seasonal decompose; only the residual component is used.
    # NOTE(review): newer statsmodels releases spell this keyword `period` --
    # confirm against the pinned statsmodels version.
    resid = sm.tsa.seasonal_decompose(ary, freq=freq).resid

    # Count leading NaNs: they are dropped before ESD, so the indices that
    # ESD reports have to be shifted back by this amount.
    dropped = 0
    for val in resid:
        if not np.isnan(val):
            break
        dropped += 1

    # Remove NaNs, as ESD doesn't handle them well
    resid_cleaned = [val for val in resid if not np.isnan(val)]

    # Use ESD to detect anomalies (at most 20 of them)
    anomalies = pyasl.generalizedESD(resid_cleaned, 20, p)

    # Shift to indices of the original series; int() accepts NumPy integers
    # as well as plain ints and yields JSON-serializable values.
    idx_anoms = [int(idx) + dropped for idx in anomalies[1]]

    return idx_anoms, len(resid_cleaned)
Beispiel #12
0
def getOutliersESD(data, column):
    """Return generalized-ESD outlier indices for one column of `data`.

    Missing values in the column are replaced by the column mean before the
    test, which looks for up to 100 outliers.
    """
    series = data[column]
    filled = series.fillna(value=series.mean())
    # wrap the filled series in a single-column frame keyed by its own name
    frame = pd.DataFrame(filled, columns=[filled.name])
    result = pyasl.generalizedESD(frame[column], 100, fullOutput=True)
    return result[1]
# Decompose every item of collection_normalized; each entry of collection_SD
# is the (seasonal, trend, residual) tuple returned for one item.
collection_SD = [
]  # re-formed collection with normalization and seasonal decomposition
for df in collection_normalized:
    (seasonal, trend, residual) = seasonal_decompose_df(df)
    collection_SD.append((seasonal, trend, residual))

#---- Apply G-ESD ----#

# One (day, {agency: sorted outlier indices}) tuple per entry of collection_SD.
ESD_output_by_start_day = []
for day_index, start_time_df in enumerate(collection_SD):
    df = start_time_df[-1]  # get residual df

    output = {}
    for col_index, agency in enumerate(df.columns):
        # NaNs are replaced by 0 before the test; at most 12 outliers are
        # reported per column at alpha 0.1.
        esd_out = generalizedESD(df[agency].fillna(0).values,
                                 12,
                                 alpha=0.1,
                                 fullOutput=False)
        output[agency] = list(sorted(esd_out[1]))

    ESD_output_by_start_day.append((days[day_index], output))

# ESD_output_by_start_day holds one (day, dict) tuple per start day; each dict
# maps an agency to the anomalies detected for it.

# Pivot the per-day results into one [agency, {day: sorted outlier indices}]
# list per agency.
ESD_output_by_agency = []
for agency in agency_names:
    agency_data = [agency]
    anomaly_dict = {}
    for index, ESD in enumerate(ESD_output_by_start_day):
        anomaly_dict[days[index]] = list(sorted(ESD[1][agency]))
    agency_data.append(anomaly_dict)
    ESD_output_by_agency.append(agency_data)
Beispiel #14
0
def _write_sv_rows(out_file, sub, min_size, max_size=None):
    """Write the rows of `sub` with min_size <= length (< max_size, if given) as TSV."""
    with open(out_file, 'w') as fh:
        for chrom, start, end, length in zip(sub['chr'], sub['start'],
                                             sub['end'], sub['length']):
            if length >= min_size and (max_size is None or length < max_size):
                fh.write('%s\t%s\t%s\t%s\n' % (chrom, start, end, length))


def rmOutliers(myfile, chrName, method, pvalue, miniSize, path):
    """Filter structural variants by length and write the survivors to a file.

    The tab-separated input (columns chr, start, end, length) is optionally
    restricted to one chromosome.  Outliers are searched on log(length):
    Tukey's fences give the number of candidates, which is then used (with a
    minimum of 2) as the outlier limit of the generalized ESD test.  Rows are
    written to ``<path>/<chrName>.<method>.rmOutliers.txt`` when their length
    is at least `miniSize` and, if the last outlier reported by ESD lies above
    the lower Tukey fence, strictly below that outlier's length.

    Parameters
    ----------
    myfile : str
        Path of the tab-separated input file.
    chrName : str
        Chromosome to process, or ``'all'`` for every row.
    method : str
        Label used only in the output file name.
    pvalue : float or str
        Significance level for the generalized ESD test.
    miniSize : int or str
        Minimum length a row must have to be written.
    path : str
        Existing output directory.

    Exits the process with status 1 when `path` is not a directory or when
    no row matches `chrName`.
    """
    pvalue = float(pvalue)
    miniSize = int(miniSize)
    path = os.path.abspath(path)
    if not os.path.isdir(path):
        print(
            '\nSomething is wrong with your output directory! Please check!\n')
        sys.exit(1)

    fd = pd.read_csv(myfile, sep='\t', names=['chr', 'start', 'end', 'length'])

    if chrName != 'all':
        sub = fd[fd['chr'] == chrName].reset_index(drop=True)
    else:
        sub = fd
    if len(sub) < 1:
        print(
            '\nOpps! It seems that there is no SV in the input chromosome or the input chrName does not match with the chr in the file!\n'
        )
        sys.exit(1)

    # All outlier statistics are computed on the log of the lengths.
    x = np.array([log(float(v)) for v in sub['length']])

    # Tukey's fences: Q3 + 1.5 * IQR and Q1 - 1.5 * IQR.
    Q3 = np.percentile(x, 75)
    Q1 = np.percentile(x, 25)
    upBound = 2.5 * Q3 - 1.5 * Q1
    lowBound = 2.5 * Q1 - 1.5 * Q3
    n_tukey = int(np.count_nonzero((x < lowBound) | (x > upBound)))

    # Apply the generalized ESD, allowing at least two outliers.
    r = pyasl.generalizedESD(x, max(n_tukey, 2), pvalue, fullOutput=True)
    print("The checked chromosome is: ", chrName)
    print("Number of outliers found by Tukey's method: ", n_tukey)
    print("Number of outliers found by generalized ESD: ", r[0])

    max_size = None
    if r[0] > 0:
        # Last outlier reported by ESD, i.e. the least extreme one.
        log_smallest = x[r[1][-1]]
        smallest = exp(log_smallest)

        print("The samllest outlier is ", smallest)

        # Compare in log space: lowBound is a bound on log(length), so it
        # must not be compared with the back-transformed length.
        if log_smallest > lowBound:
            max_size = smallest

    out_file = '%s/%s.%s.rmOutliers.txt' % (path, chrName, method)
    _write_sv_rows(out_file, sub, miniSize, max_size)
def check_flights(trvl_dest, dept, rtn):
    """Scrape Google Flights, report price outliers and text the cheapest fares.

    Starting from the given dates, 60 departure/return pairs (each 4 days
    later than the previous one) are loaded in Chrome through Selenium and
    every result card is parsed.  For `trvl_dest` the prices are run through
    the generalized ESD test and the result is printed; the departures with
    the lowest price are then mailed through Gmail's SMTP server.

    Parameters
    ----------
    trvl_dest : str
        Destination name exactly as it appears on the result cards.
    dept, rtn : str
        First departure and return date, formatted ``YYYY-MM-DD``.
    """
    # Parse the dates before launching the browser so that a malformed date
    # cannot leave a stray Chrome process behind.
    start_sat_date = datetime.strptime(dept, '%Y-%m-%d')
    end_sat_date = datetime.strptime(rtn, '%Y-%m-%d')

    # initialize lists to store retrieved information
    dept_date, rtn_date, destination, price, stops, flttime = [], [], [], [], [], []

    # replace the path for the web driver
    # might need to download a more updated version of the chromedriver
    chrome_path = 'C:\\Users\\stark\\Downloads\\chromedriver'
    browser = webdriver.Chrome(chrome_path)
    try:
        for _ in range(60):
            sat_start = start_sat_date.strftime('%Y-%m-%d')
            sat_end = end_sat_date.strftime('%Y-%m-%d')
            satz = "https://www.google.com/flights#f=0&flt=/m/0d6lp.r/m/02j9z." + sat_start + "*r/m/02j9z./m/0d6lp." + sat_end + ";c:USD;e:1;sd:1;t:e"

            browser.get(satz)
            # have the program sleep for a random time
            sleep(np.random.randint(3, 7))
            # call BeautifulSoup to extract the data from the html webpage
            soupit = BeautifulSoup(browser.page_source, "html5lib")
            for card in soupit.select('div[class*=tsAU4e ]'):
                # destination, price, number of stops, duration of the flight
                dest = card.select('h3[class*="W6bZuc YMlIz"]')[0].text
                prc = card.select('div[class*=MJg7fb]')[0].text
                stp = card.select('span[class*=nx0jzf]')[0].text
                flt = card.select('span[class*=Xq1DAb]')[0].text
                destination.append(dest)
                price.append(prc)
                dept_date.append(sat_start)
                rtn_date.append(sat_end)
                stops.append(stp)
                flttime.append(flt)

            # update the departure and return dates
            start_sat_date = start_sat_date + timedelta(days=4)
            end_sat_date = end_sat_date + timedelta(days=4)
    finally:
        # always release the browser, even when scraping fails
        browser.quit()

    # strip ',', 'Great value' and the '$'-sign from the price text
    prices2 = [
        int(text.replace(',', '').replace('Great value', '').replace('$', ''))
        for text in price
    ]

    # Exactly one stop count per flight keeps the columns aligned: a label
    # without a digit (presumably "Nonstop" -- confirm against the page) is
    # recorded as '0' instead of being dropped.
    stops_clean = []
    for text in stops:
        digit = re.search(r'[0-9]', text)
        stops_clean.append(digit.group() if digit else '0')

    df_flts = pd.DataFrame(list(zip(dept_date, rtn_date, destination, prices2, stops_clean, flttime)), columns =['DepartureDate', 'ReturnDate', 'Destination', 'PriceUSD', 'Stops', 'TravelTime'])

    # sub-select dataframe for destination of interest (a boolean mask also
    # works when the name contains quote characters)
    temp = df_flts[df_flts['Destination'] == trvl_dest]

    # generalizedESD for outlier determination
    r = pyasl.generalizedESD(temp['PriceUSD'], 3, 0.025, fullOutput=True)
    print('Total Outliers:', r[0])
    # r[1] holds positions within `temp`, so look the dates up by position
    # rather than by the labels inherited from df_flts.
    departure_dates = temp['DepartureDate']
    out_dates = [departure_dates.iloc[i] for i in sorted(r[1])]
    print(out_dates)
    print('       R      Lambda')
    for rank, (r_val, lam) in enumerate(zip(r[2], r[3]), 1):
        print('%2d %8.5f %8.5f' % (rank, r_val, lam))

    # find the minimum price of the dataframe
    my_min = min(temp['PriceUSD'])
    # subselect destination dataframe with minimum price (numeric comparison)
    df_lowest = temp[temp['PriceUSD'] == my_min]
    # save all instances of the cheap flight to a list of strings
    my_strings = [
        "To " + trvl_dest + " on " + day + " for $" + str(cost)
        for day, cost in zip(df_lowest['DepartureDate'], df_lowest['PriceUSD'])
    ]
    # concatenate strings together to send as text message
    new = '\n'.join(my_strings)
    message = "ALERT!!!" + "\n" + new

    # Establish a secure session with gmail's outgoing SMTP server using your gmail account
    server = smtplib.SMTP("smtp.gmail.com", 587)
    try:
        # log into server with credentials
        server.starttls()
        server.login( 'Your Email', "Your Password" )

        # Send text message through SMS gateway of destination number
        server.sendmail( 'Flight Updater', '##########@mms.att.net', message)
    finally:
        # close the SMTP session whether or not sending succeeded
        server.quit()
    
    
        
    
    
                                   index=df_outlier.column_yearmonth)
            df_outlier = df_outlier.fillna(0)

            #baseline model
            df_zscore = Zscore(df_outlier)
            #for using grubb model
            num_outlier = len(df_zscore)
            if num_outlier == 0:
                continue
            elif num_outlier == 1:
                num_outlier = 2
            else:
                num_outlier = len(df_zscore) + 1

            out_Grubb = pyasl.generalizedESD(df_outlier,
                                             num_outlier,
                                             alpha=0.05)
            out_Grubb[1].sort()
            while 0 in out_Grubb[1]:
                out_Grubb[1].remove(0)
            while 1 in out_Grubb[1]:
                out_Grubb[1].remove(1)

            #taking data only where there is an outlier
            if len(out_Grubb[1]) == 0:
                continue
            else:

                #collecting data till first outlier detected
                Grubb_list = out_Grubb[1]
                first_outlier = df_outlier[0:Grubb_list[0]]
Beispiel #17
0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PyAstronomy import pyasl

#read the dataframe; 'All Subject Areas' is the name of the worksheet in the template
#(`sheet_name` is the current keyword; the old `sheetname` spelling is no longer accepted by pandas)
df = pd.read_excel('nameOfYourFile.xlsx', sheet_name='All Subject Areas')

#make a dataframe based on topic--'Subject Area' is the column name in the template
dfSubject = df[df['Subject Area'] == 'A subject area']

#make an array of the FWCIs--'FWCI' is the column name in the template
#(`.values` replaces `.as_matrix()`, which was removed in pandas 1.0)
dfSubjectFwci = dfSubject[['FWCI']].values

#make an array of the publications--'Scholarly Output' is the column name in the template
dfSubjectPubs = dfSubject[['Scholarly Output']].values

#use pyasl to identify outliers in FWCI; the FWCI array built above is
#flattened to 1-D (the previous argument was a name that is never defined)
r = pyasl.generalizedESD(dfSubjectFwci.ravel(), 10, 0.05, fullOutput=True)
print('Number of outliers: ', r[0])
print('indices of outliers: ', r[1])

#get the median of the publications (printed, since a bare expression is discarded in a script)
print('median of publications: ', np.median(dfSubjectPubs))
        for i in lengthsi:
            if i < loweri or i > upperi:
                branches += 1
                for line in T.get_nonterminals():
                    if i == line.branch_length:
                        tmp = list()
                        for l in line.get_terminals():
                            tmp.append(l.name)
                        if len(tmp) < args.tipsMax:
                            internals = internals + tmp
                        else:
                            branches -= 1

# gESD
if args.method == "gesd":
    gesd = pyasl.generalizedESD(lengths, m, t)
    for j in gesd[1]:
        toPrune.append(tips[j])
    if args.internal:
        internals = list()
        gesdi = pyasl.generalizedESD(lengthsi, mi, ti)
        for j in gesdi[1]:
            i = lengthsi[j]
            branches += 1
            for line in T.get_nonterminals():
                if i == line.branch_length:
                    tmp = list()
                    for l in line.get_terminals():
                        tmp.append(l.name)
                    if len(tmp) < args.tipsMax:
                        internals = internals + tmp