def transform(self, temperatures_xray, n_burn_in, n_lookahead, skf_is):
    """Use world temps as features."""
    # Set all temps on world map as features
    all_temps = temperatures_xray['tas'].values
    time_steps, lats, lons = all_temps.shape
    all_temps = all_temps.reshape((time_steps, lats * lons))
    wC = 15
    rolling_std = pd.rolling_std(pd.DataFrame(all_temps), window=wC, min_periods=1).values
    rolling_std = rolling_std[n_burn_in:-n_lookahead, :]
    rolling_quantileHigh = pd.rolling_quantile(pd.DataFrame(all_temps), window=wC, min_periods=1, quantile=0.99).values
    rolling_quantileHigh = rolling_quantileHigh[n_burn_in:-n_lookahead, :]
    rolling_quantileLow = pd.rolling_quantile(pd.DataFrame(all_temps), window=wC, min_periods=1, quantile=0.01).values
    rolling_quantileLow = rolling_quantileLow[n_burn_in:-n_lookahead, :]
    all_temps = all_temps[n_burn_in:-n_lookahead, :]
    return np.hstack((all_temps, rolling_std, rolling_quantileHigh, rolling_quantileLow))
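The pd.rolling_std / pd.rolling_quantile helpers used above were deprecated in pandas 0.18 and removed in 1.0. A minimal sketch of the same feature block with the method-chain rolling API, reusing the names from the example (not part of the original source):

# Hedged sketch, modern pandas: equivalent of the deprecated calls above.
temps_df = pd.DataFrame(all_temps)
rolling_std = temps_df.rolling(window=wC, min_periods=1).std().values
rolling_quantileHigh = temps_df.rolling(window=wC, min_periods=1).quantile(0.99).values
rolling_quantileLow = temps_df.rolling(window=wC, min_periods=1).quantile(0.01).values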
Example #2
def moving_trim_avg(df, col, t='600s', percentile=0.1):
    df.sort_values(by=['dataset_location', 'dataset_datetime'], inplace=True)
    df.index = df['dataset_datetime']

    features_delta = []
    for col_ in col:
        func_percentile = lambda x: pd.rolling_quantile(
            x, window=t, quantile=percentile, min_periods=0)
        df['quantile'] = df.groupby('dataset_location')[col_].apply(
            func_percentile)
        df['check_lowerbound'] = np.where(df[col_] > df['quantile'], df[col_],
                                          float('nan'))
        df['check_lowerbound'] = np.where(df[col_] <= df['quantile'],
                                          df['quantile'],
                                          df['check_lowerbound'])
        colname = col_ + '_moving_avg'
        func_mean = lambda x: pd.rolling_mean(x, window=t, min_periods=0)
        df[colname] = df.groupby('dataset_location')['check_lowerbound'].apply(
            func_mean)
        colname1 = col_ + '_change'
        df[colname1] = df[col_] - df[colname]
        features_delta.append(colname1)

        del df['check_lowerbound']
        del df['quantile']
    df.index = range(len(df))
    return df, features_delta
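The window=t (an offset such as '600s') form above relies on the deprecated function API. A hedged sketch of the per-group quantile step with Series.rolling, assuming a monotonic DatetimeIndex inside each group (which the sort and index assignment above provide); min_periods=1 stands in for the original min_periods=0:

# Hedged sketch, modern pandas: time-based rolling quantile per location.
func_percentile = lambda s: s.rolling(t, min_periods=1).quantile(percentile)
df['quantile'] = df.groupby('dataset_location')[col_].apply(func_percentile)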
Example #3
def rolling_quantile(x, width, quantile):
    """Rolling quantile (0--1) with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
    rolled = pd.rolling_quantile(signal, 2 * wing + 1, quantile, center=True)
    return rolled[wing:-wing]
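Without pd.rolling_quantile, the same centered window over the padded signal can be written with Series.rolling; a hedged sketch reusing signal, wing and quantile from above:

# Hedged sketch, modern pandas: centered rolling quantile, then trim the padding.
rolled = pd.Series(signal).rolling(2 * wing + 1, center=True).quantile(quantile).values
result = rolled[wing:-wing]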
Example #4
def robust_vol_calc(x, days=35, min_periods=10, vol_abs_min=0.0000000001, vol_floor=True,
                    floor_min_quant=0.05, floor_min_periods=100,
                    floor_days=500):

    # Standard deviation will be nan for first 10 non nan values
    vol = pd.ewmstd(x, span=days, min_periods=min_periods)

    vol[vol < vol_abs_min] = vol_abs_min

    if vol_floor:
        # Find the rolling 5% quantile point to set as a minimum
        vol_min = pd.rolling_quantile(
            vol, floor_days, floor_min_quant, floor_min_periods)
        # set this to zero for the first value then propagate forward, ensures
        # we always have a value
        vol_min.set_value(vol_min.index[0], 0.0)
        vol_min = vol_min.ffill()

        # apply the vol floor
        vol_with_min = pd.concat([vol, vol_min], axis=1)
        vol_floored = vol_with_min.max(axis=1, skipna=False)
    else:
        vol_floored = vol

    return vol_floored
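pd.ewmstd, pd.rolling_quantile and Series.set_value are all gone from current pandas. A hedged sketch of the same floor logic with today's API, keeping the variable names from the example (not from the original repo):

# Hedged sketch, modern pandas: EWM vol with an absolute minimum and a rolling-quantile floor.
vol = x.ewm(span=days, min_periods=min_periods).std()
vol = vol.clip(lower=vol_abs_min)
vol_min = vol.rolling(floor_days, min_periods=floor_min_periods).quantile(floor_min_quant)
vol_min.iloc[0] = 0.0  # replaces the deprecated set_value call
vol_min = vol_min.ffill()
vol_floored = pd.concat([vol, vol_min], axis=1).max(axis=1, skipna=False)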
Example #5
def rolling_quantile(x, width, quant):
    """Rolling quantile (0--1) with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
    rolled = pd.rolling_quantile(signal, 2 * wing + 1, quant, center=True)
    return rolled[wing:-wing]
def extract_features_group(df, columns, win_size):
    df_mean = df.groupby('id')[columns].apply(pd.rolling_mean,
                                              win_size,
                                              min_periods=1)
    df_std = df.groupby('id')[columns].apply(pd.rolling_std,
                                             win_size,
                                             min_periods=1)
    df_std = df_std.fillna(0)
    df_median = df.groupby('id')[columns].apply(pd.rolling_median,
                                                win_size,
                                                min_periods=1)
    df_min = df.groupby('id')[columns].apply(pd.rolling_min,
                                             win_size,
                                             min_periods=1)
    df_max = df.groupby('id')[columns].apply(pd.rolling_max,
                                             win_size,
                                             min_periods=1)
    df_quantile = df.groupby('id')[columns].apply(
        lambda x: pd.rolling_quantile(x, win_size, 0.9, min_periods=1))
    df_rms = df.groupby('id')[columns].apply(pd.rolling_apply,
                                             win_size,
                                             lambda x: RMS(x),
                                             min_periods=1)
    df_energy = df.groupby('id')[columns].apply(pd.rolling_apply,
                                                win_size,
                                                lambda x: Energy(x),
                                                min_periods=1)
    df_features = pd.concat([
        df[columns], df_mean, df_std, df_median, df_max, df_min, df_quantile,
        df_rms, df_energy
    ],
                            axis=1).dropna()
    features = np.array(df_features)
    return features
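A hedged sketch of the per-id rolling quantile and custom rolling apply with the method-chain API; groupby().rolling() returns a MultiIndex, hence the reset_index (not part of the original code):

# Hedged sketch, modern pandas: per-group rolling quantile and rolling apply.
df_quantile = (df.groupby('id')[columns]
                 .rolling(win_size, min_periods=1)
                 .quantile(0.9)
                 .reset_index(level=0, drop=True))
df_rms = (df.groupby('id')[columns]
            .rolling(win_size, min_periods=1)
            .apply(RMS, raw=True)
            .reset_index(level=0, drop=True))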
Example #7
def robust_vol_calc(x,
                    days=35,
                    min_periods=10,
                    vol_abs_min=0.0000000001,
                    vol_floor=True,
                    floor_min_quant=0.05,
                    floor_min_periods=100,
                    floor_days=500):

    # Standard deviation will be nan for first 10 non nan values
    vol = pd.ewmstd(x, span=days, min_periods=min_periods)

    vol[vol < vol_abs_min] = vol_abs_min

    if vol_floor:
        # Find the rolling 5% quantile point to set as a minimum
        vol_min = pd.rolling_quantile(vol, floor_days, floor_min_quant,
                                      floor_min_periods)
        # set this to zero for the first value then propagate forward, ensures
        # we always have a value
        vol_min.set_value(vol_min.index[0], 0.0)
        vol_min = vol_min.ffill()

        # apply the vol floor
        vol_with_min = pd.concat([vol, vol_min], axis=1)
        vol_floored = vol_with_min.max(axis=1, skipna=False)
    else:
        vol_floored = vol

    return vol_floored
Example #8
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    assert_eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    assert_eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    assert_eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    assert_eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    assert_eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    assert_eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    assert_eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    assert_eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    # see note around test_rolling_dataframe for logic concerning precision
    assert_eq(pd.rolling_skew(p, 3),
              dd.rolling_skew(d, 3), check_less_precise=True)
    assert_eq(pd.rolling_kurt(p, 3),
              dd.rolling_kurt(d, 3), check_less_precise=True)
    assert_eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    assert_eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        assert_eq(pd.rolling_window(p, 3, 'boxcar'),
                  dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    assert_eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    assert_eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    assert_eq(pd.rolling_sum(p, 3, min_periods=3),
              dd.rolling_sum(d, 3, min_periods=3))
Example #9
File: algos.py Project: cymond/sysy
def robust_vol_calc(x,
                    days=35,
                    min_periods=10,
                    vol_abs_min=0.0000000001,
                    vol_floor=True,
                    floor_min_quant=0.05,
                    floor_min_periods=100,
                    floor_days=500):
    """
    Robust exponential volatility calculation, assuming daily series of prices
    We apply an absolute minimum level of vol (absmin);
    and a volfloor based on lowest vol over recent history

    :param x: data
    :type x: Tx1 pd.Series

    :param days: Number of days in lookback (*default* 35)
    :type days: int

    :param min_periods: The minimum number of observations (*default* 10)
    :type min_periods: int

    :param vol_abs_min: The size of the absolute minimum (*default* 0.0000000001); 0.0 = not used
    :type vol_abs_min: float or None

    :param vol_floor: Apply a floor to volatility (*default* True)
    :type vol_floor: bool
    :param floor_min_quant: The quantile to use for the volatility floor (e.g. 0.05 means we use 5% vol) (*default* 0.05)
    :type floor_min_quant: float
    :param floor_days: The lookback for calculating volatility floor, in days (*default* 500)
    :type floor_days: int
    :param floor_min_periods: Minimum observations for floor - until reached floor is zero (*default* 100)
    :type floor_min_periods: int

    :returns: pd.DataFrame -- volatility measure


    """

    # Standard deviation will be nan for first 10 non nan values
    vol = pd.ewmstd(x, span=days, min_periods=min_periods)

    vol[vol < vol_abs_min] = vol_abs_min

    if vol_floor:
        # Find the rolling 5% quantile point to set as a minimum
        vol_min = pd.rolling_quantile(vol, floor_days, floor_min_quant,
                                      floor_min_periods)
        # set this to zero for the first value then propagate forward, ensures
        # we always have a value
        vol_min.set_value(vol_min.index[0], 0.0)
        vol_min = vol_min.ffill()

        # apply the vol floor
        vol_with_min = pd.concat([vol, vol_min], axis=1)
        vol_floored = vol_with_min.max(axis=1, skipna=False)
    else:
        vol_floored = vol

    return vol_floored
Example #10
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    assert_eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    assert_eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    assert_eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    assert_eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    assert_eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    assert_eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    assert_eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    assert_eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    # see note around test_rolling_dataframe for logic concerning precision
    assert_eq(pd.rolling_skew(p, 3),
              dd.rolling_skew(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_kurt(p, 3),
              dd.rolling_kurt(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    assert_eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    assert_eq(pd.rolling_window(p, 3, win_type='boxcar'),
              dd.rolling_window(d, 3, win_type='boxcar'))
    # Test with edge-case window sizes
    assert_eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    assert_eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    assert_eq(pd.rolling_sum(p, 3, min_periods=3),
              dd.rolling_sum(d, 3, min_periods=3))
def transform_DF(df,F2scores,stock_name="",Quantiles=None):
    
    #MUST HAVE 
    df["ooRelRet(nextDay)"] = df["ooRelRet"].shift(-1) #for return calculation
    df["ooRawRet(nextDay)"] = df["ooRawRet"].shift(-1)   

    if 'barraBeta' in df.columns:
        df['barraBeta'] = df['barraBeta'].fillna(method='ffill')
        df['barraBeta'] = df['barraBeta'].fillna(method='bfill')
    else:
        print 'barraBeta not available in this set'  

    for f2score in F2scores:
        df[f2score][df[f2score] <-15] = -15
        df[f2score][df[f2score] > 15] = 15 
        df[f2score+"_Scaled"] = df[f2score]*0.01 #for graphing purposes only to combine it on the same graph with RelRet
        
        #I tried rolling quantiles of 220Days, but they were ending up with F2 of 2.2 as a shorting signal. 
        #I think 2years at least should be used.
        df[f2score+"Qlower"] = pd.rolling_quantile(df[f2score],window=400,quantile=0.095)
        df[f2score+"Qupper"] = pd.rolling_quantile(df[f2score],window=400,quantile=0.905)
        df[f2score+"Qlower"] = df[f2score+"Qlower"].fillna(method='bfill')
        df[f2score+"Qupper"] = df[f2score+"Qupper"].fillna(method='bfill')
 
    """some stats that are necessary for graphing and stat analysis"""
    
    for var in ['ooRelRet','ccRelRet',]:  #'ooRawRet'
        df[var+"(Cum)"] = df[var].cumsum()
        for days in [1,2,3,5,8,10,20]:       
            df[var+"("+str(days)+"D avg)"] = pd.rolling_mean(df[var],days)
            df[var+"(next"+str(days)+"D avg)"] = pd.rolling_mean(df[var],days).shift(-(days+1))#negative shift allows to look into the future, used for graphs, but not for trading.           
            df[var+"("+str(days)+"D sum)"] = pd.rolling_sum(df[var],days)

    for days in [3,10]:  #this vector must be a subset of the vector above
        var = "ooRelRet"
        df[var+"("+str(days)+"D UWM)"] = pd.rolling_mean(df["ooRelRet("+str(days)+"D avg)"],10)

    for var in ['ccRelRet','ooPoolRet','ccPoolRet']: #'ooRawRet' 
        for days in [5,8,10,15,20]:       
            df["avg"] = pd.rolling_mean(df[var],days)
            df["std"] = pd.rolling_std(df[var],days)
            df[var+"_E("+str(days)+"D)"] = df['avg']/df['std']
    df = df.drop(["avg","std"],axis = 1)
    
    #UTILITIES.dump_data(df,stock_name,t_fn="t_"+stock_name+"_transformed.csv")    
            
    return df    
def create_data_frame_1(facility, facility_data_db):
    '''Creates data frames of facility data'''
    facility_list = [[f.date, f.location, f.new_orders, f.new_lines, f.new_units, f.new_dollars, \
               f.sched_orders, f.sched_lines, f.sched_units, f.sched_dollars, \
               f.unsched_orders, f.unsched_lines, f.unsched_units, f.unsched_dollars, \
               f.ship_orders, f.ship_lines, f.ship_units, f.ship_dollars, \
               f.susp_orders, f.susp_lines, f.susp_units, f.susp_dollars, \
               f.old_orders, f.old_lines, f.old_units, f.old_dollars, \
               f.fut_orders, f.fut_lines, f.fut_units, f.fut_dollars, \
               f.hold_orders, f.hold_lines, f.hold_units, f.hold_dollars] 
                 for f in facility_data_db.values() if f.location == facility]
    df = pd.DataFrame(facility_list, columns=['date', 'location', 'new_orders', 'new_lines', 'new_units', 'new_dollars',    
               'sched_orders', 'sched_lines', 'sched_units', 'sched_dollars',  
               'unsched_orders', 'unsched_lines', 'unsched_units', 'unsched_dollars',
               'ship_orders', 'ship_lines', 'ship_units', 'ship_dollars',   
               'susp_orders', 'susp_lines', 'susp_units', 'susp_dollars',   
               'old_orders', 'old_lines', 'old_units', 'old_dollars',    
               'fut_orders', 'fut_lines', 'fut_units', 'fut_dollars',    
               'hold_orders', 'hold_lines', 'hold_units', 'hold_dollars'])
    
    ## cast index to datetime; not automatically a datetime for some reason
    df.index = pd.to_datetime(df.date)
    df = df.sort(['date'])

    df['year'] = df["date"].apply(lambda x: datetime.date.isocalendar(x)[0])
    df['week_num'] = df["date"].apply(lambda x: datetime.date.isocalendar(x)[1])
    df['week_day'] = df["date"].apply(lambda x: datetime.date.isocalendar(x)[2])
    df['day_of_year'] = df['date'].apply(lambda d: d.toordinal() - datetime.date(d.year, 1, 1).toordinal() + 1)
    df['ship_MA10_orders'] = pd.rolling_quantile(df['ship_orders'], 5, 0.75)
    df['ship_MA10_lines'] = pd.rolling_quantile(df['ship_lines'], 5, 0.75)
    df['ship_MA10_units'] = pd.rolling_quantile(df['ship_units'], 5, 0.75)
    df['ship_MA10_dollars'] = pd.rolling_quantile(df['ship_dollars'],5, 0.75)
    df['in_process_orders'] = df['sched_orders'] + df['unsched_orders'] + df['old_orders'] + df['fut_orders'] + df['hold_orders']
    df['in_process_lines'] = df['sched_lines'] + df['unsched_lines'] + df['old_lines'] + df['fut_lines'] + df['hold_lines']
    df['in_process_units'] = df['sched_units'] + df['unsched_units'] + df['old_units'] + df['fut_units'] + df['hold_units']
    df['in_process_dollars'] = df['sched_dollars'] + df['unsched_dollars'] + df['old_dollars'] + df['fut_dollars'] + df['hold_dollars']
    df['backlog_orders'] = df['in_process_orders'].div(df['ship_MA10_orders'])
    df['backlog_lines'] = df['in_process_lines'].div(df['ship_MA10_lines'])
    df['backlog_units'] = df['in_process_units'].div(df['ship_MA10_units'])
    df['backlog_dollars'] = df['in_process_dollars'].div(df['ship_MA10_dollars'])
    df['units_per_line'] = pd.rolling_mean(df.new_units, 10) / pd.rolling_mean(df.new_lines, 10)
    df['lines_per_order'] = pd.rolling_mean(df.new_lines, 10) / pd.rolling_mean(df.new_orders, 10)
    df['dollars_per_unit'] = pd.rolling_mean(df.new_dollars, 10) / pd.rolling_mean(df.new_units, 10) * 1000
    df['dollars_per_order'] = pd.rolling_mean(df.new_dollars, 10) / pd.rolling_mean(df.new_orders, 10) * 1000
  
    return df        
    def rollingStats(self, selectCol = [], splitCol=None, sepCol=None, startTime=None, endTime=None, window=60, quantile=0.1, freq='10s', min_periods=5 ):
        
        df = self.dfSetup()
        
        ## Selects a list of columns to use and splits a column into single type if it contains more than one
        # eg. if a file contains multiple sensor readings 
        if (len(selectCol) > 0):
            dfSub = df[selectCol]
            
        else:
            dfSub = df
        
        if (splitCol and sepCol):
            dfSub = dfSub[dfSub[splitCol] == sepCol]
        
        ## Converts datetime column to datetime object index, then use it to create time slices
        # Time format '2015-10-17 09:00:00' May use the dfOther to use other data frames
        if (startTime and endTime):
            dfSub = dfSub[ startTime : endTime ]
        
        else:
            dfSub = dfSub
        
        if (splitCol):
            dfSub = dfSub.drop(splitCol, axis=1) # Remove columns used to split entries
        
        
        valueName = dfSub.columns.values[0]
        outList = []
        
        counts = pd.rolling_count(dfSub,window,freq=freq).rename(columns = {valueName:'rolling_counts'})
        outList.append(counts)
        
        means = pd.rolling_mean(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_mean'})
        outList.append(means)
        
        rms = np.sqrt(pd.rolling_mean(dfSub**2, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_rms'}) )
        outList.append(rms)
        
        medians = pd.rolling_median(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_median'})
        outList.append(medians)
        
        stds = pd.rolling_std(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_std'})
        outList.append(stds)
        
        mins = pd.rolling_min(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_min'})
        outList.append(mins)
        
        maxs = pd.rolling_max(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_max'})
        outList.append(maxs)
        
        quants = pd.rolling_quantile(dfSub, window, quantile, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_quantile'})
        outList.append(quants)

        
        dfOut = pd.concat(outList, axis=1)

        return dfOut
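The freq= keyword of the old pd.rolling_* functions resampled the data before windowing; current pandas has no direct replacement, so this hedged sketch resamples explicitly first (it assumes dfSub has a DatetimeIndex, as the time slicing above implies, and that mean-resampling is an acceptable stand-in for the old behaviour):

# Hedged sketch, modern pandas: resample to the old freq, then compute the rolling quantile.
resampled = dfSub.resample(freq).mean()
quants = (resampled.rolling(window, min_periods=min_periods)
                   .quantile(quantile)
                   .rename(columns={valueName: 'rolling_quantile'}))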
Example #14
def robust_vol_calc(x, days=35, min_periods=10, vol_abs_min=0.0000000001, vol_floor=True,
                    floor_min_quant=0.05, floor_min_periods=100,
                    floor_days=500):
    """
    Robust exponential volatility calculation, assuming daily series of prices
    We apply an absolute minimum level of vol (absmin);
    and a volfloor based on lowest vol over recent history

    :param x: data
    :type x: Tx1 pd.DataFrame

    :param days: Number of days in lookback (*default* 35)
    :type days: int

    :param min_periods: The minimum number of observations (*default* 10)
    :type min_periods: int

    :param vol_abs_min: The size of the absolute minimum (*default* 0.0000000001); 0.0 = not used
    :type vol_abs_min: float or None

    :param vol_floor: Apply a floor to volatility (*default* True)
    :type vol_floor: bool
    :param floor_min_quant: The quantile to use for the volatility floor (e.g. 0.05 means we use 5% vol) (*default* 0.05)
    :type floor_min_quant: float
    :param floor_days: The lookback for calculating volatility floor, in days (*default* 500)
    :type floor_days: int
    :param floor_min_periods: Minimum observations for floor - until reached floor is zero (*default* 100)
    :type floor_min_periods: int

    :returns: pd.DataFrame -- volatility measure


    """

    # Standard deviation will be nan for first 10 non nan values
    vol = pd.ewmstd(x, span=days, min_periods=min_periods)

    vol[vol < vol_abs_min] = vol_abs_min

    if vol_floor:
        # Find the rolling 5% quantile point to set as a minimum
        vol_min = pd.rolling_quantile(
            vol, floor_days, floor_min_quant, floor_min_periods)
        # set this to zero for the first value then propagate forward, ensures
        # we always have a value
        vol_min.set_value(vol_min.index[0], vol_min.columns[0], 0.0)
        vol_min = vol_min.ffill()

        # apply the vol floor
        vol_with_min = pd.concat([vol, vol_min], axis=1)
        vol_floored = vol_with_min.max(axis=1, skipna=False).to_frame()
    else:
        vol_floored = vol

    vol_floored.columns = ["vol"]
    return vol_floored
Example #15
    def outliers(self, **kwargs):
        sigmas = kwargs.get('sigma', None)
        print(kwargs)
        outliers = {}
        if sigmas:
            sigmas = sigmas * 3 if len(sigmas) == 1 else sigmas
            inds = []
            for sigma, col in zip(sigmas, self.columns):
                ind = self.df['%s_sigma' % col] > sigma
                inds.append(ind)
            ind = np.logical_or.reduce(np.array(inds))
            # ind = np.logical_or(inds[0], np.logical_or(inds[1], inds[2]))
            sigma_error_index = self.df.index[ind]
            # self.df.ix[sigma_error_index] = np.nan
            outliers['sigma'] = sigma_error_index
        iqr_factor = kwargs.get('iqr_factor', None)
        if iqr_factor:
            window = kwargs['iqr_window']
            inds = []
            results = self.fit(**kwargs)
            residual_df = results['residual']
            residual_df.resample(self.freq)
            self.interpolate(residual_df)
            for col in self.columns:
                residual = residual_df[col]
                median = pd.rolling_median(residual, window)
                q75 = pd.rolling_quantile(residual, window, 0.75)
                q25 = pd.rolling_quantile(residual, window, 0.25)
                qrange = iqr_factor * (q75 - q25)
                low = median - qrange
                high = median + qrange
                ind = np.logical_or(residual.values < low.values,
                                    residual.values > high.values)
                ind2 = (residual -
                        residual.mean()).abs() > iqr_factor * residual.std()
                ind = np.logical_or(ind, ind2)
                inds.append(ind)
            # ind = np.logical_or(inds[0], np.logical_or(inds[1], inds[2]))
            ind = np.logical_or.reduce(np.array(inds))

            iqr_error_index = residual_df.index[ind]
            outliers['iqr'] = iqr_error_index
        return outliers
Example #16
def rolling_quantile(x, width, quantile):
    """Rolling quantile (0--1) with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
    with warnings.catch_warnings():
        # NB: in pandas 0.18+ this function is deprecated
        warnings.simplefilter("ignore", FutureWarning)
        rolled = pd.rolling_quantile(signal, 2 * wing + 1, quantile,
                                     center=True)
    return rolled[wing:-wing]
Example #17
    def get_quantile_outliers(self,
                              file_name=TEST_FILE,
                              quantile=0.05,
                              rolling_window_size=DEFAULT_ROLLING_WINDOW_SIZE,
                              chunksize=DEFAULT_CHUNK_SIZE):
        """Computes quantile-based outliers in the input data sed

        :param file_name: Input data set file name
        :type file_name: str
        :param quantile: input quantile. Default value: 0.05 (5%)
        :type quantile: float
        :param rolling_window_size: Rolling window size
        :type rolling_window_size: int
        :param chunksize: Input file reading chunk size
        :type chunksize: int
        :return: A tuple of two numpy arrays containing low/high end outliers
        :rtype: tuple
        """
        anom_min = pd.DataFrame()
        anom_max = pd.DataFrame()
        data_anom_max = pd.DataFrame()
        data_anom_min = pd.DataFrame()
        for chunk in pd.read_csv(file_name,
                                 chunksize=chunksize,
                                 date_parser=True):
            cpu_usage_min = pd.rolling_quantile(chunk.cpu_usage,
                                                window=rolling_window_size,
                                                quantile=quantile)
            cpu_usage_max = pd.rolling_quantile(chunk.cpu_usage,
                                                window=rolling_window_size,
                                                quantile=1 - quantile)
            # print(cpu_usage_max)
            anom_min = chunk.loc[chunk.cpu_usage < cpu_usage_min,
                                 ['time', 'cpu_usage']]
            anom_max = chunk.loc[chunk.cpu_usage > cpu_usage_max,
                                 ['time', 'cpu_usage']]
            data_anom_min = data_anom_min.append(anom_min).dropna()
            data_anom_max = data_anom_max.append(anom_max).dropna()
            print(data_anom_max)
        return data_anom_max, data_anom_min
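On current pandas the two deprecated calls in the chunk loop reduce to one-liners; a hedged sketch with the same names:

# Hedged sketch, modern pandas: per-chunk rolling quantile thresholds.
cpu_usage_min = chunk.cpu_usage.rolling(rolling_window_size).quantile(quantile)
cpu_usage_max = chunk.cpu_usage.rolling(rolling_window_size).quantile(1 - quantile)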
Example #18
def rolling_quantile(x, width, quantile):
    """Rolling quantile (0--1) with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing - 1::-1], x, x[:-wing - 1:-1]))
    with warnings.catch_warnings():
        # NB: in pandas 0.18+ this function is deprecated
        warnings.simplefilter("ignore", FutureWarning)
        rolled = pd.rolling_quantile(signal,
                                     2 * wing + 1,
                                     quantile,
                                     center=True)
    return rolled[wing:-wing]
Example #19
def extract_features_group(df, columns, win_size):
    #df_mean = df.groupby('id')[columns].apply(pd.rolling_mean, win_size, min_periods=1)
    df_mean = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).mean().reset_index().drop(['id', 'level_1'], axis=1)

    #df_std = df.groupby('id')[columns].apply(pd.rolling_std, win_size, min_periods=1)
    df_std = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).std().reset_index().drop(['id', 'level_1'], axis=1)

    df_std = df_std.fillna(0)
    #df_median = df.groupby('id')[columns].apply(pd.rolling_median, win_size, min_periods=1)
    df_median = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).median().reset_index().drop(['id', 'level_1'], axis=1)

    #df_min = df.groupby('id')[columns].apply(pd.rolling_min, win_size, min_periods=1)
    df_min = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).min().reset_index().drop(['id', 'level_1'], axis=1)

    #df_max = df.groupby('id')[columns].apply(pd.rolling_max, win_size, min_periods=1)
    df_max = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).max().reset_index().drop(['id', 'level_1'], axis=1)

    df_quantile = df.groupby('id')[columns].apply(
        lambda x: pd.rolling_quantile(x, win_size, 0.9, min_periods=1))

    df_rms = df.groupby('id')[columns].apply(pd.rolling_apply,
                                             win_size,
                                             lambda x: RMS(x),
                                             min_periods=1)
    #df_rms = df.groupby('id')[columns].rolling(window=win_size,center=False,min_periods=1).apply(func= lambda x:RMS(x))

    df_energy = df.groupby('id')[columns].apply(pd.rolling_apply,
                                                win_size,
                                                lambda x: Energy(x),
                                                min_periods=1)

    df_features = pd.concat([
        df[columns], df_mean, df_std, df_median, df_max, df_min, df_quantile,
        df_rms, df_energy
    ],
                            axis=1).dropna()
    features = np.array(df_features)
    return features
Example #20
def robust_vol_calc(x, days=35, min_periods=10, vol_abs_min=0.0000000001, vol_floor=True,
                    floor_min_quant=0.05, floor_min_periods=100,
                    floor_days=500):
    vol = pd.ewmstd(x, span=days, min_periods=min_periods)
    vol[vol < vol_abs_min] = vol_abs_min
    if vol_floor:
        vol_min = pd.rolling_quantile(
            vol, floor_days, floor_min_quant, floor_min_periods)
        vol_min.set_value(vol_min.index[0], 0.0)
        vol_min = vol_min.ffill()
        vol_with_min = pd.concat([vol, vol_min], axis=1)
        vol_floored = vol_with_min.max(axis=1, skipna=False)
    else:
        vol_floored = vol

    return vol_floored
def extract_features(df, columns, win_size):
    df_mean = pd.rolling_mean(df[columns], win_size, min_periods=1)
    df_std = pd.rolling_std(df[columns], win_size, min_periods=1)
    df_std = df_std.fillna(0)
    df_median = pd.rolling_median(df[columns], win_size, min_periods=1)
    df_min = pd.rolling_min(df[columns], win_size, min_periods=1)
    df_max = pd.rolling_max(df[columns], win_size, min_periods=1)
    df_quantile = pd.rolling_quantile(df[columns], win_size, 0.9)
    df_rms = pd.rolling_apply(df[columns], win_size, lambda x: RMS(x))
    df_energy = pd.rolling_apply(df[columns], win_size, lambda x: Energy(x))
    df_features = pd.concat([
        df[columns], df_mean, df_std, df_median, df_max, df_min, df_quantile,
        df_rms, df_energy
    ],
                            axis=1).dropna()
    features = np.array(df_features)
    return features
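A hedged sketch of the same (non-grouped) feature set with DataFrame.rolling; rolling().apply with raw=True passes a NumPy array to RMS/Energy, matching the lambdas above (not from the original project):

# Hedged sketch, modern pandas: the rolling feature block without pd.rolling_* helpers.
roll = df[columns].rolling(win_size, min_periods=1)
df_mean = roll.mean()
df_std = roll.std().fillna(0)
df_median = roll.median()
df_min = roll.min()
df_max = roll.max()
df_quantile = df[columns].rolling(win_size).quantile(0.9)
df_rms = df[columns].rolling(win_size).apply(RMS, raw=True)
df_energy = df[columns].rolling(win_size).apply(Energy, raw=True)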
Example #22
def rolling_tests(p, d):
    eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3))
    eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3))
    eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    mad = lambda x: np.fabs(x - x.mean()).mean()
    eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    eq(pd.rolling_window(p, 3, 'boxcar'), dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3, min_periods=3))
Example #23
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3))
    eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3))
    eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        eq(pd.rolling_window(p, 3, 'boxcar'), dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3, min_periods=3))
Example #24
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3))
    eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3))
    eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        eq(pd.rolling_window(p, 3, "boxcar"), dd.rolling_window(d, 3, "boxcar"))
    # Test with edge-case window sizes
    eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3, min_periods=3))
Example #25
def calc_HS_VaR(ret_df, 
                window=504, 
                min_periods=None,
                est_prob=0.01, 
                PV=1):
    '''
    Calculate Historial Simulation (HS) Value-at-Risk (VaR)

    Parameters
    ----------
    ret_df : DataFrame
        Asset or Portfolio returns
    window : int, optional
        Window used to compute VaR
    min_periods: int, optional
        Minimum number of periods in the window to compute VaR
    est_prob : float, optional
        VaR estimation probability (defaults to 1%)
    PV : float, optional
        Portfolio value or notional (defaults to 1)

    Returns
    -------
    HS_VaR_df : DataFrame
        Historical Simulation Value-at-Risk
    '''
    if window < 0:
        raise ValueError('%d is not a valid window size' % window)
    if est_prob < 0 or est_prob > 1:
        raise ValueError('%f is not a valid estimation probability' % est_prob)
    if PV < 0:
        raise ValueError('%f is not a valid portfolio value' % PV)
        
    HS_VaR_df = pd.rolling_quantile(ret_df, window, est_prob, min_periods=min_periods) * PV
    col = 'HS VaR (' + str(round(window/252)) + 'Y window, ' + str(est_prob*100) + '% probability' + ')'
    HS_VaR_df = HS_VaR_df.rename(columns={'Portfolio':col})
    
    return HS_VaR_df
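On pandas 1.0+ the VaR line above maps to a single method chain; a hedged sketch with the same parameters:

# Hedged sketch, modern pandas: rolling historical-simulation VaR.
HS_VaR_df = ret_df.rolling(window, min_periods=min_periods).quantile(est_prob) * PV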
Example #26
def rolling_tests(p, d):
    eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3))
    eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3))
    eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    mad = lambda x: np.fabs(x - x.mean()).mean()
    eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        eq(pd.rolling_window(p, 3, 'boxcar'),
           dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3,
                                                           min_periods=3))
def ts_quantileFn(arr, q, min_periods, max_periods):
    if not (max_periods): max_periods = len(arr)
    return pd.rolling_quantile(arr,
                               max_periods,
                               min_periods=min_periods,
                               quantile=q)
Example #28
def VaR(symbol='AAPL',
        notl=None,
        conf=0.95,
        dist=None,
        _d1=None,
        _d2=None,
        volwindow=50,
        varwindow=250):
    # Retrieve the data from Internet
    # Choose a time period
    d1 = _d1 if _d1 else datetime.datetime(2001, 1, 1)
    d2 = _d2 if _d2 else datetime.datetime(2012, 1, 1)
    #get the tickers
    price = DataReader(symbol, "yahoo", d1, d2)['Adj Close']
    price = price.asfreq('B').fillna(method='pad')
    ret = price.pct_change()

    #choose the quantile
    quantile = 1 - conf

    import pdb
    pdb.set_trace()
    #simple VaR using all the data
    # VaR on average accross all the data
    unnormedquantile = pd.expanding_quantile(ret, quantile)

    # similar one using a rolling window
    # VaR only calculated over the varwindow, rolling
    unnormedquantileR = pd.rolling_quantile(ret, varwindow, quantile)

    #we can also normalize the returns by the vol
    vol = pd.rolling_std(ret, volwindow) * np.sqrt(256)
    unitvol = ret / vol

    #and get the expanding or rolling quantiles
    # Same calcs as above except normalized so show VaR in
    # standard deviations instead of expected returns
    Var = pd.expanding_quantile(unitvol, quantile)
    VarR = pd.rolling_quantile(unitvol, varwindow, quantile)

    normedquantile = Var * vol
    normedquantileR = VarR * vol

    ret2 = ret.shift(-1)
    courbe = pd.DataFrame({
        'returns': ret2,
        'quantiles': unnormedquantile,
        'Rolling quantiles': unnormedquantileR,
        'Normed quantiles': normedquantile,
        'Rolling Normed quantiles': normedquantileR,
    })

    courbe['nqBreak'] = np.sign(ret2 - normedquantile) / (-2) + 0.5
    courbe['nqBreakR'] = np.sign(ret2 - normedquantileR) / (-2) + 0.5
    courbe['UnqBreak'] = np.sign(ret2 - unnormedquantile) / (-2) + 0.5
    courbe['UnqBreakR'] = np.sign(ret2 - unnormedquantileR) / (-2) + 0.5

    nbdays = price.count()
    print('Number of returns worse than the VaR')
    print('Ideal Var                : ', (quantile) * nbdays)
    print('Simple VaR               : ', np.sum(courbe['UnqBreak']))
    print('Normalized VaR           : ', np.sum(courbe['nqBreak']))
    print('---------------------------')
    print('Ideal Rolling Var        : ', (quantile) * (nbdays - varwindow))
    print('Rolling VaR              : ', np.sum(courbe['UnqBreakR']))
    print('Rolling Normalized VaR   : ', np.sum(courbe['nqBreakR']))
Example #29
family_children['DIFF'] = (family_children['FIRST_CHILD'] - family_children['MARR_DATE']).dt.days

# from 1790 to 1856
# family_children = family_children[family_children.MARR_DATE > datetime.date(1790, 1, 1)]
# family_children = family_children[family_children.MARR_DATE < datetime.date(1856, 1, 1)]

# FIXME: simplify
final = family_children[family_children.columns]
final = final.set_index('FIRST_CHILD')
final = final.sort_index()

#final = final[final.DIFF < 1095]

nine_months = 274

q10 = pd.rolling_quantile(final['DIFF'], 40, 0.1)
q50 = pd.rolling_quantile(final['DIFF'], 40, 0.5)
q90 = pd.rolling_quantile(final['DIFF'], 40, 0.9)

above = final[final.DIFF > nine_months]
bellow = final[final.DIFF <= nine_months]

#final.groupby('FATHER_LINE').size().sort_values()
group = final[final.FATHER_LINE == 73863]

# %matplotlib inline
sbn.set_style('ticks')

plt.figure()
# plt.plot(above.index, above.DIFF, marker='o', color='0.75', linestyle='')
# plt.plot(bellow.index, bellow.DIFF, marker='o', color='0.5',linestyle='')
Example #30
#
# full_df['r']=np.log(full_df['open'])
# full_df['r']=full_df['r'].diff()


full_df['c_ma']=pd.rolling_mean(full_df['close'],5)
full_df['o_ma']=pd.rolling_mean(full_df['open'],5)
full_df['cftc_ma']=pd.rolling_mean(full_df['cftc'],5)

full_df['c_ma_diff']=full_df['c_ma'].diff()
full_df['o_ma_diff']=full_df['o_ma'].diff()
full_df['cftc_ma_diff']=full_df['cftc_ma'].diff()



full_df['c_up_thr']=pd.rolling_quantile(full_df['c_ma_diff'],100,0.6)
full_df['c_low_thr']=pd.rolling_quantile(full_df['c_ma_diff'],100,0.4)
full_df['o_up_thr']=pd.rolling_quantile(full_df['o_ma_diff'],100,0.6)
full_df['o_low_thr']=pd.rolling_quantile(full_df['o_ma_diff'],100,0.4)
full_df['cftc_up_thr']=pd.rolling_quantile(full_df['cftc_ma_diff'],100,0.6)
full_df['cftc_low_thr']=pd.rolling_quantile(full_df['cftc_ma_diff'],100,0.4)

def cc_2(x,u,l):
    if x>=u:
        return 1
    elif x<=l:
        return -1
    else:
        return 0

full_df['close_ma_sig']=map(cc_2,full_df['c_ma_diff'],full_df['c_up_thr'],full_df['c_low_thr'])
large_data=False
remove_baseline=False
window_baseline=700
quantile_baseline=.1
#%%
reload=0
filename='movies/demoMovie.tif'
if not reload:
    t = tifffile.TiffFile(filename) 
    Y = t.asarray().astype(dtype=np.float32) 
    Y = np.transpose(Y,(1,2,0))
    d1,d2,T=Y.shape
    Yr=np.reshape(Y,(d1*d2,T),order='F')
    if remove_baseline:        
        Yr_begin=Yr[:,:99].copy()
        Yr=Yr-pd.rolling_quantile(Yr.T,window_baseline,quantile_baseline,min_periods=100,center=True).T
        Yr[:,:99]=Yr_begin-np.percentile(Yr_begin,quantile_baseline*100,axis=1)[:,None]    
        Y=np.reshape(Yr,(d1,d2,T),order='F')
    np.save('Y',Y)
    np.save('Yr',Yr)
#%    
if caching:
    Y=np.load('Y.npy',mmap_mode='r')
    Yr=np.load('Yr.npy',mmap_mode='r')        
else:
    Y=np.load('Y.npy')
    Yr=np.load('Yr.npy') 
    
d1,d2,T=Y.shape
#%%
if not large_data:
Example #32
 def evaluate(self, table):
     expr = self.expr
     val = None
     if expr is not None:
         val = expr.evaluate(table)
     return pd.rolling_quantile(val, self.window)
Example #33
df = pd.read_csv(
    "../exp/src/main/java/history/XAUUSDm5",
    header=None,
    names=[
        "time", "bo", "bc", "bh", "bl", "ao", "ac", "ah", "al", "vol"],
    index_col=0,
    parse_dates=True)[::-1]

df["spread"] = df["ac"] - df["bc"]
df["ind_h"] = pd.rolling_max(df.ah, window=50).shift(1)
df["ind_h_prev"] = df.ind_h.shift(1)
df["ah_prev"] = df.ah.shift(1)
df["drawdown"] = 1 - pd.rolling_min(df.al, 20).shift(-20) / df.ac
df.drawdown = df.drawdown.shift(20)
df["sl_ratio"] = pd.rolling_quantile(df["drawdown"], 250, 0.9)
df["drawup"] = pd.rolling_max(df.ah, 20).shift(-20) / df.ac - 1
df = df["2016-03-01": "2016-03-02"]
#fig = plt.figure()
#dates = df.index
#ax = fig.add_subplot(1,1,1)
#ax.plot(dates, df.ac, dates, df.ind_h)
#ax.xaxis.set_major_locator(HourLocator(byhour=range(24), interval=4))
#ax.xaxis.set_major_formatter(DateFormatter("%Y%m%d %H"))
#ax.xaxis_date()
#plt.setp(plt.gca().get_xticklabels(), rotation=90, horizontalalignment='right')
#ax.legend(['close', 'high'])
#plt.show()

df['tp_ratio'] = pd.rolling_quantile(df.drawup, 250, 0.7)
def sliding_median_iqr(neighbors,
                       random=None,
                       compute_random=0,
                       window=1000,
                       p0=None):
    """
    Compute sliding median of spearmanr and size, interquartile range 
    and 95% CI of spearmanr of randomly paired genes

    Parameters
    ----------
    neighbors: neighboring gene pairs dataframe
    window: size of window for sliding median

    Returns
    -------
    rolling_median: sliding median of spearmanr and size with IQR for spearmanr
    median and 95% confidence interval of median from random pairs

    """
    #load dataframe if not provided yet
    if isinstance(neighbors, basestring):
        neighbors = pd.read_csv(neighbors)
    if compute_random and isinstance(random, basestring):
        random = pd.read_csv(random)

    # sort by size to do sliding window with increasing intergenic distance
    # nans cause error in sliding median
    neighbors = neighbors.sort('size').dropna()

    print 'computing sliding median...'
    # compute rolling medians. 1000 looks good, less is unnecessarily heavy and noisy.
    rolling_median_spearmanr = pd.rolling_median(neighbors.spearmanr, window)

    print 'computing IQR...'
    # compute interquartile range (IQR). Top 75% and bottom 25%.
    rolling_spearmanr_q1 =  - pd.rolling_quantile(neighbors.spearmanr, window, 0.25) + \
            rolling_median_spearmanr
    rolling_spearmanr_q3 = pd.rolling_quantile(neighbors.spearmanr, window, 0.75) - \
            rolling_median_spearmanr
    rolling_median_size = pd.rolling_median(neighbors['size'], window) / 1000

    # put it all together
    rolling_median_s = pd.DataFrame({
        'spearmanr': rolling_median_spearmanr,
        'size': rolling_median_size,
        'q1': rolling_spearmanr_q1,
        'q3': rolling_spearmanr_q3
    })

    # drop all nans from sliding median (first 1000 because of window)
    rolling_median_s = rolling_median_s.dropna()

    # reindex is necessary
    rolling_median_s.index = np.arange(len(rolling_median_s))

    if compute_random:
        print 'computing random pairs median CI'
        # compute 95% confidence interval of median in random pairs
        ci_median = bs.ci(random.spearmanr.dropna().loc[:20000], np.median)
        rolling_median_s['random_lci'] = ci_median[0]
        rolling_median_s['random_hci'] = ci_median[1]

    print 'fitting to exp decay...'
    popt_s, pcov_s = curve_fit(exp_decay,
                               rolling_median_s['size'],
                               rolling_median_s.spearmanr,
                               p0=p0)

    rolling_median_s['popt1'] = popt_s[0]
    rolling_median_s['popt2'] = popt_s[1]
    rolling_median_s['popt3'] = popt_s[2]

    print 'done'
    return rolling_median_s
Example #35
df['prevma'] = df.ma.shift(1)
df['std'] = pd.rolling_std(df.close, 20)
df['chg'] = df.close.pct_change()
df['chg_std'] = pd.rolling_std(df.chg, 20)
df['range'] = (df.high - df.low) / df.close.shift(1)
df['range_std'] = pd.rolling_std(df.range, 20)
df.range_std.hist(bins=20)
df['profit'] = pd.rolling_sum(df.chg, 5).shift(-5)
x_series = pd.Series(np.arange(len(df.index)), index=df.index)
df['slope'] = pd.ols(y=df.close, x=x_series, window=10).beta['x']
df.index = pd.to_datetime(df.index, format='%y%m%d')
df['deviation'] = (df['close'] - df['ma']) / df['std']
df['prevdev'] = df.deviation.shift(1)
df['min10'] = pd.rolling_min(df['low'], 10).shift(-10)
df['max10'] = pd.rolling_max(df['high'], 10).shift(-10)
df['factor'] = pd.rolling_quantile(df.deviation, 250, 0.85)
df['upperb'] = 1.5 * df['std'] + df['ma']
df['drawdown'] = (df.close - df.min10) / df.close
df['drawup'] = (df.max10 - df.close) / df.close
df['max_drawdown'] = pd.rolling_apply(df.close, 10, get_max_drawdown).shift(-10) / df.close

df = df['2012-01-01': '2014-01-01']
fig = plt.figure()
dates = df.index
ax = fig.add_subplot(2,1,1)
ax.plot(dates, df.close)
ax.xaxis.set_major_locator(WeekdayLocator(byweekday=MO, interval=2))
ax.xaxis.set_major_formatter(DateFormatter("%Y%m%d"))
ax.xaxis_date()
plt.setp(plt.gca().get_xticklabels(), rotation=90, horizontalalignment='right')
ax2 = fig.add_subplot(2,1,2)
Example #36
price = price.asfreq('B').fillna(method='pad')

ret = price.pct_change()

#choose the quantile
quantile = 0.05
#the vol window
volwindow = 50
#and the Var window for rolling
varwindow = 250

#simple VaR using all the data
unnormedquantile = pd.expanding_quantile(ret, quantile)

#similar one using a rolling window
unnormedquantileR = pd.rolling_quantile(ret, varwindow, quantile)

#we can also normalize the returns by the vol
vol = pd.rolling_std(ret, volwindow) * np.sqrt(256)
unitvol = ret / vol

#and get the expanding or rolling quantiles
Var = pd.expanding_quantile(unitvol, quantile)
VarR = pd.rolling_quantile(unitvol, varwindow, quantile)

normedquantile = Var * vol
normedquantileR = VarR * vol

ret2 = ret.shift(-1)

courbe = pd.DataFrame({
Example #37
points = points[points.Feeder == file_num].Point

DAY = pd.Timedelta(days=1)

points_sub = points.loc[(points.str.contains(r'\.PF\.'))
                        & (points.str.contains(r'_PH')) &
                        (points.str.contains(r'\.FDR\.')) &
                        (-points.str.contains(r'BKR\.'))]
df_sub = df.loc[df['Extended Id'].isin(points_sub)]
print "Shape = " + str(df_sub.shape[0])
df_sub['Time'] = pd.to_datetime(df_sub['Time'])
print "Finished to_datetime ..."

for point_id in df_sub['Extended Id'].unique():

    df_sub2 = df_sub.loc[df_sub['Extended Id'] == point_id]
    df_sub2 = df_sub2.sort_values(by='Time')
    print "Shape2 = " + str(df_sub2.shape[0])
    window = _get_window(df_sub2, 24)
    if window == -1:
        continue
    anoms = df_sub2[(df_sub2.Value.abs() < 0.75) & (pd.rolling_quantile(
        df_sub2.Value.abs(), window, 0.01) > 0.8)].Time.tolist()
    anoms = np.array(anoms)
    anoms = [
        e for e in anoms if df_sub2[(df_sub2.Time <= e)
                                    & (df_sub2.Time > (e - DAY))].shape[0] > 24
    ]
    df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
    print "\n\nNUM ANOMS = " + str(len(anoms)) + "\n\n"
Example #38
def rolling_loess_median(data, window=240, threshold=3):
    """
    Flags anomalous flow observations based on their deviation from expected value.

    Parameters
    ----------
    data : 
        Raw data .csv file
        
    window : int
        Size of moving window (number of historical values to be used to classify most recent flow point)
        Default = 240
        
    threshold :
        An anomaly will be classified if greater than Q75 + threshold * IQR or less than Q25 - threshold * IQR
        Default = 3

    Returns
    -------
    A dataframe containing detected anomalies:
    	gage_id		: Unique gauge identification
    	date_time	: Date and time the reading was taken in format year-month-day-hour-minute-second. Example: `2016-05-08T20:36:00Z`
    	flow		: Flow rate measured at the gauge in $m^3/s$
    	water_lev	: Water level
    	anomaly		: Classification (Detected anomalies)
    
    """

    # Load data
    # headers = ["gauge_id","date_time","flow","water_lev","del"]
    # df = pd.read_csv(data, names=headers)
    df = pd.read_csv(data)

    # Error handling
    # if data is None or (not isinstance(data,pd.DataFrame)):
    # raise TypeError("Input data must be a dataframe")
    if window <= 0:
        raise ValueError("Window size should be positive")
    if threshold <= 0:
        raise ValueError("threshold should be positive")

    # Arrange data by date
    df = df.sort_values('date_time')

    # Uncomment if you want to filter for a specific year
    #     df['std_date'] = pd.to_datetime(df['date_time'])
    #     df['year'] = df['std_date'].dt.year
    #     df = df.loc[(df['year'] == 2016)]

    # Converting flow column into series
    series = pd.Series(df["flow"])
    series = series.to_frame('flow')

    # Computing rolling median, quantiles and Inter Quartile Range (IQR)
    df['median'] = series.rolling(window).median()
    df['q25'] = pd.rolling_quantile(series, window, 0.25)
    df['q75'] = pd.rolling_quantile(series, window, 0.75)
    df['iq_range'] = df['q75'] - df['q25']

    # Setting up boundaries of range (based on number of IQRs)
    df['b_high_upper'] = df['q75'] + threshold * df['iq_range']
    df['b_high_lower'] = df['q25'] - threshold * df['iq_range']

    df['b_med_upper'] = df['q75'] + threshold * df['iq_range']
    df['b_med_lower'] = df['q25'] - threshold * df['iq_range']

    # Classifying points as anomalies or not
    df['anomaly'] = np.where(
        (df['flow'] > df['b_high_upper']) | (df['flow'] < df['b_high_lower']),
        1, 0)

    # If IQR range = 0, dont mark them as anomalies.
    mask = df['iq_range'] == 0
    df.loc[mask, 'anomaly'] = 0

    df_anomaly = df[['gage_id', 'date_time', 'flow', 'water_lev',
                     'anomaly']].loc[df['anomaly'] == 1]

    return df_anomaly
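This example already mixes the new rolling().median() form with the deprecated pd.rolling_quantile. A hedged sketch of the two quartile lines written in the same modern style:

# Hedged sketch, modern pandas: rolling quartiles for the IQR bands.
df['q25'] = series['flow'].rolling(window).quantile(0.25)
df['q75'] = series['flow'].rolling(window).quantile(0.75)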
Example #39
    def rolling_quantiles(self, window=30, quantiles=[0.25, 0.75]):
        '''Plots rolling quantiles of volatility
        
        Parameters
        ----------
        window : int
            Rolling window for which to calculate the estimator
        quantiles : [lower, upper]
            List of lower and upper quantiles for which to plot
        '''
        if len(quantiles) != 2:
            raise ValueError('A two element list of quantiles is required, lower and upper')
        if quantiles[0] + quantiles[1] != 1.0:
            raise ValueError('The sum of the quantiles must equal 1.0')
        if quantiles[0] > quantiles[1]:
            raise ValueError('The lower quantiles (first element) must be less than the upper quantile (second element)')
        
        estimator = self._get_estimator(window)
        date = estimator.index
        top_q = pandas.rolling_quantile(estimator, window, quantiles[1])
        median = pandas.rolling_median(estimator, window)
        bottom_q = pandas.rolling_quantile(estimator, window, quantiles[0])
        realized = estimator
        last = estimator[-1]

        if self._type == "Skew" or self._type == "Kurtosis":
            f = lambda x: "%i" % round(x, 0)
        else:
            f = lambda x: "%i%%" % round(x*100, 0)

        '''
        Figure args
        '''
        
        fig = plt.figure(figsize=(8, 6))
        
        left, width = 0.07, 0.65
        bottom, height = 0.2, 0.7
        bottom_h = left_h = left+width+0.02
        
        rect_cones = [left, bottom, width, height]
        rect_box = [left_h, bottom, 0.17, height]
        
        cones = plt.axes(rect_cones)
        box = plt.axes(rect_box)
        
        '''
        Cones plot args
        '''
        
        # set the plots
        cones.plot(date, top_q, label=str(int(quantiles[1]*100)) + " Prctl")
        cones.plot(date, median, label="Median")
        cones.plot(date, bottom_q, label=str(int(quantiles[0]*100)) + " Prctl")
        cones.plot(date, realized, 'r-.', label="Realized")
        
        # set and format the y-axis labels
        locs = cones.get_yticks()
        cones.set_yticklabels(map(f, locs))
        
        # turn on the grid
        cones.grid(True, axis='y', which='major', alpha=0.5)
        
        # set the title
        cones.set_title(self._type + ' (' + self._ticker + ', daily ' + self._start.strftime("%Y-%m-%d") + ' to ' + self._end.strftime("%Y-%m-%d") +  ')')
        
        # set the legend
        cones.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=3)
        
        '''
        Box plot args
        '''
        
        # set the plots
        box.boxplot(realized, notch=1, sym='+')
        box.plot(1, last, color='r', marker='*', markeredgecolor='k')
        
        # set and format the y-axis labels
        locs = box.get_yticks()
        box.set_yticklabels(map(f, locs))
        
        # move the y-axis ticks on the right side
        box.yaxis.tick_right()
        
        # turn on the grid
        box.grid(True, axis='y', which='major', alpha=0.5)
        
        return fig, plt
def sliding_median_iqr(neighbors, random=None, compute_random=0, window=1000, p0=None):
    """
    Compute sliding median of spearmanr and size, interquartile range 
    and 95% CI of spearmanr of randomly paired genes

    Parameters
    ----------
    neighbors: neighboring gene pairs dataframe
    window: size of window for sliding median

    Returns
    -------
    rolling_median: sliding median of spearmanr and size with IQR for spearmanr
    median and 95% confidence interval of median from random pairs

    """
    # load dataframes from CSV if file paths were passed instead of DataFrames
    if isinstance(neighbors, basestring):
        neighbors = pd.read_csv(neighbors)
    if compute_random and isinstance(random, basestring):
        random = pd.read_csv(random)

    # sort by size to do sliding window with increasing intergenic distance
    # nans cause error in sliding median
    neighbors = neighbors.sort_values('size').dropna()

    print 'computing sliding median...'
    # compute rolling medians. A window of 1000 looks good; smaller windows are unnecessarily noisy.
    rolling_median_spearmanr = pd.rolling_median(neighbors.spearmanr, window)

    print 'computing IQR...'
    # compute the interquartile range (IQR) as distances from the median to the 25% and 75% quantiles
    rolling_spearmanr_q1 =  - pd.rolling_quantile(neighbors.spearmanr, window, 0.25) + \
            rolling_median_spearmanr 
    rolling_spearmanr_q3 = pd.rolling_quantile(neighbors.spearmanr, window, 0.75) - \
            rolling_median_spearmanr 
    rolling_median_size = pd.rolling_median(neighbors['size'], window)/1000

    # put it all together
    rolling_median_s = pd.DataFrame({'spearmanr': rolling_median_spearmanr, 
        'size':rolling_median_size, 'q1': rolling_spearmanr_q1, 'q3': rolling_spearmanr_q3})

    # drop all nans from sliding median (first 1000 because of window)
    rolling_median_s = rolling_median_s.dropna()

    # reset the index to consecutive integers after dropping NaNs
    rolling_median_s.index = np.arange(len(rolling_median_s))

    if compute_random:
        print 'computing random pairs median CI'
        # compute 95% confidence interval of median in random pairs
        ci_median = bs.ci(random.spearmanr.dropna().loc[:20000], np.median)
        rolling_median_s['random_lci'] = ci_median[0]
        rolling_median_s['random_hci'] = ci_median[1]

    print 'fitting to exp decay...'
    popt_s, pcov_s = curve_fit(exp_decay, rolling_median_s['size'], 
            rolling_median_s.spearmanr, p0=p0)

    rolling_median_s['popt1'] = popt_s[0]
    rolling_median_s['popt2'] = popt_s[1]
    rolling_median_s['popt3'] = popt_s[2]

    print 'done'
    return rolling_median_s
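# Note: exp_decay is not defined in this example, but curve_fit above expects a
# three-parameter model (popt_s has three entries). A plausible, purely
# hypothetical form is an exponential decay with an offset:
import numpy as np

def exp_decay(x, a, b, c):
    # assumed model: a * exp(-b * x) + c
    return a * np.exp(-b * x) + c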
Example #41
def VaR_df(df, winsz, qtile):
    """Rolling historical VaR: the qtile rolling quantile of daily returns over a winsz-period window."""
    tmpdf = daily_returns_df(df)
    return pd.rolling_quantile(tmpdf, winsz, qtile)
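# Note: a minimal sketch of the same rolling-VaR idea with the modern pandas
# .rolling() API; `returns` is assumed to be the DataFrame of daily returns
# produced by daily_returns_df above, and the function name is illustrative.
def var_df_modern(returns, winsz, qtile):
    # rolling historical VaR: the qtile quantile of returns over each winsz-period window
    return returns.rolling(winsz).quantile(qtile)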
Example #42
    def extract(self, df, start_time=None, end_time=None):
        """Begin anomaly extraction on ``df``.

        Parameters
        ----------
        df : Pandas DataFrame
            The time-series data.

        start_time : datetime.datetime or None, optional (default=None)
            The time to begin extracting anomalies. If None, the entire
            time-series will be used.

        end_time : datetime.datetime or None, optional (default=None)
            The time to stop extracting anomalies. If None, the entire
            time-series will be used.
        """
        edna_cols = ['ExtendedId', 'Value', 'ValueString', 'Time', 'Status']
        self._check_df(df, edna_cols)
        self._check_anomalies()

        def get_fdr(id_str):
            """Get the feeder ID from the eDNA point name."""
            fdr_pattern = re.compile(r'[\._][0-9]{6}[\._]')
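            # the pattern matches six digits framed by '.' or '_'; [1:7] below
            # strips the leading delimiter and keeps just the six-digit feeder id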
            try:
                fdr = fdr_pattern.findall(id_str)[0][1:7]
            except IndexError:
                fdr = 'NULL'
            return fdr

        points = pd.DataFrame({'Point': df.ExtendedId.unique()})
        points = points[~points.Point.str.contains('Bad point')]
        points['Feeder'] = points.Point.map(get_fdr)
        points = points[points.Feeder == self.feeder_id].Point

        df = df.loc[df.ExtendedId.isin(set(points.values))]
        df = df.drop_duplicates(subset=edna_cols)

        if 'FCI_FAULT_ALARM' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.FCI\.'))
                                    & (points.str.contains(r'\.FAULT'))]
            df_sub = df.loc[(df.ExtendedId.isin(points_sub))
                            & (df.ValueString != 'NORMAL')]

            self._store(df_sub, 'FCI_FAULT_ALARM', start_time, end_time)

        if set(self.anomalies) & {'FCI_I_FAULT_FULL', 'FCI_I_FAULT_TEMP'}:

            points_sub = points.loc[(points.str.contains(r'\.FCI\.'))
                                    & (points.str.contains(r'\.I_FAULT'))]
            df_sub = df.loc[(df.ExtendedId.isin(points_sub))
                            & (df.Value >= 600.)].copy()
            df_sub['Anomaly'] = 'FCI_I_FAULT_FULL'
            df_sub.loc[df_sub.Value < 900., 'Anomaly'] = 'FCI_I_FAULT_TEMP'

            self._store(df_sub, None, start_time, end_time)

        if 'AFS_ALARM_ALARM' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.AFS\.'))
                                    & (points.str.contains(r'\.ALARM'))]
            df_sub = df.loc[(df.ExtendedId.isin(points_sub))
                            & (df.ValueString == 'ALARM')]

            self._store(df_sub, 'AFS_ALARM_ALARM', start_time, end_time)

        if 'AFS_GROUND_ALARM' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.AFS\.'))
                                    & (points.str.contains(r'\.GROUND'))]
            df_sub = df.loc[(df.ExtendedId.isin(points_sub))
                            & (df.ValueString == 'ALARM')]

            self._store(df_sub, 'AFS_GROUND_ALARM', start_time, end_time)

        if set(self.anomalies) & {'AFS_I_FAULT_FULL', 'AFS_I_FAULT_TEMP'}:

            points_sub = points.loc[(points.str.contains(r'\.AFS\.'))
                                    & (points.str.contains(r'\.I_FAULT'))]
            df_sub = df.loc[(df.ExtendedId.isin(points_sub))
                            & (df.Value >= 600.)].copy()
            df_sub['Anomaly'] = 'AFS_I_FAULT_FULL'
            df_sub.loc[df_sub.Value < 900., 'Anomaly'] = 'AFS_I_FAULT_TEMP'

            self._store(df_sub, None, start_time, end_time)

        # Okay to remove SET/NOT-SET rows now, those just matter for AFSs/FCIs
        df = df.loc[df.Status == 'OK']

        if 'ZERO_CURRENT_V3' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.I\.'))
                                    & (points.str.contains(r'_PH')) &
                                    (points.str.contains(r'\.FDR\.')) &
                                    (~points.str.contains(r'BKR\.'))]
            df_sub = df.loc[df.ExtendedId.isin(points_sub)]

            for point_id in df_sub.ExtendedId.unique():

                df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
                df_sub2 = df_sub2.sort_values(by='Time')
                window = self._get_window(df_sub2, 24)
                if window == -1:
                    continue
                anoms = df_sub2[(df_sub2.Value < 1) & (
                    df_sub2.Value > -0.5) & (pd.rolling_quantile(
                        df_sub2.Value, window, 0.01) > 10)].Time.tolist()
                anoms = np.array(anoms)

                anoms = [
                    e for e in anoms
                    if df_sub2[(df_sub2.Time <= e)
                               & (df_sub2.Time > (e - DAY))].shape[0] > 24
                ]
                df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]

                self._store(df_sub2, 'ZERO_CURRENT_V3', start_time, end_time)

        if 'ZERO_CURRENT_V4' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.I\.'))
                                    & (points.str.contains(r'_PH')) &
                                    (points.str.contains(r'\.FDR\.')) &
                                    (~points.str.contains(r'BKR\.'))]
            df_sub = df.loc[df.ExtendedId.isin(points_sub)]

            for point_id in df_sub.ExtendedId.unique():

                df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
                df_sub2 = df_sub2.sort_values(by='Time')

                df_sub2['LowValue'] = (df_sub2.Value < 1) & (df_sub2.Value >
                                                             -0.5)
                df_sub2['OkayValue'] = df_sub2.Value >= 1
                df_sub2['OkayValue'] = df_sub2.OkayValue.shift()
                df_sub2.OkayValue = df_sub2.OkayValue.fillna(False)
                df_sub2['ZeroValue'] = df_sub2.LowValue & df_sub2.OkayValue
                anoms = df_sub2[df_sub2.ZeroValue].Time.tolist()
                anoms = [
                    e for e in anoms
                    if df_sub2[(df_sub2.OkayValue) & (df_sub2.Time <= e)
                               & (df_sub2.Time > (e - DAY))].shape[0] > 24
                ]
                df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]

                self._store(df_sub2, 'ZERO_CURRENT_V4', start_time, end_time)

        if 'PF_SPIKES_V3' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.PF\.'))
                                    & (points.str.contains(r'_PH')) &
                                    (points.str.contains(r'\.FDR\.')) &
                                    (~points.str.contains(r'BKR\.'))]
            df_sub = df.loc[df.ExtendedId.isin(points_sub)]

            for point_id in df_sub.ExtendedId.unique():

                df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
                df_sub2 = df_sub2.sort_values(by='Time')
                window = self._get_window(df_sub2, 24)
                if window == -1:
                    continue
                anoms = df_sub2[(df_sub2.Value.abs() < 0.75)
                                & (pd.rolling_quantile(df_sub2.Value.abs(
                                ), window, 0.01) > 0.8)].Time.tolist()
                anoms = np.array(anoms)
                anoms = [
                    e for e in anoms
                    if df_sub2[(df_sub2.Time <= e)
                               & (df_sub2.Time > (e - DAY))].shape[0] > 24
                ]
                df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]

                self._store(df_sub2, 'PF_SPIKES_V3', start_time, end_time)

        if 'ZERO_POWER_V3' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.MW'))
                                    & (points.str.contains(r'\.FDR\.')) &
                                    (~points.str.contains(r'BKR\.'))]
            df_sub = df.loc[df.ExtendedId.isin(points_sub)]

            for point_id in df_sub.ExtendedId.unique():

                df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
                df_sub2 = df_sub2.sort_values(by='Time')
                window = self._get_window(df_sub2, 24)
                if window == -1:
                    continue
                anoms = df_sub2[(df_sub2.Value < 0.1) & (
                    df_sub2.Value > -0.5) & (pd.rolling_quantile(
                        df_sub2.Value, window, 0.01) > 0.5)].Time.tolist()
                anoms = np.array(anoms)

                anoms = [
                    e for e in anoms
                    if df_sub2[(df_sub2.Time <= e)
                               & (df_sub2.Time > (e - DAY))].shape[0] > 24
                ]
                df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]

                self._store(df_sub2, 'ZERO_POWER_V3', start_time, end_time)

        if 'ZERO_POWER_V4' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.MW'))
                                    & (points.str.contains(r'\.FDR\.')) &
                                    (~points.str.contains(r'BKR\.'))]
            df_sub = df.loc[df.ExtendedId.isin(points_sub)]

            for point_id in df_sub.ExtendedId.unique():

                df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
                df_sub2 = df_sub2.sort_values(by='Time')

                df_sub2['LowValue'] = (df_sub2.Value < 0.1) & (df_sub2.Value >
                                                               -0.5)
                df_sub2['OkayValue'] = df_sub2.Value >= 0.1
                df_sub2['OkayValue'] = df_sub2.OkayValue.shift()
                df_sub2.OkayValue = df_sub2.OkayValue.fillna(False)
                df_sub2['ZeroValue'] = df_sub2.LowValue & df_sub2.OkayValue
                anoms = df_sub2[df_sub2.ZeroValue].Time.tolist()
                anoms = [
                    e for e in anoms
                    if df_sub2[(df_sub2.OkayValue) & (df_sub2.Time <= e)
                               & (df_sub2.Time > (e - DAY))].shape[0] > 24
                ]
                df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]

                self._store(df_sub2, 'ZERO_POWER_V4', start_time, end_time)

        if 'THD_SPIKES_V3' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.THD_'))
                                    & (points.str.contains(r'urrent'))]
            df_sub = df.loc[df.ExtendedId.isin(points_sub)]

            for point_id in df_sub.ExtendedId.unique():

                df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
                df_sub2 = df_sub2.sort_values(by='Time')
                window = self._get_window(df_sub2, 24)
                if window == -1:
                    continue
                df_sub2['roll'] = pd.rolling_mean(df_sub2.Value, window)
                df_sub2['stdev'] = pd.rolling_std(df_sub2.Value, window)
                df_sub2['threshold'] = df_sub2.roll + (7 * df_sub2.stdev)
                df_sub2.threshold = df_sub2.threshold.shift()
                anoms = df_sub2.loc[df_sub2.threshold < df_sub2.Value]
                anoms = [
                    e for e in anoms.Time.tolist()
                    if df_sub2[(df_sub2.Time <= e)
                               & (df_sub2.Time > (e - DAY))].shape[0] > 24
                ]
                df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]

                self._store(df_sub2, 'THD_SPIKES_V3', start_time, end_time)

        if 'ZERO_VOLTAGE_V3' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.V\.'))
                                    & (points.str.contains(r'_PH')) &
                                    (points.str.contains(r'\.FDR\.')) &
                                    (~points.str.contains(r'BKR\.'))]
            df_sub = df.loc[df.ExtendedId.isin(points_sub)]

            for point_id in df_sub.ExtendedId.unique():

                df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
                df_sub2 = df_sub2.sort_values(by='Time')
                window = self._get_window(df_sub2, 24)
                if window == -1:
                    continue
                anoms = df_sub2[(df_sub2.Value < 1) & (
                    df_sub2.Value > -0.5) & (pd.rolling_quantile(
                        df_sub2.Value, window, 0.01) > 90)].Time.tolist()
                anoms = np.array(anoms)

                anoms = [
                    e for e in anoms
                    if df_sub2[(df_sub2.Time <= e)
                               & (df_sub2.Time > (e - DAY))].shape[0] > 24
                ]
                df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]

                self._store(df_sub2, 'ZERO_VOLTAGE_V3', start_time, end_time)

        if 'ZERO_VOLTAGE_V4' in self.anomalies:

            points_sub = points.loc[(points.str.contains(r'\.V\.'))
                                    & (points.str.contains(r'_PH')) &
                                    (points.str.contains(r'\.FDR\.')) &
                                    (~points.str.contains(r'BKR\.'))]
            df_sub = df.loc[df.ExtendedId.isin(points_sub)]

            for point_id in df_sub.ExtendedId.unique():

                df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
                df_sub2 = df_sub2.sort_values(by='Time')

                df_sub2['LowValue'] = (df_sub2.Value < 1) & (df_sub2.Value >
                                                             -0.5)
                df_sub2['OkayValue'] = df_sub2.Value >= 1
                df_sub2['OkayValue'] = df_sub2.OkayValue.shift()
                df_sub2.OkayValue = df_sub2.OkayValue.fillna(False)
                df_sub2['ZeroValue'] = df_sub2.LowValue & df_sub2.OkayValue
                anoms = df_sub2[df_sub2.ZeroValue].Time.tolist()
                anoms = [
                    e for e in anoms
                    if df_sub2[(df_sub2.OkayValue) & (df_sub2.Time <= e)
                               & (df_sub2.Time > (e - DAY))].shape[0] > 24
                ]
                df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]

                self._store(df_sub2, 'ZERO_VOLTAGE_V4', start_time, end_time)

        return self
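# Note: the ZERO_CURRENT/ZERO_POWER/ZERO_VOLTAGE checks above share one pattern:
# flag a reading whose value is near zero although the rolling 1% quantile of
# the trailing window is still high. A standalone sketch of that gate with the
# modern pandas .rolling() API; the function name and thresholds are illustrative.
import pandas as pd

def flag_sudden_drops(values, window, low=1.0, floor=10.0):
    # values: pd.Series of readings, sorted by time
    rolling_q01 = values.rolling(window).quantile(0.01)
    # low now, but the recent 1st percentile was above `floor` -> a sudden drop
    return (values < low) & (values > -0.5) & (rolling_q01 > floor)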
Example #43
def decision(date, cftc_cln, lme_df, p_df):
    date = pd.to_datetime(date)
    cftc_cln = cftc_cln[cftc_cln['report_date'] < date]
    lme_df = lme_df[lme_df.index < date]
    p_df = p_df[p_df.index < date]
    ln = 100
    if len(lme_df) >= ln and len(cftc_cln) >= 30:
        regr_df = pd.concat(
            [lme_df[['NET_COMM_PER']], cftc_cln[['non_comm_per']]],
            axis=1,
            join='inner')

        regr_df = regr_df.dropna(axis=0)

        # estimate the CFTC non-commercial positioning from LME NET_COMM_PER via OLS
        X = np.array(regr_df['NET_COMM_PER'])
        X = sm.add_constant(X)
        Y = np.array(regr_df['non_comm_per'])
        X = X[-29:, ]
        Y = Y[-29:, ]
        lm = sm.OLS(Y, X)
        result = lm.fit()
        const = result.params[0]
        coef = result.params[1]

        df_all = pd.DataFrame()
        df_all['update_date'] = lme_df.index
        # lme_df['cftc']=lme_df['NET_COMM_PER']*coef+const
        # df_all['cftc']=lme_df['cftc'].values
        df_all['cftc'] = lme_df['NET_COMM_PER'].values
        df_all.index = [df_all['update_date']]
        df_all = df_all.drop(['update_date'], axis=1)
        # the LME/CFTC analysis is complete; next, deal with the price data (p_df)
        df_all = pd.concat([df_all, p_df], axis=1, join='inner')
        df_all['open_ma'] = pd.rolling_mean(df_all['open'], 5)
        df_all['cftc_ma'] = pd.rolling_mean(df_all['cftc'], 5)
        df_all['open_ma_diff'] = df_all['open_ma'].diff()
        df_all['cftc_ma_diff'] = df_all['cftc_ma'].diff()
        p = 0.6
        p_ = 1 - p

        df_all['o_up_thr'] = pd.rolling_quantile(df_all['open_ma_diff'], ln, p)
        df_all['o_low_thr'] = pd.rolling_quantile(df_all['open_ma_diff'], ln,
                                                  p_)
        df_all['cftc_up_thr'] = pd.rolling_quantile(df_all['cftc_ma_diff'], ln,
                                                    p)
        df_all['cftc_low_thr'] = pd.rolling_quantile(df_all['cftc_ma_diff'],
                                                     ln, p_)

        def cc_2(x, u, l):
            if x >= u:
                return 1
            elif x <= l:
                return -1
            else:
                return 0

        df_all['open_ma_sig'] = map(cc_2, df_all['open_ma_diff'],
                                    df_all['o_up_thr'], df_all['o_low_thr'])
        df_all['cftc_ma_sig'] = map(cc_2, df_all['cftc_ma_diff'],
                                    df_all['cftc_up_thr'],
                                    df_all['cftc_low_thr'])

        def sig(x, y):
            if x == y and x != 0:
                return x
            elif x * y == -1:
                return -x
            else:
                return 0

        df_all['dir'] = map(sig, df_all['open_ma_sig'], df_all['cftc_ma_sig'])
        if len(df_all) > 0:
            today_dir = df_all.iloc[-1, :]['dir']

            return today_dir
        else:
            return np.nan
    else:
        return np.nan
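# Note: a vectorized sketch of the cc_2 threshold logic above, turning a
# rolling upper/lower quantile band into a +1/0/-1 signal with the modern
# pandas .rolling() API; the function name and defaults are illustrative.
import numpy as np
import pandas as pd

def quantile_signal(series, window=100, p=0.6):
    upper = series.rolling(window).quantile(p)       # rolling upper threshold
    lower = series.rolling(window).quantile(1 - p)   # rolling lower threshold
    sig = np.where(series >= upper, 1, np.where(series <= lower, -1, 0))
    return pd.Series(sig, index=series.index)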
Example #44
    def rolling_quantiles(self, window=30, quantiles=[0.25, 0.75]):
        '''Plots rolling quantiles of volatility
        
        Parameters
        ----------
        window : int
            Rolling window for which to calculate the estimator
        quantiles : [lower, upper]
            Two-element list of the lower and upper quantiles to plot
        '''
        if len(quantiles) != 2:
            raise ValueError(
                'A two element list of quantiles is required, lower and upper')
        if quantiles[0] + quantiles[1] != 1.0:
            raise ValueError('The sum of the quantiles must equal 1.0')
        if quantiles[0] > quantiles[1]:
            raise ValueError(
                'The lower quantile (first element) must be less than the upper quantile (second element)'
            )

        estimator = self._get_estimator(window)
        date = estimator.index
        top_q = pandas.rolling_quantile(estimator, window, quantiles[1])
        median = pandas.rolling_median(estimator, window)
        bottom_q = pandas.rolling_quantile(estimator, window, quantiles[0])
        realized = estimator
        last = estimator.iloc[-1]

        if self._type == "Skew" or self._type == "Kurtosis":
            f = lambda x: "%i" % round(x, 0)
        else:
            f = lambda x: "%i%%" % round(x * 100, 0)
        '''
        Figure args
        '''

        fig = plt.figure(figsize=(8, 6))

        left, width = 0.07, 0.65
        bottom, height = 0.2, 0.7
        bottom_h = left_h = left + width + 0.02

        rect_cones = [left, bottom, width, height]
        rect_box = [left_h, bottom, 0.17, height]

        cones = plt.axes(rect_cones)
        box = plt.axes(rect_box)
        '''
        Cones plot args
        '''

        # set the plots
        cones.plot(date, top_q, label=str(int(quantiles[1] * 100)) + " Prctl")
        cones.plot(date, median, label="Median")
        cones.plot(date,
                   bottom_q,
                   label=str(int(quantiles[0] * 100)) + " Prctl")
        cones.plot(date, realized, 'r-.', label="Realized")

        # set and format the y-axis labels
        locs = cones.get_yticks()
        cones.set_yticklabels(map(f, locs))

        # turn on the grid
        cones.grid(True, axis='y', which='major', alpha=0.5)

        # set the title
        cones.set_title(self._type + ' (' + self._ticker + ', daily ' +
                        self._start.strftime("%Y-%m-%d") + ' to ' +
                        self._end.strftime("%Y-%m-%d") + ')')

        # set the legend
        cones.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=3)
        '''
        Box plot args
        '''

        # set the plots
        box.boxplot(realized, notch=1, sym='+')
        box.plot(1, last, color='r', marker='*', markeredgecolor='k')

        # set and format the y-axis labels
        locs = box.get_yticks()
        box.set_yticklabels(map(f, locs))

        # move the y-axis ticks on the right side
        box.yaxis.tick_right()

        # turn on the grid
        box.grid(True, axis='y', which='major', alpha=0.5)

        return fig, plt