def transform(self, temperatures_xray, n_burn_in, n_lookahead, skf_is):
    """Use world temps as features."""
    # Set all temps on world map as features
    all_temps = temperatures_xray['tas'].values
    time_steps, lats, lons = all_temps.shape
    all_temps = all_temps.reshape((time_steps, lats * lons))
    wC = 15
    rolling_std = pd.rolling_std(
        pd.DataFrame(all_temps), window=wC, min_periods=1).values
    rolling_std = rolling_std[n_burn_in:-n_lookahead, :]
    rolling_quantileHigh = pd.rolling_quantile(
        pd.DataFrame(all_temps), window=wC, min_periods=1, quantile=0.99).values
    rolling_quantileHigh = rolling_quantileHigh[n_burn_in:-n_lookahead, :]
    rolling_quantileLow = pd.rolling_quantile(
        pd.DataFrame(all_temps), window=wC, min_periods=1, quantile=0.01).values
    rolling_quantileLow = rolling_quantileLow[n_burn_in:-n_lookahead, :]
    all_temps = all_temps[n_burn_in:-n_lookahead, :]
    return np.hstack((all_temps, rolling_std,
                      rolling_quantileHigh, rolling_quantileLow))
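The snippet above relies on the pre-0.18 `pd.rolling_*` functions, which were removed in later pandas releases. A minimal hedged sketch of the same feature construction with the `DataFrame.rolling` accessor, assuming the same `wC` window and burn-in/lookahead slicing:

# Hedged sketch: modern rolling accessor instead of pd.rolling_* (assumes pandas >= 0.18).
temps_df = pd.DataFrame(all_temps)
roll = temps_df.rolling(window=wC, min_periods=1)
rolling_std = roll.std().values[n_burn_in:-n_lookahead, :]
rolling_q_high = roll.quantile(0.99).values[n_burn_in:-n_lookahead, :]
rolling_q_low = roll.quantile(0.01).values[n_burn_in:-n_lookahead, :]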
def moving_trim_avg(df, col, t='600s', percentile=0.1):
    df.sort_values(by=['dataset_location', 'dataset_datetime'], inplace=True)
    df.index = df['dataset_datetime']
    features_delta = []
    for col_ in col:
        func_percentile = lambda x: pd.rolling_quantile(
            x, window=t, quantile=percentile, min_periods=0)
        df['quantile'] = df.groupby('dataset_location')[col_].apply(
            func_percentile)
        df['check_lowerbound'] = np.where(df[col_] > df['quantile'],
                                          df[col_], float('nan'))
        df['check_lowerbound'] = np.where(df[col_] <= df['quantile'],
                                          df['quantile'], df['check_lowerbound'])
        colname = col_ + '_moving_avg'
        func_mean = lambda x: pd.rolling_mean(x, window=t, min_periods=0)
        df[colname] = df.groupby('dataset_location')['check_lowerbound'].apply(
            func_mean)
        colname1 = col_ + '_change'
        df[colname1] = df[col_] - df[colname]
        features_delta.append(colname1)
    del df['check_lowerbound']
    del df['quantile']
    df.index = range(len(df))
    return df, features_delta
def rolling_quantile(x, width, quantile):
    """Rolling quantile (0--1) with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
    rolled = pd.rolling_quantile(signal, 2 * wing + 1, quantile, center=True)
    return rolled[wing:-wing]
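Because `pd.rolling_quantile` is gone from current pandas, a hedged sketch of the same mirrored-edge computation via a Series with a centered rolling window (same `signal`, `wing`, and `quantile` as above):

# Hedged sketch: Series.rolling with center=True, then trim the padded wings.
rolled = (pd.Series(signal)
          .rolling(window=2 * wing + 1, center=True)
          .quantile(quantile)
          .values)
result = rolled[wing:-wing]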
def robust_vol_calc(x, days=35, min_periods=10, vol_abs_min=0.0000000001,
                    vol_floor=True, floor_min_quant=0.05,
                    floor_min_periods=100, floor_days=500):
    # Standard deviation will be nan for the first 10 non-NaN values
    vol = pd.ewmstd(x, span=days, min_periods=min_periods)
    vol[vol < vol_abs_min] = vol_abs_min
    if vol_floor:
        # Find the rolling 5% quantile point to set as a minimum
        vol_min = pd.rolling_quantile(
            vol, floor_days, floor_min_quant, floor_min_periods)
        # Set this to zero for the first value then propagate forward; ensures
        # we always have a value
        vol_min.set_value(vol_min.index[0], 0.0)
        vol_min = vol_min.ffill()
        # Apply the vol floor
        vol_with_min = pd.concat([vol, vol_min], axis=1)
        vol_floored = vol_with_min.max(axis=1, skipna=False)
    else:
        vol_floored = vol
    return vol_floored
def rolling_quantile(x, width, quant):
    """Rolling quantile (0--1) with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
    rolled = pd.rolling_quantile(signal, 2 * wing + 1, quant, center=True)
    return rolled[wing:-wing]
def extract_features_group(df, columns, win_size):
    df_mean = df.groupby('id')[columns].apply(pd.rolling_mean, win_size,
                                              min_periods=1)
    df_std = df.groupby('id')[columns].apply(pd.rolling_std, win_size,
                                             min_periods=1)
    df_std = df_std.fillna(0)
    df_median = df.groupby('id')[columns].apply(pd.rolling_median, win_size,
                                                min_periods=1)
    df_min = df.groupby('id')[columns].apply(pd.rolling_min, win_size,
                                             min_periods=1)
    df_max = df.groupby('id')[columns].apply(pd.rolling_max, win_size,
                                             min_periods=1)
    df_quantile = df.groupby('id')[columns].apply(
        lambda x: pd.rolling_quantile(x, win_size, 0.9, min_periods=1))
    df_rms = df.groupby('id')[columns].apply(pd.rolling_apply, win_size,
                                             lambda x: RMS(x), min_periods=1)
    df_energy = df.groupby('id')[columns].apply(pd.rolling_apply, win_size,
                                                lambda x: Energy(x),
                                                min_periods=1)
    df_features = pd.concat([
        df[columns], df_mean, df_std, df_median, df_max, df_min, df_quantile,
        df_rms, df_energy
    ], axis=1).dropna()
    features = np.array(df_features)
    return features
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    assert_eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    assert_eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    assert_eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    assert_eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    assert_eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    assert_eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    assert_eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    assert_eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    # see note around test_rolling_dataframe for logic concerning precision
    assert_eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    assert_eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        assert_eq(pd.rolling_window(p, 3, 'boxcar'),
                  dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    assert_eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    assert_eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    assert_eq(pd.rolling_sum(p, 3, min_periods=3),
              dd.rolling_sum(d, 3, min_periods=3))
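These tests exercise the old function-style rolling API. A hedged sketch of how a few of the same comparisons look with the method-style API, assuming `p` is a pandas object, `d` its dask counterpart, and the same `assert_eq` helper:

# Hedged sketch: method-style rolling API, mirroring some of the checks above.
assert_eq(p.rolling(3).sum(), d.rolling(3).sum())
assert_eq(p.rolling(3).mean(), d.rolling(3).mean())
assert_eq(p.rolling(3).quantile(0.5), d.rolling(3).quantile(0.5))
assert_eq(p.rolling(3, min_periods=3).sum(), d.rolling(3, min_periods=3).sum())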
def robust_vol_calc(x, days=35, min_periods=10, vol_abs_min=0.0000000001,
                    vol_floor=True, floor_min_quant=0.05,
                    floor_min_periods=100, floor_days=500):
    """
    Robust exponential volatility calculation, assuming a daily series of prices

    We apply an absolute minimum level of vol (absmin), and a volfloor based
    on the lowest vol over recent history

    :param x: data
    :type x: Tx1 pd.Series

    :param days: Number of days in lookback (*default* 35)
    :type days: int

    :param min_periods: The minimum number of observations (*default* 10)
    :type min_periods: int

    :param vol_abs_min: The size of absolute minimum
        (*default* 0.0000000001); 0.0 = not used
    :type vol_abs_min: float or None

    :param vol_floor: Apply a floor to volatility (*default* True)
    :type vol_floor: bool

    :param floor_min_quant: The quantile to use for the volatility floor
        (e.g. 0.05 means we use the 5% vol) (*default* 0.05)
    :type floor_min_quant: float

    :param floor_days: The lookback for calculating the volatility floor,
        in days (*default* 500)
    :type floor_days: int

    :param floor_min_periods: Minimum observations for the floor - until
        reached the floor is zero (*default* 100)
    :type floor_min_periods: int

    :returns: pd.Series -- volatility measure
    """
    # Standard deviation will be nan for the first 10 non-NaN values
    vol = pd.ewmstd(x, span=days, min_periods=min_periods)
    vol[vol < vol_abs_min] = vol_abs_min
    if vol_floor:
        # Find the rolling 5% quantile point to set as a minimum
        vol_min = pd.rolling_quantile(vol, floor_days, floor_min_quant,
                                      floor_min_periods)
        # Set this to zero for the first value then propagate forward; ensures
        # we always have a value
        vol_min.set_value(vol_min.index[0], 0.0)
        vol_min = vol_min.ffill()
        # Apply the vol floor
        vol_with_min = pd.concat([vol, vol_min], axis=1)
        vol_floored = vol_with_min.max(axis=1, skipna=False)
    else:
        vol_floored = vol
    return vol_floored
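`pd.ewmstd`, `pd.rolling_quantile`, and `Series.set_value` have all been removed from current pandas. A hedged sketch of the same floor logic with the `ewm`/`rolling` accessors, not the author's code, just an equivalent under that assumption:

# Hedged sketch: modern accessors for the same robust-vol floor idea.
vol = x.ewm(span=days, min_periods=min_periods).std()
vol[vol < vol_abs_min] = vol_abs_min
if vol_floor:
    vol_min = vol.rolling(window=floor_days,
                          min_periods=floor_min_periods).quantile(floor_min_quant)
    vol_min.iloc[0] = 0.0  # replaces the removed Series.set_value
    vol_min = vol_min.ffill()
    vol_floored = pd.concat([vol, vol_min], axis=1).max(axis=1, skipna=False)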
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    assert_eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    assert_eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    assert_eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    assert_eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    assert_eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    assert_eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    assert_eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    assert_eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    # see note around test_rolling_dataframe for logic concerning precision
    assert_eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    assert_eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    assert_eq(pd.rolling_window(p, 3, win_type='boxcar'),
              dd.rolling_window(d, 3, win_type='boxcar'))
    # Test with edge-case window sizes
    assert_eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    assert_eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    assert_eq(pd.rolling_sum(p, 3, min_periods=3),
              dd.rolling_sum(d, 3, min_periods=3))
def transform_DF(df, F2scores, stock_name="", Quantiles=None):
    # MUST HAVE
    df["ooRelRet(nextDay)"] = df["ooRelRet"].shift(-1)  # for return calculation
    df["ooRawRet(nextDay)"] = df["ooRawRet"].shift(-1)
    if 'barraBeta' in df.columns:
        df['barraBeta'] = df['barraBeta'].fillna(method='ffill')
        df['barraBeta'] = df['barraBeta'].fillna(method='bfill')
    else:
        print 'barraBeta not available in this set'

    for f2score in F2scores:
        df[f2score][df[f2score] < -15] = -15
        df[f2score][df[f2score] > 15] = 15
        # For graphing purposes only, to combine it on the same graph with RelRet
        df[f2score + "_Scaled"] = df[f2score] * 0.01
        # I tried rolling quantiles of 220 days, but they ended up with an F2 of
        # 2.2 as a shorting signal. I think at least 2 years should be used.
        df[f2score + "Qlower"] = pd.rolling_quantile(df[f2score], window=400, quantile=0.095)
        df[f2score + "Qupper"] = pd.rolling_quantile(df[f2score], window=400, quantile=0.905)
        df[f2score + "Qlower"] = df[f2score + "Qlower"].fillna(method='bfill')
        df[f2score + "Qupper"] = df[f2score + "Qupper"].fillna(method='bfill')

    """some stats that are necessary for graphing and stat analysis"""
    for var in ['ooRelRet', 'ccRelRet']:  # 'ooRawRet'
        df[var + "(Cum)"] = df[var].cumsum()
        for days in [1, 2, 3, 5, 8, 10, 20]:
            df[var + "(" + str(days) + "D avg)"] = pd.rolling_mean(df[var], days)
            # Negative shift allows looking into the future; used for graphs, not for trading.
            df[var + "(next" + str(days) + "D avg)"] = pd.rolling_mean(df[var], days).shift(-(days + 1))
            df[var + "(" + str(days) + "D sum)"] = pd.rolling_sum(df[var], days)
    for days in [3, 10]:  # this vector must be a subset of the vector above
        var = "ooRelRet"
        df[var + "(" + str(days) + "D UWM)"] = pd.rolling_mean(df["ooRelRet(" + str(days) + "D avg)"], 10)
    for var in ['ccRelRet', 'ooPoolRet', 'ccPoolRet']:  # 'ooRawRet'
        for days in [5, 8, 10, 15, 20]:
            df["avg"] = pd.rolling_mean(df[var], days)
            df["std"] = pd.rolling_std(df[var], days)
            df[var + "_E(" + str(days) + "D)"] = df['avg'] / df['std']
    df = df.drop(["avg", "std"], axis=1)
    # UTILITIES.dump_data(df, stock_name, t_fn="t_" + stock_name + "_transformed.csv")
    return df
def create_data_frame_1(facility, facility_data_db):
    '''Creates data frames of facility data'''
    facility_list = [[f.date, f.location, f.new_orders, f.new_lines, f.new_units, f.new_dollars,
                      f.sched_orders, f.sched_lines, f.sched_units, f.sched_dollars,
                      f.unsched_orders, f.unsched_lines, f.unsched_units, f.unsched_dollars,
                      f.ship_orders, f.ship_lines, f.ship_units, f.ship_dollars,
                      f.susp_orders, f.susp_lines, f.susp_units, f.susp_dollars,
                      f.old_orders, f.old_lines, f.old_units, f.old_dollars,
                      f.fut_orders, f.fut_lines, f.fut_units, f.fut_dollars,
                      f.hold_orders, f.hold_lines, f.hold_units, f.hold_dollars]
                     for f in facility_data_db.values() if f.location == facility]

    df = pd.DataFrame(facility_list, columns=[
        'date', 'location', 'new_orders', 'new_lines', 'new_units', 'new_dollars',
        'sched_orders', 'sched_lines', 'sched_units', 'sched_dollars',
        'unsched_orders', 'unsched_lines', 'unsched_units', 'unsched_dollars',
        'ship_orders', 'ship_lines', 'ship_units', 'ship_dollars',
        'susp_orders', 'susp_lines', 'susp_units', 'susp_dollars',
        'old_orders', 'old_lines', 'old_units', 'old_dollars',
        'fut_orders', 'fut_lines', 'fut_units', 'fut_dollars',
        'hold_orders', 'hold_lines', 'hold_units', 'hold_dollars'])

    ## cast index to datetime; not automatically a datetime for some reason
    df.index = pd.to_datetime(df.date)
    df = df.sort(['date'])
    df['year'] = df["date"].apply(lambda x: datetime.date.isocalendar(x)[0])
    df['week_num'] = df["date"].apply(lambda x: datetime.date.isocalendar(x)[1])
    df['week_day'] = df["date"].apply(lambda x: datetime.date.isocalendar(x)[2])
    df['day_of_year'] = df['date'].apply(
        lambda d: d.toordinal() - datetime.date(d.year, 1, 1).toordinal() + 1)

    df['ship_MA10_orders'] = pd.rolling_quantile(df['ship_orders'], 5, 0.75)
    df['ship_MA10_lines'] = pd.rolling_quantile(df['ship_lines'], 5, 0.75)
    df['ship_MA10_units'] = pd.rolling_quantile(df['ship_units'], 5, 0.75)
    df['ship_MA10_dollars'] = pd.rolling_quantile(df['ship_dollars'], 5, 0.75)

    df['in_process_orders'] = df['sched_orders'] + df['unsched_orders'] + df['old_orders'] + df['fut_orders'] + df['hold_orders']
    df['in_process_lines'] = df['sched_lines'] + df['unsched_lines'] + df['old_lines'] + df['fut_lines'] + df['hold_lines']
    df['in_process_units'] = df['sched_units'] + df['unsched_units'] + df['old_units'] + df['fut_units'] + df['hold_units']
    df['in_process_dollars'] = df['sched_dollars'] + df['unsched_dollars'] + df['old_dollars'] + df['fut_dollars'] + df['hold_dollars']

    df['backlog_orders'] = df['in_process_orders'].div(df['ship_MA10_orders'])
    df['backlog_lines'] = df['in_process_lines'].div(df['ship_MA10_lines'])
    df['backlog_units'] = df['in_process_units'].div(df['ship_MA10_units'])
    df['backlog_dollars'] = df['in_process_dollars'].div(df['ship_MA10_dollars'])

    df['units_per_line'] = pd.rolling_mean(df.new_units, 10) / pd.rolling_mean(df.new_lines, 10)
    df['lines_per_order'] = pd.rolling_mean(df.new_lines, 10) / pd.rolling_mean(df.new_orders, 10)
    df['dollars_per_unit'] = pd.rolling_mean(df.new_dollars, 10) / pd.rolling_mean(df.new_units, 10) * 1000
    df['dollars_per_order'] = pd.rolling_mean(df.new_dollars, 10) / pd.rolling_mean(df.new_orders, 10) * 1000
    return df
def rollingStats(self, selectCol=[], splitCol=None, sepCol=None,
                 startTime=None, endTime=None, window=60, quantile=0.1,
                 freq='10s', min_periods=5):
    df = self.dfSetup()

    ## Selects a list of columns to use and splits a column into a single type
    #  if it contains more than one, e.g. if a file contains multiple sensor readings
    if (len(selectCol) > 0):
        dfSub = df[selectCol]
    else:
        dfSub = df

    if (splitCol and sepCol):
        dfSub = dfSub[dfSub[splitCol] == sepCol]

    ## Converts datetime column to a datetime object index, then uses it to create
    #  time slices. Time format '2015-10-17 09:00:00'. May use dfOther to use other data frames
    if (startTime and endTime):
        dfSub = dfSub[startTime:endTime]
    else:
        dfSub = dfSub

    if (splitCol):
        dfSub = dfSub.drop(splitCol, axis=1)  # Remove columns used to split entries

    valueName = dfSub.columns.values[0]
    outList = []

    counts = pd.rolling_count(dfSub, window, freq=freq).rename(
        columns={valueName: 'rolling_counts'})
    outList.append(counts)
    means = pd.rolling_mean(dfSub, window, min_periods=min_periods,
                            freq=freq).rename(columns={valueName: 'rolling_mean'})
    outList.append(means)
    rms = np.sqrt(pd.rolling_mean(dfSub**2, window, min_periods=min_periods,
                                  freq=freq).rename(columns={valueName: 'rolling_rms'}))
    outList.append(rms)
    medians = pd.rolling_median(dfSub, window, min_periods=min_periods,
                                freq=freq).rename(columns={valueName: 'rolling_median'})
    outList.append(medians)
    stds = pd.rolling_std(dfSub, window, min_periods=min_periods,
                          freq=freq).rename(columns={valueName: 'rolling_std'})
    outList.append(stds)
    mins = pd.rolling_min(dfSub, window, min_periods=min_periods,
                          freq=freq).rename(columns={valueName: 'rolling_min'})
    outList.append(mins)
    maxs = pd.rolling_max(dfSub, window, min_periods=min_periods,
                          freq=freq).rename(columns={valueName: 'rolling_max'})
    outList.append(maxs)
    quants = pd.rolling_quantile(dfSub, window, quantile, min_periods=min_periods,
                                 freq=freq).rename(columns={valueName: 'rolling_quantile'})
    outList.append(quants)

    dfOut = pd.concat(outList, axis=1)
    return dfOut
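The `freq=` keyword was dropped along with the `pd.rolling_*` functions; in current pandas the usual pattern is to resample first and then roll. A hedged sketch under that assumption (resampling with `.mean()` is an approximation of the old conform-to-frequency behaviour, not a drop-in replacement):

# Hedged sketch: resample to the 10s grid, then apply the rolling window.
dfRes = dfSub.resample(freq).mean()
roll = dfRes.rolling(window=window, min_periods=min_periods)
dfOut = pd.concat([
    roll.count().rename(columns={valueName: 'rolling_counts'}),
    roll.mean().rename(columns={valueName: 'rolling_mean'}),
    roll.median().rename(columns={valueName: 'rolling_median'}),
    roll.quantile(quantile).rename(columns={valueName: 'rolling_quantile'}),
], axis=1)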
def robust_vol_calc(x, days=35, min_periods=10, vol_abs_min=0.0000000001,
                    vol_floor=True, floor_min_quant=0.05,
                    floor_min_periods=100, floor_days=500):
    """
    Robust exponential volatility calculation, assuming a daily series of prices

    We apply an absolute minimum level of vol (absmin), and a volfloor based
    on the lowest vol over recent history

    :param x: data
    :type x: Tx1 pd.DataFrame

    :param days: Number of days in lookback (*default* 35)
    :type days: int

    :param min_periods: The minimum number of observations (*default* 10)
    :type min_periods: int

    :param vol_abs_min: The size of absolute minimum
        (*default* 0.0000000001); 0.0 = not used
    :type vol_abs_min: float or None

    :param vol_floor: Apply a floor to volatility (*default* True)
    :type vol_floor: bool

    :param floor_min_quant: The quantile to use for the volatility floor
        (e.g. 0.05 means we use the 5% vol) (*default* 0.05)
    :type floor_min_quant: float

    :param floor_days: The lookback for calculating the volatility floor,
        in days (*default* 500)
    :type floor_days: int

    :param floor_min_periods: Minimum observations for the floor - until
        reached the floor is zero (*default* 100)
    :type floor_min_periods: int

    :returns: pd.DataFrame -- volatility measure
    """
    # Standard deviation will be nan for the first 10 non-NaN values
    vol = pd.ewmstd(x, span=days, min_periods=min_periods)
    vol[vol < vol_abs_min] = vol_abs_min
    if vol_floor:
        # Find the rolling 5% quantile point to set as a minimum
        vol_min = pd.rolling_quantile(
            vol, floor_days, floor_min_quant, floor_min_periods)
        # Set this to zero for the first value then propagate forward; ensures
        # we always have a value
        vol_min.set_value(vol_min.index[0], vol_min.columns[0], 0.0)
        vol_min = vol_min.ffill()
        # Apply the vol floor
        vol_with_min = pd.concat([vol, vol_min], axis=1)
        vol_floored = vol_with_min.max(axis=1, skipna=False).to_frame()
    else:
        vol_floored = vol
    vol_floored.columns = ["vol"]
    return vol_floored
def outliers(self, **kwargs):
    sigmas = kwargs.get('sigma', None)
    print(kwargs)
    outliers = {}
    if sigmas:
        sigmas = sigmas * 3 if len(sigmas) == 1 else sigmas
        inds = []
        for sigma, col in zip(sigmas, self.columns):
            ind = self.df['%s_sigma' % col] > sigma
            inds.append(ind)
        ind = np.logical_or.reduce(np.array(inds))
        # ind = np.logical_or(inds[0], np.logical_or(inds[1], inds[2]))
        sigma_error_index = self.df.index[ind]
        # self.df.ix[sigma_error_index] = np.nan
        outliers['sigma'] = sigma_error_index

    iqr_factor = kwargs.get('iqr_factor', None)
    if iqr_factor:
        window = kwargs['iqr_window']
        inds = []
        results = self.fit(**kwargs)
        residual_df = results['residual']
        residual_df.resample(self.freq)
        self.interpolate(residual_df)
        for col in self.columns:
            residual = residual_df[col]
            median = pd.rolling_median(residual, window)
            q75 = pd.rolling_quantile(residual, window, 0.75)
            q25 = pd.rolling_quantile(residual, window, 0.25)
            qrange = iqr_factor * (q75 - q25)
            low = median - qrange
            high = median + qrange
            ind = np.logical_or(residual.values < low.values,
                                residual.values > high.values)
            ind2 = (residual - residual.mean()).abs() > iqr_factor * residual.std()
            ind = np.logical_or(ind, ind2)
            inds.append(ind)
        # ind = np.logical_or(inds[0], np.logical_or(inds[1], inds[2]))
        ind = np.logical_or.reduce(np.array(inds))
        iqr_error_index = residual_df.index[ind]
        outliers['iqr'] = iqr_error_index
    return outliers
def rolling_quantile(x, width, quantile):
    """Rolling quantile (0--1) with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
    with warnings.catch_warnings():
        # NB: in pandas 0.18+ this function is deprecated
        warnings.simplefilter("ignore", FutureWarning)
        rolled = pd.rolling_quantile(signal, 2 * wing + 1, quantile,
                                     center=True)
    return rolled[wing:-wing]
def get_quantile_outliers(self, file_name=TEST_FILE, quantile=0.05,
                          rolling_window_size=DEFAULT_ROLLING_WINDOW_SIZE,
                          chunksize=DEFAULT_CHUNK_SIZE):
    """Computes quantile-based outliers in the input data set

    :param file_name: Input data set file name
    :type file_name: str
    :param quantile: input quantile. Default value: 0.05 (5%)
    :type quantile: float
    :param rolling_window_size: Rolling window size
    :type rolling_window_size: int
    :param chunksize: Input file reading chunk size
    :type chunksize: int
    :return: A tuple of two numpy arrays containing low/high end outliers
    :rtype: tuple
    """
    anom_min = pd.DataFrame()
    anom_max = pd.DataFrame()
    data_anom_max = pd.DataFrame()
    data_anom_min = pd.DataFrame()
    for chunk in pd.read_csv(file_name, chunksize=chunksize, date_parser=True):
        cpu_usage_min = pd.rolling_quantile(chunk.cpu_usage,
                                            window=rolling_window_size,
                                            quantile=quantile)
        cpu_usage_max = pd.rolling_quantile(chunk.cpu_usage,
                                            window=rolling_window_size,
                                            quantile=1 - quantile)
        # print(cpu_usage_max)
        anom_min = chunk.loc[chunk.cpu_usage < cpu_usage_min, ['time', 'cpu_usage']]
        anom_max = chunk.loc[chunk.cpu_usage > cpu_usage_max, ['time', 'cpu_usage']]
        data_anom_min = data_anom_min.append(anom_min).dropna()
        data_anom_max = data_anom_max.append(anom_max).dropna()
    print(data_anom_max)
    return data_anom_max, data_anom_min
def extract_features_group(df, columns, win_size):
    # df_mean = df.groupby('id')[columns].apply(pd.rolling_mean, win_size, min_periods=1)
    df_mean = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).mean().reset_index().drop(['id', 'level_1'], axis=1)
    # df_std = df.groupby('id')[columns].apply(pd.rolling_std, win_size, min_periods=1)
    df_std = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).std().reset_index().drop(['id', 'level_1'], axis=1)
    df_std = df_std.fillna(0)
    # df_median = df.groupby('id')[columns].apply(pd.rolling_median, win_size, min_periods=1)
    df_median = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).median().reset_index().drop(['id', 'level_1'], axis=1)
    # df_min = df.groupby('id')[columns].apply(pd.rolling_min, win_size, min_periods=1)
    df_min = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).min().reset_index().drop(['id', 'level_1'], axis=1)
    # df_max = df.groupby('id')[columns].apply(pd.rolling_max, win_size, min_periods=1)
    df_max = df.groupby('id')[columns].rolling(
        window=win_size, min_periods=1,
        center=False).max().reset_index().drop(['id', 'level_1'], axis=1)
    df_quantile = df.groupby('id')[columns].apply(
        lambda x: pd.rolling_quantile(x, win_size, 0.9, min_periods=1))
    df_rms = df.groupby('id')[columns].apply(pd.rolling_apply, win_size,
                                             lambda x: RMS(x), min_periods=1)
    # df_rms = df.groupby('id')[columns].rolling(window=win_size, center=False,
    #                                            min_periods=1).apply(func=lambda x: RMS(x))
    df_energy = df.groupby('id')[columns].apply(pd.rolling_apply, win_size,
                                                lambda x: Energy(x), min_periods=1)
    df_features = pd.concat([
        df[columns], df_mean, df_std, df_median, df_max, df_min, df_quantile,
        df_rms, df_energy
    ], axis=1).dropna()
    features = np.array(df_features)
    return features
def robust_vol_calc(x, days=35, min_periods=10, vol_abs_min=0.0000000001,
                    vol_floor=True, floor_min_quant=0.05,
                    floor_min_periods=100, floor_days=500):
    vol = pd.ewmstd(x, span=days, min_periods=min_periods)
    vol[vol < vol_abs_min] = vol_abs_min
    if vol_floor:
        vol_min = pd.rolling_quantile(
            vol, floor_days, floor_min_quant, floor_min_periods)
        vol_min.set_value(vol_min.index[0], 0.0)
        vol_min = vol_min.ffill()
        vol_with_min = pd.concat([vol, vol_min], axis=1)
        vol_floored = vol_with_min.max(axis=1, skipna=False)
    else:
        vol_floored = vol
    return vol_floored
def extract_features(df, columns, win_size):
    df_mean = pd.rolling_mean(df[columns], win_size, min_periods=1)
    df_std = pd.rolling_std(df[columns], win_size, min_periods=1)
    df_std = df_std.fillna(0)
    df_median = pd.rolling_median(df[columns], win_size, min_periods=1)
    df_min = pd.rolling_min(df[columns], win_size, min_periods=1)
    df_max = pd.rolling_max(df[columns], win_size, min_periods=1)
    df_quantile = pd.rolling_quantile(df[columns], win_size, 0.9)
    df_rms = pd.rolling_apply(df[columns], win_size, lambda x: RMS(x))
    df_energy = pd.rolling_apply(df[columns], win_size, lambda x: Energy(x))
    df_features = pd.concat([
        df[columns], df_mean, df_std, df_median, df_max, df_min, df_quantile,
        df_rms, df_energy
    ], axis=1).dropna()
    features = np.array(df_features)
    return features
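A hedged sketch of the same feature block on current pandas, assuming RMS and Energy accept a plain ndarray (hence raw=True):

# Hedged sketch: method-style rolling; RMS/Energy are assumed to take an ndarray.
roll = df[columns].rolling(window=win_size, min_periods=1)
df_features = pd.concat([
    df[columns], roll.mean(), roll.std().fillna(0), roll.median(),
    roll.max(), roll.min(), roll.quantile(0.9),
    roll.apply(RMS, raw=True), roll.apply(Energy, raw=True),
], axis=1).dropna()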
def rolling_tests(p, d):
    eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3))
    eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3))
    eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    mad = lambda x: np.fabs(x - x.mean()).mean()
    eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    eq(pd.rolling_window(p, 3, 'boxcar'), dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3, min_periods=3))
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3))
    eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3))
    eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        eq(pd.rolling_window(p, 3, 'boxcar'), dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3, min_periods=3))
def calc_HS_VaR(ret_df, window=504, min_periods=None, est_prob=0.01, PV=1):
    '''
    Calculate Historical Simulation (HS) Value-at-Risk (VaR)

    Parameters
    ----------
    ret_df : DataFrame
        Asset or Portfolio returns
    window : int, optional
        Window used to compute VaR
    min_periods : int, optional
        Minimum number of periods in the window to compute VaR
    est_prob : float, optional
        VaR estimation probability (defaults to 1%)
    PV : float, optional
        Portfolio value or notional (defaults to 1)

    Returns
    -------
    HS_VaR_df : DataFrame
        Historical Simulation Value-at-Risk
    '''
    if window < 0:
        raise ValueError('%d is not a valid window size' % window)
    if est_prob < 0 or est_prob > 1:
        raise ValueError('%f is not a valid estimation probability' % est_prob)
    if PV < 0:
        raise ValueError('%f is not a valid portfolio value' % PV)

    HS_VaR_df = pd.rolling_quantile(ret_df, window, est_prob,
                                    min_periods=min_periods) * PV
    col = ('HS VaR (' + str(round(window / 252)) + 'Y window, ' +
           str(est_prob * 100) + '% probability' + ')')
    HS_VaR_df = HS_VaR_df.rename(columns={'Portfolio': col})
    return HS_VaR_df
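On current pandas the rolling historical-simulation quantile above can be written with the rolling accessor; a minimal hedged sketch of just that line:

# Hedged sketch: modern equivalent of the HS VaR computation above.
HS_VaR_df = ret_df.rolling(window=window, min_periods=min_periods).quantile(est_prob) * PV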
def rolling_tests(p, d):
    eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3))
    eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3))
    eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    mad = lambda x: np.fabs(x - x.mean()).mean()
    eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        eq(pd.rolling_window(p, 3, 'boxcar'), dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3, min_periods=3))
def ts_quantileFn(arr, q, min_periods, max_periods):
    if not max_periods:
        max_periods = len(arr)
    return pd.rolling_quantile(arr, max_periods, min_periods=min_periods,
                               quantile=q)
def VaR(symbol='AAPL', notl=None, conf=0.95, dist=None, _d1=None, _d2=None,
        volwindow=50, varwindow=250):
    # Retrieve the data from the Internet
    # Choose a time period
    d1 = _d1 if _d1 else datetime.datetime(2001, 1, 1)
    d2 = _d2 if _d2 else datetime.datetime(2012, 1, 1)

    # get the tickers
    price = DataReader(symbol, "yahoo", d1, d2)['Adj Close']
    price = price.asfreq('B').fillna(method='pad')
    ret = price.pct_change()

    # choose the quantile
    quantile = 1 - conf

    import pdb
    pdb.set_trace()

    # simple VaR using all the data
    # VaR on average across all the data
    unnormedquantile = pd.expanding_quantile(ret, quantile)
    # similar one using a rolling window
    # VaR only calculated over the varwindow, rolling
    unnormedquantileR = pd.rolling_quantile(ret, varwindow, quantile)

    # we can also normalize the returns by the vol
    vol = pd.rolling_std(ret, volwindow) * np.sqrt(256)
    unitvol = ret / vol

    # and get the expanding or rolling quantiles
    # Same calcs as above except normalized, so shows VaR in
    # standard deviations instead of expected returns
    Var = pd.expanding_quantile(unitvol, quantile)
    VarR = pd.rolling_quantile(unitvol, varwindow, quantile)
    normedquantile = Var * vol
    normedquantileR = VarR * vol

    ret2 = ret.shift(-1)
    courbe = pd.DataFrame({
        'returns': ret2,
        'quantiles': unnormedquantile,
        'Rolling quantiles': unnormedquantileR,
        'Normed quantiles': normedquantile,
        'Rolling Normed quantiles': normedquantileR,
    })
    courbe['nqBreak'] = np.sign(ret2 - normedquantile) / (-2) + 0.5
    courbe['nqBreakR'] = np.sign(ret2 - normedquantileR) / (-2) + 0.5
    courbe['UnqBreak'] = np.sign(ret2 - unnormedquantile) / (-2) + 0.5
    courbe['UnqBreakR'] = np.sign(ret2 - unnormedquantileR) / (-2) + 0.5

    nbdays = price.count()
    print('Number of returns worse than the VaR')
    print('Ideal Var : ', quantile * nbdays)
    print('Simple VaR : ', np.sum(courbe['UnqBreak']))
    print('Normalized VaR : ', np.sum(courbe['nqBreak']))
    print('---------------------------')
    print('Ideal Rolling Var : ', quantile * (nbdays - varwindow))
    print('Rolling VaR : ', np.sum(courbe['UnqBreakR']))
    print('Rolling Normalized VaR : ', np.sum(courbe['nqBreakR']))
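The expanding and rolling quantile lines above use functions removed from current pandas. A hedged sketch of the accessor-based equivalents, using the same `ret`, `quantile`, `volwindow`, and `varwindow`:

# Hedged sketch: expanding/rolling accessors in place of pd.expanding_quantile,
# pd.rolling_quantile, and pd.rolling_std.
unnormedquantile = ret.expanding().quantile(quantile)
unnormedquantileR = ret.rolling(varwindow).quantile(quantile)
vol = ret.rolling(volwindow).std() * np.sqrt(256)
unitvol = ret / vol
Var = unitvol.expanding().quantile(quantile)
VarR = unitvol.rolling(varwindow).quantile(quantile)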
family_children['DIFF'] = (family_children['FIRST_CHILD'] -
                           family_children['MARR_DATE']).dt.days

# from 1790 to 1856
# family_children = family_children[family_children.MARR_DATE > datetime.date(1790, 1, 1)]
# family_children = family_children[family_children.MARR_DATE < datetime.date(1856, 1, 1)]

# FIXME: simplify
final = family_children[family_children.columns]
final = final.set_index('FIRST_CHILD')
final = final.sort_index()
# final = final[final.DIFF < 1095]

nine_months = 274
q10 = pd.rolling_quantile(final['DIFF'], 40, 0.1)
q50 = pd.rolling_quantile(final['DIFF'], 40, 0.5)
q90 = pd.rolling_quantile(final['DIFF'], 40, 0.9)

above = final[final.DIFF > nine_months]
below = final[final.DIFF <= nine_months]

# final.groupby('FATHER_LINE').size().sort_values()
group = final[final.FATHER_LINE == 73863]

# %matplotlib inline
sbn.set_style('ticks')
plt.figure()
# plt.plot(above.index, above.DIFF, marker='o', color='0.75', linestyle='')
# plt.plot(below.index, below.DIFF, marker='o', color='0.5', linestyle='')
#
# full_df['r'] = np.log(full_df['open'])
# full_df['r'] = full_df['r'].diff()
full_df['c_ma'] = pd.rolling_mean(full_df['close'], 5)
full_df['o_ma'] = pd.rolling_mean(full_df['open'], 5)
full_df['cftc_ma'] = pd.rolling_mean(full_df['cftc'], 5)
full_df['c_ma_diff'] = full_df['c_ma'].diff()
full_df['o_ma_diff'] = full_df['o_ma'].diff()
full_df['cftc_ma_diff'] = full_df['cftc_ma'].diff()
full_df['c_up_thr'] = pd.rolling_quantile(full_df['c_ma_diff'], 100, 0.6)
full_df['c_low_thr'] = pd.rolling_quantile(full_df['c_ma_diff'], 100, 0.4)
full_df['o_up_thr'] = pd.rolling_quantile(full_df['o_ma_diff'], 100, 0.6)
full_df['o_low_thr'] = pd.rolling_quantile(full_df['o_ma_diff'], 100, 0.4)
full_df['cftc_up_thr'] = pd.rolling_quantile(full_df['cftc_ma_diff'], 100, 0.6)
full_df['cftc_low_thr'] = pd.rolling_quantile(full_df['cftc_ma_diff'], 100, 0.4)


def cc_2(x, u, l):
    if x >= u:
        return 1
    elif x <= l:
        return -1
    else:
        return 0


full_df['close_ma_sig'] = map(cc_2, full_df['c_ma_diff'],
                              full_df['c_up_thr'], full_df['c_low_thr'])
large_data = False
remove_baseline = False
window_baseline = 700
quantile_baseline = .1
#%%
reload = 0
filename = 'movies/demoMovie.tif'
if not reload:
    t = tifffile.TiffFile(filename)
    Y = t.asarray().astype(dtype=np.float32)
    Y = np.transpose(Y, (1, 2, 0))
    d1, d2, T = Y.shape
    Yr = np.reshape(Y, (d1 * d2, T), order='F')
    if remove_baseline:
        Yr_begin = Yr[:, :99].copy()
        Yr = Yr - pd.rolling_quantile(Yr.T, window_baseline, quantile_baseline,
                                      min_periods=100, center=True).T
        Yr[:, :99] = Yr_begin - np.percentile(Yr_begin,
                                              quantile_baseline * 100, axis=1)[:, None]
        Y = np.reshape(Yr, (d1, d2, T), order='F')
    np.save('Y', Y)
    np.save('Yr', Yr)
#%
if caching:
    Y = np.load('Y.npy', mmap_mode='r')
    Yr = np.load('Yr.npy', mmap_mode='r')
else:
    Y = np.load('Y.npy')
    Yr = np.load('Yr.npy')

d1, d2, T = Y.shape
#%%
if not large_data:
def evaluate(self, table):
    expr = self.expr
    val = None
    if expr is not None:
        val = expr.evaluate(table)
    return pd.rolling_quantile(val, self.window)
df = pd.read_csv(
    "../exp/src/main/java/history/XAUUSDm5", header=None,
    names=["time", "bo", "bc", "bh", "bl", "ao", "ac", "ah", "al", "vol"],
    index_col=0, parse_dates=True)[::-1]
df["spread"] = df["ac"] - df["bc"]
df["ind_h"] = pd.rolling_max(df.ah, window=50).shift(1)
df["ind_h_prev"] = df.ind_h.shift(1)
df["ah_prev"] = df.ah.shift(1)
df["drawdown"] = 1 - pd.rolling_min(df.al, 20).shift(-20) / df.ac
df.drawdown = df.drawdown.shift(20)
df["sl_ratio"] = pd.rolling_quantile(df["drawdown"], 250, 0.9)
df["drawup"] = pd.rolling_max(df.ah, 20).shift(-20) / df.ac - 1
df = df["2016-03-01": "2016-03-02"]

#fig = plt.figure()
#dates = df.index
#ax = fig.add_subplot(1,1,1)
#ax.plot(dates, df.ac, dates, df.ind_h)
#ax.xaxis.set_major_locator(HourLocator(byhour=range(24), interval=4))
#ax.xaxis.set_major_formatter(DateFormatter("%Y%m%d %H"))
#ax.xaxis_date()
#plt.setp(plt.gca().get_xticklabels(), rotation=90, horizontalalignment='right')
#ax.legend(['close', 'high'])
#plt.show()

df['tp_ratio'] = pd.rolling_quantile(df.drawup, 250, 0.7)
def sliding_median_iqr(neighbors, random=None, compute_random=0, window=1000, p0=None):
    """
    Compute sliding median of spearmanr and size, interquartile range, and
    95% CI of spearmanr of randomly paired genes

    Parameters
    ----------
    neighbors: neighboring gene pairs dataframe
    window: size of window for sliding median

    Returns
    -------
    rolling_median: sliding median of spearmanr and size, with IQR for the
        spearmanr median and 95% confidence interval of the median from random pairs
    """
    # load dataframe if not provided yet
    if isinstance(neighbors, basestring):
        neighbors = pd.read_csv(neighbors)
    if compute_random and isinstance(random, basestring):
        random = pd.read_csv(random)
    # sort by size to do sliding window with increasing intergenic distance;
    # nans cause an error in the sliding median
    neighbors = neighbors.sort('size').dropna()
    print 'computing sliding median...'
    # compute rolling medians. 1000 looks good; less is unnecessarily heavy and noisy.
    rolling_median_spearmanr = pd.rolling_median(neighbors.spearmanr, window)
    print 'computing IQR...'
    # compute interquartile range (IQR). Top 75% and bottom 25%.
    rolling_spearmanr_q1 = - pd.rolling_quantile(neighbors.spearmanr, window, 0.25) + \
        rolling_median_spearmanr
    rolling_spearmanr_q3 = pd.rolling_quantile(neighbors.spearmanr, window, 0.75) - \
        rolling_median_spearmanr
    rolling_median_size = pd.rolling_median(neighbors['size'], window) / 1000
    # put it all together
    rolling_median_s = pd.DataFrame({
        'spearmanr': rolling_median_spearmanr,
        'size': rolling_median_size,
        'q1': rolling_spearmanr_q1,
        'q3': rolling_spearmanr_q3
    })
    # drop all nans from sliding median (first 1000 because of window)
    rolling_median_s = rolling_median_s.dropna()
    # reindex is necessary
    rolling_median_s.index = np.arange(len(rolling_median_s))
    if compute_random:
        print 'computing random pairs median CI'
        # compute 95% confidence interval of median in random pairs
        ci_median = bs.ci(random.spearmanr.dropna().loc[:20000], np.median)
        rolling_median_s['random_lci'] = ci_median[0]
        rolling_median_s['random_hci'] = ci_median[1]
    print 'fitting to exp decay...'
    popt_s, pcov_s = curve_fit(exp_decay, rolling_median_s['size'],
                               rolling_median_s.spearmanr, p0=p0)
    rolling_median_s['popt1'] = popt_s[0]
    rolling_median_s['popt2'] = popt_s[1]
    rolling_median_s['popt3'] = popt_s[2]
    print 'done'
    return rolling_median_s
df['prevma'] = df.ma.shift(1)
df['std'] = pd.rolling_std(df.close, 20)
df['chg'] = df.close.pct_change()
df['chg_std'] = pd.rolling_std(df.chg, 20)
df['range'] = (df.high - df.low) / df.close.shift(1)
df['range_std'] = pd.rolling_std(df.range, 20)
df.range_std.hist(bins=20)
df['profit'] = pd.rolling_sum(df.chg, 5).shift(-5)
x_series = pd.Series(np.arange(len(df.index)), index=df.index)
df['slope'] = pd.ols(y=df.close, x=x_series, window=10).beta['x']
df.index = pd.to_datetime(df.index, format='%y%m%d')
df['deviation'] = (df['close'] - df['ma']) / df['std']
df['prevdev'] = df.deviation.shift(1)
df['min10'] = pd.rolling_min(df['low'], 10).shift(-10)
df['max10'] = pd.rolling_max(df['high'], 10).shift(-10)
df['factor'] = pd.rolling_quantile(df.deviation, 250, 0.85)
df['upperb'] = 1.5 * df['std'] + df['ma']
df['drawdown'] = (df.close - df.min10) / df.close
df['drawup'] = (df.max10 - df.close) / df.close
df['max_drawdown'] = pd.rolling_apply(df.close, 10, get_max_drawdown).shift(-10) / df.close
df = df['2012-01-01': '2014-01-01']

fig = plt.figure()
dates = df.index
ax = fig.add_subplot(2, 1, 1)
ax.plot(dates, df.close)
ax.xaxis.set_major_locator(WeekdayLocator(byweekday=MO, interval=2))
ax.xaxis.set_major_formatter(DateFormatter("%Y%m%d"))
ax.xaxis_date()
plt.setp(plt.gca().get_xticklabels(), rotation=90, horizontalalignment='right')
ax2 = fig.add_subplot(2, 1, 2)
price = price.asfreq('B').fillna(method='pad')
ret = price.pct_change()

# choose the quantile
quantile = 0.05
# the vol window
volwindow = 50
# and the VaR window for rolling
varwindow = 250

# simple VaR using all the data
unnormedquantile = pd.expanding_quantile(ret, quantile)
# similar one using a rolling window
unnormedquantileR = pd.rolling_quantile(ret, varwindow, quantile)

# we can also normalize the returns by the vol
vol = pd.rolling_std(ret, volwindow) * np.sqrt(256)
unitvol = ret / vol

# and get the expanding or rolling quantiles
Var = pd.expanding_quantile(unitvol, quantile)
VarR = pd.rolling_quantile(unitvol, varwindow, quantile)
normedquantile = Var * vol
normedquantileR = VarR * vol

ret2 = ret.shift(-1)
courbe = pd.DataFrame({
points = points[points.Feeder == file_num].Point
DAY = pd.Timedelta(days=1)
points_sub = points.loc[(points.str.contains(r'\.PF\.')) &
                        (points.str.contains(r'_PH')) &
                        (points.str.contains(r'\.FDR\.')) &
                        (-points.str.contains(r'BKR\.'))]
df_sub = df.loc[df['Extended Id'].isin(points_sub)]
print "Shape = " + str(df_sub.shape[0])
df_sub['Time'] = pd.to_datetime(df_sub['Time'])
print "Finished to_datetime ..."
for point_id in df_sub['Extended Id'].unique():
    df_sub2 = df_sub.loc[df_sub['Extended Id'] == point_id]
    df_sub2 = df_sub2.sort_values(by='Time')
    print "Shape2 = " + str(df_sub2.shape[0])
    window = _get_window(df_sub2, 24)
    if window == -1:
        continue
    anoms = df_sub2[(df_sub2.Value.abs() < 0.75) &
                    (pd.rolling_quantile(df_sub2.Value.abs(), window,
                                         0.01) > 0.8)].Time.tolist()
    anoms = np.array(anoms)
    anoms = [
        e for e in anoms
        if df_sub2[(df_sub2.Time <= e) & (df_sub2.Time > (e - DAY))].shape[0] > 24
    ]
    df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
    print "\n\nNUM ANOMS = " + str(len(anoms)) + "\n\n"
def rolling_loess_median(data, window=240, threshold=3):
    """
    Flags anomalous flow observations based on their deviation from the expected value.

    Parameters
    ----------
    data :
        Raw data .csv file
    window : int
        Size of moving window (number of historical values to be used to
        classify the most recent flow point). Default = 240
    threshold :
        An anomaly will be classified if greater than Q75 + threshold * IQR
        or less than Q25 - threshold * IQR. Default = 3

    Returns
    -------
    A dataframe containing detected anomalies:
        gage_id : Unique gauge identification
        date_time : Date and time the reading was taken in format
            year-month-day-hour-minute-second. Example: `2016-05-08T20:36:00Z`
        flow : Flow rate measured at the gauge in $m^3/s$
        water_lev : Water level
        anomaly : Classification (detected anomalies)
    """
    # Load data
    # headers = ["gauge_id", "date_time", "flow", "water_lev", "del"]
    # df = pd.read_csv(data, names=headers)
    df = pd.read_csv(data)

    # Error handling
    # if data is None or (not isinstance(data, pd.DataFrame)):
    #     raise TypeError("Input data must be a dataframe")
    if window <= 0:
        raise ValueError("Window size should be positive")
    if threshold <= 0:
        raise ValueError("threshold should be positive")

    # Arrange data by date
    df = df.sort_values('date_time')

    # Uncomment if you want to filter for a specific year
    # df['std_date'] = pd.to_datetime(df['date_time'])
    # df['year'] = df['std_date'].dt.year
    # df = df.loc[(df['year'] == 2016)]

    # Converting flow column into series
    series = pd.Series(df["flow"])
    series = series.to_frame('flow')

    # Computing rolling median, quantiles and Inter Quartile Range (IQR)
    df['median'] = series.rolling(window).median()
    df['q25'] = pd.rolling_quantile(series, window, 0.25)
    df['q75'] = pd.rolling_quantile(series, window, 0.75)
    df['iq_range'] = df['q75'] - df['q25']

    # Setting up boundaries of range (based on number of IQRs)
    df['b_high_upper'] = df['q75'] + threshold * df['iq_range']
    df['b_high_lower'] = df['q25'] - threshold * df['iq_range']
    df['b_med_upper'] = df['q75'] + threshold * df['iq_range']
    df['b_med_lower'] = df['q25'] - threshold * df['iq_range']

    # Classifying points as anomalies or not
    df['anomaly'] = np.where(
        (df['flow'] > df['b_high_upper']) | (df['flow'] < df['b_high_lower']),
        1, 0)

    # If IQR range = 0, don't mark them as anomalies.
    mask = df['iq_range'] == 0
    df.loc[mask, 'anomaly'] = 0

    df_anomaly = df[['gage_id', 'date_time', 'flow', 'water_lev',
                     'anomaly']].loc[df['anomaly'] == 1]
    return df_anomaly
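The function above already uses the rolling accessor for the median but the deprecated pd.rolling_quantile for the quartiles; a hedged sketch that keeps the two consistent on current pandas:

# Hedged sketch: compute the quartiles with the same rolling accessor as the median.
roll = series['flow'].rolling(window)
df['median'] = roll.median()
df['q25'] = roll.quantile(0.25)
df['q75'] = roll.quantile(0.75)
df['iq_range'] = df['q75'] - df['q25']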
def rolling_quantiles(self, window=30, quantiles=[0.25, 0.75]):
    '''Plots rolling quantiles of volatility

    Parameters
    ----------
    window : int
        Rolling window for which to calculate the estimator
    quantiles : [lower, upper]
        List of lower and upper quantiles for which to plot
    '''
    if len(quantiles) != 2:
        raise ValueError('A two element list of quantiles is required, lower and upper')
    if quantiles[0] + quantiles[1] != 1.0:
        raise ValueError('The sum of the quantiles must equal 1.0')
    if quantiles[0] > quantiles[1]:
        raise ValueError('The lower quantile (first element) must be less than the upper quantile (second element)')

    estimator = self._get_estimator(window)
    date = estimator.index
    top_q = pandas.rolling_quantile(estimator, window, quantiles[1])
    median = pandas.rolling_median(estimator, window)
    bottom_q = pandas.rolling_quantile(estimator, window, quantiles[0])
    realized = estimator
    last = estimator[-1]

    if self._type == "Skew" or self._type == "Kurtosis":
        f = lambda x: "%i" % round(x, 0)
    else:
        f = lambda x: "%i%%" % round(x * 100, 0)

    '''
    Figure args
    '''
    fig = plt.figure(figsize=(8, 6))

    left, width = 0.07, 0.65
    bottom, height = 0.2, 0.7
    bottom_h = left_h = left + width + 0.02

    rect_cones = [left, bottom, width, height]
    rect_box = [left_h, bottom, 0.17, height]

    cones = plt.axes(rect_cones)
    box = plt.axes(rect_box)

    '''
    Cones plot args
    '''
    # set the plots
    cones.plot(date, top_q, label=str(int(quantiles[1] * 100)) + " Prctl")
    cones.plot(date, median, label="Median")
    cones.plot(date, bottom_q, label=str(int(quantiles[0] * 100)) + " Prctl")
    cones.plot(date, realized, 'r-.', label="Realized")

    # set and format the y-axis labels
    locs = cones.get_yticks()
    cones.set_yticklabels(map(f, locs))

    # turn on the grid
    cones.grid(True, axis='y', which='major', alpha=0.5)

    # set the title
    cones.set_title(self._type + ' (' + self._ticker + ', daily ' +
                    self._start.strftime("%Y-%m-%d") + ' to ' +
                    self._end.strftime("%Y-%m-%d") + ')')

    # set the legend
    cones.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=3)

    '''
    Box plot args
    '''
    # set the plots
    box.boxplot(realized, notch=1, sym='+')
    box.plot(1, last, color='r', marker='*', markeredgecolor='k')

    # set and format the y-axis labels
    locs = box.get_yticks()
    box.set_yticklabels(map(f, locs))

    # move the y-axis ticks to the right side
    box.yaxis.tick_right()

    # turn on the grid
    box.grid(True, axis='y', which='major', alpha=0.5)

    return fig, plt
def VaR_df(df, winsz, qtile):
    tmpdf = daily_returns_df(df)
    return pd.rolling_quantile(tmpdf, winsz, qtile)
def extract(self, df, start_time=None, end_time=None):
    """Begin anomaly extraction on ``df``.

    Parameters
    ----------
    df : Pandas DataFrame
        The time-series data.

    start_time : datetime.datetime or None, optional (default=None)
        The time to begin extracting anomalies. If None, the entire
        time-series will be used.

    end_time : datetime.datetime or None, optional (default=None)
        The time to stop extracting anomalies. If None, the entire
        time-series will be used.
    """
    edna_cols = ['ExtendedId', 'Value', 'ValueString', 'Time', 'Status']
    self._check_df(df, edna_cols)
    self._check_anomalies()

    def get_fdr(id_str):
        """Get the feeder ID from the eDNA point name."""
        fdr_pattern = re.compile(r'[\._][0-9]{6}[\._]')
        try:
            fdr = fdr_pattern.findall(id_str)[0][1:7]
        except IndexError:
            fdr = 'NULL'
        return fdr

    points = pd.DataFrame({'Point': df.ExtendedId.unique()})
    points = points[-points.Point.str.contains('Bad point')]
    points['Feeder'] = points.Point.map(get_fdr)
    points = points[points.Feeder == self.feeder_id].Point
    df = df.loc[df.ExtendedId.isin(set(points.values))]
    df = df.drop_duplicates(subset=edna_cols)

    if 'FCI_FAULT_ALARM' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.FCI\.')) &
                                (points.str.contains(r'\.FAULT'))]
        df_sub = df.loc[(df.ExtendedId.isin(points_sub)) &
                        (df.ValueString != 'NORMAL')]
        self._store(df_sub, 'FCI_FAULT_ALARM', start_time, end_time)

    if set(self.anomalies) & {'FCI_I_FAULT_FULL', 'FCI_I_FAULT_TEMP'}:
        points_sub = points.loc[(points.str.contains(r'\.FCI\.')) &
                                (points.str.contains(r'\.I_FAULT'))]
        df_sub = df.loc[(df.ExtendedId.isin(points_sub)) &
                        (df.Value >= 600.)].copy()
        df_sub['Anomaly'] = 'FCI_I_FAULT_FULL'
        df_sub.loc[df.Value < 900., 'Anomaly'] = 'FCI_I_FAULT_TEMP'
        self._store(df_sub, None, start_time, end_time)

    if 'AFS_ALARM_ALARM' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.AFS\.')) &
                                (points.str.contains(r'\.ALARM'))]
        df_sub = df.loc[(df.ExtendedId.isin(points_sub)) &
                        (df.ValueString == 'ALARM')]
        self._store(df_sub, 'AFS_ALARM_ALARM', start_time, end_time)

    if 'AFS_GROUND_ALARM' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.AFS\.')) &
                                (points.str.contains(r'\.GROUND'))]
        df_sub = df.loc[(df.ExtendedId.isin(points_sub)) &
                        (df.ValueString == 'ALARM')]
        self._store(df_sub, 'AFS_GROUND_ALARM', start_time, end_time)

    if set(self.anomalies) & {'AFS_I_FAULT_FULL', 'AFS_I_FAULT_TEMP'}:
        points_sub = points.loc[(points.str.contains(r'\.AFS\.')) &
                                (points.str.contains(r'\.I_FAULT'))]
        df_sub = df.loc[(df.ExtendedId.isin(points_sub)) &
                        (df.Value >= 600.)].copy()
        df_sub['Anomaly'] = 'AFS_I_FAULT_FULL'
        df_sub.loc[df.Value < 900., 'Anomaly'] = 'AFS_I_FAULT_TEMP'
        self._store(df_sub, None, start_time, end_time)

    # Okay to remove SET/NOT-SET rows now, those just matter for AFSs/FCIs
    df = df.loc[df.Status == 'OK']

    if 'ZERO_CURRENT_V3' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.I\.')) &
                                (points.str.contains(r'_PH')) &
                                (points.str.contains(r'\.FDR\.')) &
                                (-points.str.contains(r'BKR\.'))]
        df_sub = df.loc[df.ExtendedId.isin(points_sub)]
        for point_id in df_sub.ExtendedId.unique():
            df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
            df_sub2 = df_sub2.sort_values(by='Time')
            window = self._get_window(df_sub2, 24)
            if window == -1:
                continue
            anoms = df_sub2[(df_sub2.Value < 1) & (df_sub2.Value > -0.5) &
                            (pd.rolling_quantile(df_sub2.Value, window,
                                                 0.01) > 10)].Time.tolist()
            anoms = np.array(anoms)
            anoms = [
                e for e in anoms
                if df_sub2[(df_sub2.Time <= e) &
                           (df_sub2.Time > (e - DAY))].shape[0] > 24
            ]
            df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
            self._store(df_sub2, 'ZERO_CURRENT_V3', start_time, end_time)

    if 'ZERO_CURRENT_V4' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.I\.')) &
                                (points.str.contains(r'_PH')) &
                                (points.str.contains(r'\.FDR\.')) &
                                (-points.str.contains(r'BKR\.'))]
        df_sub = df.loc[df.ExtendedId.isin(points_sub)]
        for point_id in df_sub.ExtendedId.unique():
            df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
            df_sub2 = df_sub2.sort_values(by='Time')
            df_sub2['LowValue'] = (df_sub2.Value < 1) & (df_sub2.Value > -0.5)
            df_sub2['OkayValue'] = df_sub2.Value >= 1
            df_sub2['OkayValue'] = df_sub2.OkayValue.shift()
            df_sub2.OkayValue = df_sub2.OkayValue.fillna(False)
            df_sub2['ZeroValue'] = df_sub2.LowValue & df_sub2.OkayValue
            anoms = df_sub2[df_sub2.ZeroValue].Time.tolist()
            anoms = [
                e for e in anoms
                if df_sub2[(df_sub2.OkayValue) & (df_sub2.Time <= e) &
                           (df_sub2.Time > (e - DAY))].shape[0] > 24
            ]
            df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
            self._store(df_sub2, 'ZERO_CURRENT_V4', start_time, end_time)

    if 'PF_SPIKES_V3' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.PF\.')) &
                                (points.str.contains(r'_PH')) &
                                (points.str.contains(r'\.FDR\.')) &
                                (-points.str.contains(r'BKR\.'))]
        df_sub = df.loc[df.ExtendedId.isin(points_sub)]
        for point_id in df_sub.ExtendedId.unique():
            df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
            df_sub2 = df_sub2.sort_values(by='Time')
            window = self._get_window(df_sub2, 24)
            if window == -1:
                continue
            anoms = df_sub2[(df_sub2.Value.abs() < 0.75) &
                            (pd.rolling_quantile(df_sub2.Value.abs(), window,
                                                 0.01) > 0.8)].Time.tolist()
            anoms = np.array(anoms)
            anoms = [
                e for e in anoms
                if df_sub2[(df_sub2.Time <= e) &
                           (df_sub2.Time > (e - DAY))].shape[0] > 24
            ]
            df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
            self._store(df_sub2, 'PF_SPIKES_V3', start_time, end_time)

    if 'ZERO_POWER_V3' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.MW')) &
                                (points.str.contains(r'\.FDR\.')) &
                                (-points.str.contains(r'BKR\.'))]
        df_sub = df.loc[df.ExtendedId.isin(points_sub)]
        for point_id in df_sub.ExtendedId.unique():
            df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
            df_sub2 = df_sub2.sort_values(by='Time')
            window = self._get_window(df_sub2, 24)
            if window == -1:
                continue
            anoms = df_sub2[(df_sub2.Value < 0.1) & (df_sub2.Value > -0.5) &
                            (pd.rolling_quantile(df_sub2.Value, window,
                                                 0.01) > 0.5)].Time.tolist()
            anoms = np.array(anoms)
            anoms = [
                e for e in anoms
                if df_sub2[(df_sub2.Time <= e) &
                           (df_sub2.Time > (e - DAY))].shape[0] > 24
            ]
            df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
            self._store(df_sub2, 'ZERO_POWER_V3', start_time, end_time)

    if 'ZERO_POWER_V4' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.MW')) &
                                (points.str.contains(r'\.FDR\.')) &
                                (-points.str.contains(r'BKR\.'))]
        df_sub = df.loc[df.ExtendedId.isin(points_sub)]
        for point_id in df_sub.ExtendedId.unique():
            df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
            df_sub2 = df_sub2.sort_values(by='Time')
            df_sub2['LowValue'] = (df_sub2.Value < 0.1) & (df_sub2.Value > -0.5)
            df_sub2['OkayValue'] = df_sub2.Value >= 0.1
            df_sub2['OkayValue'] = df_sub2.OkayValue.shift()
            df_sub2.OkayValue = df_sub2.OkayValue.fillna(False)
            df_sub2['ZeroValue'] = df_sub2.LowValue & df_sub2.OkayValue
            anoms = df_sub2[df_sub2.ZeroValue].Time.tolist()
            anoms = [
                e for e in anoms
                if df_sub2[(df_sub2.OkayValue) & (df_sub2.Time <= e) &
                           (df_sub2.Time > (e - DAY))].shape[0] > 24
            ]
            df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
            self._store(df_sub2, 'ZERO_POWER_V4', start_time, end_time)

    if 'THD_SPIKES_V3' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.THD_')) &
                                (points.str.contains(r'urrent'))]
        df_sub = df.loc[df.ExtendedId.isin(points_sub)]
        for point_id in df_sub.ExtendedId.unique():
            df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
            df_sub2 = df_sub2.sort_values(by='Time')
            window = self._get_window(df_sub2, 24)
            if window == -1:
                continue
            df_sub2['roll'] = pd.rolling_mean(df_sub2.Value, window)
            df_sub2['stdev'] = pd.rolling_std(df_sub2.Value, window)
            df_sub2['threshold'] = df_sub2.roll + (7 * df_sub2.stdev)
            df_sub2.threshold = df_sub2.threshold.shift()
            anoms = df_sub2.loc[df_sub2.threshold < df_sub2.Value]
            anoms = [
                e for e in anoms.Time.tolist()
                if df_sub2[(df_sub2.Time <= e) &
                           (df_sub2.Time > (e - DAY))].shape[0] > 24
            ]
            df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
            self._store(df_sub2, 'THD_SPIKES_V3', start_time, end_time)

    if 'ZERO_VOLTAGE_V3' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.V\.')) &
                                (points.str.contains(r'_PH')) &
                                (points.str.contains(r'\.FDR\.')) &
                                (-points.str.contains(r'BKR\.'))]
        df_sub = df.loc[df.ExtendedId.isin(points_sub)]
        for point_id in df_sub.ExtendedId.unique():
            df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
            df_sub2 = df_sub2.sort_values(by='Time')
            window = self._get_window(df_sub2, 24)
            if window == -1:
                continue
            anoms = df_sub2[(df_sub2.Value < 1) & (df_sub2.Value > -0.5) &
                            (pd.rolling_quantile(df_sub2.Value, window,
                                                 0.01) > 90)].Time.tolist()
            anoms = np.array(anoms)
            anoms = [
                e for e in anoms
                if df_sub2[(df_sub2.Time <= e) &
                           (df_sub2.Time > (e - DAY))].shape[0] > 24
            ]
            df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
            self._store(df_sub2, 'ZERO_VOLTAGE_V3', start_time, end_time)

    if 'ZERO_VOLTAGE_V4' in self.anomalies:
        points_sub = points.loc[(points.str.contains(r'\.V\.')) &
                                (points.str.contains(r'_PH')) &
                                (points.str.contains(r'\.FDR\.')) &
                                (-points.str.contains(r'BKR\.'))]
        df_sub = df.loc[df.ExtendedId.isin(points_sub)]
        for point_id in df_sub.ExtendedId.unique():
            df_sub2 = df_sub.loc[df_sub.ExtendedId == point_id]
            df_sub2 = df_sub2.sort_values(by='Time')
            df_sub2['LowValue'] = (df_sub2.Value < 1) & (df_sub2.Value > -0.5)
            df_sub2['OkayValue'] = df_sub2.Value >= 1
            df_sub2['OkayValue'] = df_sub2.OkayValue.shift()
            df_sub2.OkayValue = df_sub2.OkayValue.fillna(False)
            df_sub2['ZeroValue'] = df_sub2.LowValue & df_sub2.OkayValue
            anoms = df_sub2[df_sub2.ZeroValue].Time.tolist()
            anoms = [
                e for e in anoms
                if df_sub2[(df_sub2.OkayValue) & (df_sub2.Time <= e) &
                           (df_sub2.Time > (e - DAY))].shape[0] > 24
            ]
            df_sub2 = df_sub2.loc[df_sub2.Time.isin(anoms)]
            self._store(df_sub2, 'ZERO_VOLTAGE_V4', start_time, end_time)

    return self
def decision(date, cftc_cln, lme_df, p_df):
    date = pd.to_datetime(date)
    cftc_cln = cftc_cln[cftc_cln['report_date'] < date]
    lme_df = lme_df[lme_df.index < date]
    p_df = p_df[p_df.index < date]
    ln = 100
    if len(lme_df) >= ln and len(cftc_cln) >= 30:
        regr_df = pd.concat(
            [lme_df[['NET_COMM_PER']], cftc_cln[['non_comm_per']]],
            axis=1, join='inner')
        regr_df = regr_df.dropna(axis=0)
        # calculation of cftc_estimation
        X = np.array(regr_df['NET_COMM_PER'])
        X = sm.add_constant(X)
        Y = np.array(regr_df['non_comm_per'])
        X = X[-29:, ]
        Y = Y[-29:, ]
        lm = sm.OLS(Y, X)
        result = lm.fit()
        const = result.params[0]
        coef = result.params[1]

        df_all = pd.DataFrame()
        df_all['update_date'] = lme_df.index
        # lme_df['cftc'] = lme_df['NET_COMM_PER'] * coef + const
        # df_all['cftc'] = lme_df['cftc'].values
        df_all['cftc'] = lme_df['NET_COMM_PER'].values
        df_all.index = [df_all['update_date']]
        df_all = df_all.drop(['update_date'], axis=1)

        # the analysis on lme and cftc is complete; next deal with price (p_df)
        df_all = pd.concat([df_all, p_df], axis=1, join='inner')
        df_all['open_ma'] = pd.rolling_mean(df_all['open'], 5)
        df_all['cftc_ma'] = pd.rolling_mean(df_all['cftc'], 5)
        df_all['open_ma_diff'] = df_all['open_ma'].diff()
        df_all['cftc_ma_diff'] = df_all['cftc_ma'].diff()
        p = 0.6
        p_ = 1 - p
        df_all['o_up_thr'] = pd.rolling_quantile(df_all['open_ma_diff'], ln, p)
        df_all['o_low_thr'] = pd.rolling_quantile(df_all['open_ma_diff'], ln, p_)
        df_all['cftc_up_thr'] = pd.rolling_quantile(df_all['cftc_ma_diff'], ln, p)
        df_all['cftc_low_thr'] = pd.rolling_quantile(df_all['cftc_ma_diff'], ln, p_)

        def cc_2(x, u, l):
            if x >= u:
                return 1
            elif x <= l:
                return -1
            else:
                return 0

        df_all['open_ma_sig'] = map(cc_2, df_all['open_ma_diff'],
                                    df_all['o_up_thr'], df_all['o_low_thr'])
        df_all['cftc_ma_sig'] = map(cc_2, df_all['cftc_ma_diff'],
                                    df_all['cftc_up_thr'], df_all['cftc_low_thr'])

        def sig(x, y):
            if x == y and x != 0:
                return x
            elif x * y == -1:
                return -x
            else:
                return 0

        df_all['dir'] = map(sig, df_all['open_ma_sig'], df_all['cftc_ma_sig'])
        if len(df_all) > 0:
            today_dir = df_all.iloc[-1, :]['dir']
            return today_dir
        else:
            return np.nan
    else:
        return np.nan