Example #1
 def predict(self, data=None):
     if self.use_period:
         # decomfreq = freq
         res = sm.tsa.seasonal_decompose(self.data.tolist(), freq=self.freq, model=self.model)
         #     res.plot()
         median_trend = pd.rolling_median(Series(self.data),window=self.freq, center=True, min_periods=1)
         resid = res.observed - res.seasonal - median_trend
     else:
         resid = self.data
     random = Series(resid)
     mean_nan = 0
     std_nan = 0
     # random = res.resid
     if (self.mode == 'average'):
         mean_nan = np.nanmean(random)
         std_nan = np.nanstd(random)
     elif (self.mode == 'median'):
         rolling_median = pd.rolling_median(random,3,center=True, min_periods=1)
         mean_nan = np.nanmean(rolling_median)
         std_nan = np.nanstd(rolling_median)
     min_val = mean_nan - 4 * std_nan
     # max_val = mean(random, na.rm = T) + 4*sd(random, na.rm = T)
     max_val = mean_nan + 4 * std_nan
     position = Series(resid.tolist(), index=np.arange(resid.shape[0]))
     anomaly = position[(position > max_val) | (position < min_val)]
     # anomalyL = position[(position<min_val)]
     # anomaly = anomalyH.append(anomalyL).drop_duplicates()
     point_anomaly_idx = anomaly.index
     self.anomaly_idx = point_anomaly_idx
     points_anomaly = self.data[point_anomaly_idx]
     self.anomalies = points_anomaly
     return points_anomaly
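The pd.rolling_* functions used in this example were deprecated in pandas 0.18 and later removed; the same statistics are available through the .rolling() accessor. A minimal sketch of the median-trend line above under the modern API (self.data and self.freq as in the class above):

    median_trend = Series(self.data).rolling(window=self.freq, center=True,
                                             min_periods=1).median()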
Example #2
def buildTS(path):
    df = pd.read_csv(path)
    df = df[[
        'gmDate', 'playDispNm', 'teamAbbr', 'teamDayOff', 'playPos',
        'playStat', 'playMin'
    ]]
    datfrm = pd.DataFrame()
    for i in df.playDispNm.unique():
        pdf = df[df.playDispNm == i].sort_values(by='gmDate')
        pdf['prevgm'] = pdf.playMin.shift(1)
        pdf['pavg3'] = pd.rolling_mean(pdf.playMin, 3)
        pdf['pavg5'] = pd.rolling_mean(pdf.playMin, 5)
        pdf['pavg10'] = pd.rolling_mean(pdf.playMin, 10)
        #pdf['pavg20'] = pd.rolling_mean(pdf.playMin,20)
        pdf['pmed3'] = pd.rolling_median(pdf.playMin, 3)
        pdf['pmed5'] = pd.rolling_median(pdf.playMin, 5)
        pdf['pmed10'] = pd.rolling_median(pdf.playMin, 10)
        #pdf['pmed20'] = pd.rolling_median(pdf.playMin,20)
        pdf['pstd3'] = pd.rolling_std(pdf.playMin, 3)
        pdf['pstd5'] = pd.rolling_std(pdf.playMin, 5)
        pdf['pstd10'] = pd.rolling_std(pdf.playMin, 10)
        #pdf['pstd20'] = pd.rolling_std(pdf.playMin,20)
        #print(pdf.tail)
        datfrm = datfrm.append(pdf.dropna())
        #print(len(datfrm))
    return datfrm
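On pandas 0.18+ the rolling_* calls in buildTS translate mechanically to the method API, and DataFrame.append (removed in pandas 2.0) is replaced by collecting frames for a single concat. A sketch of the loop under those assumptions:

    frames = []
    for i in df.playDispNm.unique():
        pdf = df[df.playDispNm == i].sort_values(by='gmDate')
        pdf['prevgm'] = pdf.playMin.shift(1)
        for w in (3, 5, 10):
            pdf['pavg%d' % w] = pdf.playMin.rolling(w).mean()
            pdf['pmed%d' % w] = pdf.playMin.rolling(w).median()
            pdf['pstd%d' % w] = pdf.playMin.rolling(w).std()
        frames.append(pdf.dropna())
    datfrm = pd.concat(frames)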
Example #3
def plot_rolling_functions(series, window_size=128):
    pd.rolling_median(series,window_size).plot(label='median')
    pd.rolling_mean(series,window_size).plot(label='mean')
    pd.rolling_std(series,window_size).plot(label='std')
    pd.rolling_skew(series,window_size).plot(label='skew')
    pd.rolling_kurt(series,window_size).plot(label='kurt')
    pd.rolling_min(series,window_size).plot(label='min')
    pd.rolling_max(series,window_size).plot(label='max')
    plt.title('Various rolling window functions, window size %s' % (window_size))
    plt.legend()
    plt.show()
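The same plot on the method-based API; every rolling_* statistic here is a method of a single Rolling object. A sketch, assuming the matplotlib setup of the original (the _modern suffix is just an illustrative name):

def plot_rolling_functions_modern(series, window_size=128):
    r = series.rolling(window_size)
    for stat in ('median', 'mean', 'std', 'skew', 'kurt', 'min', 'max'):
        getattr(r, stat)().plot(label=stat)
    plt.title('Various rolling window functions, window size %s' % window_size)
    plt.legend()
    plt.show()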
Example #4
def plot_downsampled_rolling_median(series, window_size=64, original_freq=512, freq=10):
    median = pd.rolling_median(series, window_size)
    step = original_freq // freq  # integer stride so it can be used as a slice step
    downsampled = median[::step]
    pd.Series(series).plot()

    downsampled.plot()
    plt.title('rolling_median, window_size=%s, downsampled to %sHz' % (window_size, freq))
    annotations = [
        plt.annotate(int(val), (step*index, val))
        for index,val in enumerate(downsampled)
        if not np.isnan(val)
        ]
    plt.legend(('original', 'rolling_median'))
    plt.show()
Example #5
def plot_rolling_subplot(ax, series, labels, colors):
    """Subplot with smoothed values (using moving median)."""
    for s, label, color in zip(series, labels, colors):
        rolling_median = pd.rolling_median(s, window=5)
        ax.plot(s.index, rolling_median, label=label, color=color)
        ymin, ymax = ax.get_ylim()
        plt.ylim(ymin=0, ymax=max(1, ymax * 1.05))
Example #6
def smooth_holidays(df, holidays_df, length=1):
    # TODO: Add support for hours
    mmd_df = pd.rolling_median(df, 7)
    indices = df.loc[holidays_df.index.values].dropna().index.values
    indices = pad_dates(indices, freq='D', length=length)
    df = replace_rows(df, mmd_df, indices)
    return df
Example #7
def get_meet_flt(df_meet, window=8):
    """
    data processing with median filter.

    """
    df_flt = pd.rolling_median(df_meet, window=window)
    return df_flt
Example #8
def rolling_median(x, width):
    """Rolling median with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
    rolled = pd.rolling_median(signal, 2 * wing + 1, center=True)
    return rolled[wing:-wing]
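A sketch of the same mirrored-edge median on pandas 0.18+, where signal (a NumPy array) has to be wrapped in a Series first; check_inputs is assumed from the example above:

def rolling_median_modern(x, width):
    # Same mirrored-edge padding; only the median call changes.
    x, wing = check_inputs(x, width)
    signal = np.concatenate((x[wing - 1::-1], x, x[:-wing - 1:-1]))
    rolled = pd.Series(signal).rolling(2 * wing + 1, center=True).median().values
    return rolled[wing:-wing]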
Example #9
def smooth_holidays(df, holidays_df, length=1):
    """
  Smoothes out peaks and troughs in df, using indices in holidays_df.

  Parameters
  ----------
  df : pd.DataFrame
    Data Frame to smooth.

  holidays_df : pd.DataFrame
    Defines the indices to use when smoothing. The function does not care about the values
    in holidays_df, only whether a given row index is present. This index is then
    used to smooth df.

  length : int, default 1
    This argument is passed to pad_dates(). This defines the 'padding' to the holiday.
    Assume May 1 is a holiday. If length = 1, then April 30, May 1, and May 2 will
    be replaced with a 'smoothed' value.

  Returns
  -------
  smoothed_df : pd.DataFrame
    Data frame with holidays and days leading up to and after, according to length, smoothed.
  """
    # TODO: Add support for hours
    mmd_df = pd.rolling_median(df, 7)
    indices = df.loc[holidays_df.index.values].dropna().index.values
    indices = pad_dates(indices, freq='D', length=length)
    df = replace_rows(df, mmd_df, indices)
    return df
Example #10
def movingmedian(interval, window_size):
    if pandas:  ### use pandas implementation if available
        tmp = numpy.copy(interval)
        if pandas.__version__ >= '0.18.1':
            tmp[window_size:len(interval) - window_size] = numpy.array(
                pandas.Series(tmp).rolling(
                    2 * window_size).median()[2 * window_size - 1:-1])
        else:
            tmp[window_size:len(interval) -
                window_size] = pandas.rolling_median(
                    tmp, 2 * window_size)[2 * window_size - 1:-1]
    else:
        interval = list(interval)
        tmp = numpy.copy(interval)
        A = None
        As = None
        prev = None
        for i in range(window_size, len(interval) - window_size):
            if A is None:
                A = interval[i - window_size:i + window_size]
                ix = numpy.argsort(A)
                As = list(numpy.array(A)[ix])
            else:
                newdata = interval[i + window_size - 1]
                A = A + [newdata]
                bisect.insort(As, newdata)
            if len(As) % 2:
                tmp[i] = As[len(As) // 2]
            else:
                tmp[i] = (As[len(As) // 2 - 1] + As[len(As) // 2]) / 2.
            prev = A.pop(0)
            del As[bisect.bisect_left(As, prev)]
    return tmp
Example #11
    def get_order(cls,
                  data,
                  segments=2,
                  window=7,
                  writer=None,
                  charts=False,
                  verbose=False):
        ''' generate orders from segtrends '''
        price = data[cls.field]
        x_maxima, maxima, x_minima, minima = segtrends(price,
                                                       segments,
                                                       window,
                                                       charts=charts)

        if writer or cls.predict:
            features = cls.get_order_features_from_trend(
                segments, x_maxima, maxima, x_minima, minima)
            vol_pct_change = data['Volume'][-(window +
                                              1):].pct_change()[-window:]
            last = data[cls.field][-1]
            roll_mean_var = (pd.rolling_mean(data[cls.field][-window:],
                                             window)[-1] - last) / last
            roll_median_var = (pd.rolling_median(data[cls.field][-window:],
                                                 window)[-1] - last) / last
            for add in (vol_pct_change, roll_mean_var, roll_median_var):
                features = np.append(features, add)
            if writer:
                writer.writerow(features)

        if cls.predict:
            order = -1 if cls.predict([features]) == 0 else 1
            return order
        else:
            return cls.get_order_from_trend(minima, maxima, verbose)
Example #12
def LowPassFilter(values):
    threshold = values.mean() + 3 * values.std()
    ResE = rolling_median(
        values, window=15,
        center=True).fillna(method='bfill').fillna(method='ffill')
    #ResE = NormValues(ResEtemp)
    return ResE
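On current pandas the same filter reads as below; .bfill()/.ffill() replace the fillna(method=...) form, which was deprecated in pandas 2.1. A minimal sketch, assuming values is a Series as above (the _modern name is illustrative):

def low_pass_filter_modern(values):
    # Centered 15-point rolling median, back- then forward-filled at the edges.
    return values.rolling(window=15, center=True).median().bfill().ffill()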
Example #13
def get_scatter_data_for_code_vol(
        system, instrument_code, rule_name, return_period=5, days=64):

    denom_price = system.rawdata.daily_denominator_price(instrument_code)
    x = system.rawdata.daily_returns(instrument_code)
    vol = robust_vol_calc(x, days)
    perc_vol = 100.0 * divide_df_single_column(vol, denom_price.shift(1))

    volavg = pd.rolling_median(perc_vol, 1250, min_periods=10)
    vol_qq = (perc_vol - volavg) / volavg

    # work out return for the N days after the forecast

    norm_data = system.accounts.pandl_for_instrument_forecast(
        instrument_code, rule_name)

    (vol_qq, norm_data) = align_to_joint(
        vol_qq, norm_data, ffill=(True, False))

    period_returns = pd.rolling_sum(norm_data, return_period, min_periods=1)

    ex_post_returns = period_returns.shift(-return_period)
    lagged_vol = vol_qq.shift(1)

    return (list(ex_post_returns.iloc[:, 0].values), list(
        lagged_vol.iloc[:, 0].values))
Example #14
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    assert_eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    assert_eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    assert_eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    assert_eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    assert_eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    assert_eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    assert_eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    assert_eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    # see note around test_rolling_dataframe for logic concerning precision
    assert_eq(pd.rolling_skew(p, 3),
              dd.rolling_skew(d, 3), check_less_precise=True)
    assert_eq(pd.rolling_kurt(p, 3),
              dd.rolling_kurt(d, 3), check_less_precise=True)
    assert_eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    assert_eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        assert_eq(pd.rolling_window(p, 3, 'boxcar'),
                  dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    assert_eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    assert_eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    assert_eq(pd.rolling_sum(p, 3, min_periods=3),
              dd.rolling_sum(d, 3, min_periods=3))
Example #15
def is_spike(series, window_size=3, threshold=3, scale=True):
    """
    Flags spikes in an array-like object using a median filter of `window_size`
    and a `threshold` for the median difference.  If `scale=False` the
    differences are not scaled by the data standard deviation and the masking
    is "aggressive."

    Examples
    --------
    >>> from pandas import Series, date_range
    >>> series = [33.43, 33.45, 34.45, 90.0, 35.67, 34.9, 43.5, 34.6, 33.7]
    >>> series = Series(series, index=date_range('1980-01-19',
    ...                 periods=len(series)))
    >>> series[is_spike(series, window_size=3, threshold=3, scale=False)]
    1980-01-22    90.0
    1980-01-25    43.5
    dtype: float64
    >>> series[is_spike(series, window_size=3, threshold=3, scale=True)]
    1980-01-22    90.0
    Freq: D, dtype: float64

    """
    # bfill+ffill needs a series and won't affect the median.
    series = Series(series)
    medians = rolling_median(series, window=window_size, center=True)
    medians = medians.fillna(method='bfill').fillna(method='ffill')
    difference = np.abs(series - medians).values
    if scale:
        return difference > (threshold*difference.std())
    return difference > threshold
Example #16
def GapFlat(time, flux, order=3, maxgap=0.125):
    '''

    Parameters
    ----------

    Returns
    -------
    Data with polynomials removed
    '''
    _, dl, dr = FindGaps(time,
                         maxgap=maxgap)  # finds right edge of time windows

    tot_med = np.nanmedian(flux)  # the total from all quarters

    flux_flat = np.array(flux, copy=True)

    for i in range(0, len(dl)):
        krnl = int(float(dl[i] - dr[i]) / 100.0)
        if (krnl < 10):
            krnl = 10
        flux_sm = rolling_median(flux[dl[i]:dr[i]], krnl)

        indx = np.isfinite(flux_sm)

        fit = np.polyfit(time[dl[i]:dr[i]][indx], flux_sm[indx], order)

        flux_flat[dl[i]:dr[i]] = flux[dl[i]:dr[i]] - \
                                 np.polyval(fit, time[dl[i]:dr[i]]) + \
                                 tot_med
    return flux_flat
Example #17
def get_scatter_data_for_code_vol(system,
                                  instrument_code,
                                  rule_name,
                                  return_period=5,
                                  days=64):

    denom_price = system.rawdata.daily_denominator_price(instrument_code)
    x = system.rawdata.daily_returns(instrument_code)
    vol = robust_vol_calc(x, days)
    perc_vol = 100.0 * divide_df_single_column(vol, denom_price.shift(1))

    volavg = pd.rolling_median(perc_vol, 1250, min_periods=10)
    vol_qq = (perc_vol - volavg) / volavg

    ## work out return for the N days after the forecast

    norm_data = system.accounts.pandl_for_instrument_forecast(
        instrument_code, rule_name)

    (vol_qq, norm_data) = align_to_joint(vol_qq,
                                         norm_data,
                                         ffill=(True, False))

    period_returns = pd.rolling_sum(norm_data, return_period, min_periods=1)

    ex_post_returns = period_returns.shift(-return_period)
    lagged_vol = vol_qq.shift(1)

    return (list(ex_post_returns.iloc[:, 0].values),
            list(lagged_vol.iloc[:, 0].values))
Example #18
def calc_vol_profiles(full_df):
    full_df['dpvolume_med_21'] = np.nan
    full_df['dpvolume_std_21'] = np.nan
    full_df['dpvolume'] = full_df['dvolume'] * full_df['dvwap']
    print("Calculating trailing volume profile...")
    for timeslice in [
            '09:45', '10:00', '10:15', '10:30', '10:45', '11:00', '11:15',
            '11:30', '11:45', '12:00', '12:15', '12:30', '12:45', '13:00',
            '13:15', '13:30', '13:45', '14:00', '14:15', '14:30', '14:45',
            '15:00', '15:15', '15:30', '15:45', '16:00'
    ]:
        timeslice_df = full_df[['dpvolume', 'tradable_med_volume_21', 'close']]
        timeslice_df = timeslice_df.unstack().between_time(
            timeslice, timeslice).stack()
        timeslice_df = timeslice_df.dropna()
        if len(timeslice_df) == 0: continue
        timeslice_df['dpvolume_med_21'] = timeslice_df['dpvolume'].groupby(
            level='sid').apply(lambda x: pd.rolling_median(x.shift(1), 21))
        timeslice_df['dpvolume_std_21'] = timeslice_df['dpvolume'].groupby(
            level='sid').apply(lambda x: pd.rolling_std(x.shift(1), 21))
        m_df = timeslice_df.dropna()
        print(m_df.head())
        print("Average dvol frac at {}: {}".format(
            timeslice,
            (m_df['dpvolume_med_21'] /
             (m_df['tradable_med_volume_21'] * m_df['close'])).mean()))
        full_df.ix[timeslice_df.index,
                   'dpvolume_med_21'] = timeslice_df['dpvolume_med_21']
        full_df.ix[timeslice_df.index,
                   'dpvolume_std_21'] = timeslice_df['dpvolume_std_21']

    return full_df
Example #19
def rolling_median(x, width):
    """Rolling median with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing - 1::-1], x, x[:-wing - 1:-1]))
    rolled = pd.rolling_median(signal, 2 * wing + 1, center=True)
    return rolled[wing:-wing]
Example #20
def plot_rolling_subplot(ax, series, labels, colors):
    """Subplot with smoothed values (using moving median)."""
    for s, label, color in zip(series, labels, colors):
        rolling_median = pd.rolling_median(s, window=5)
        ax.plot(s.index, rolling_median, label=label, color=color)
        ymin, ymax = ax.get_ylim()
        plt.ylim(ymin=0, ymax=max(1, ymax * 1.05))
Example #21
def despike(data, window=20, n_deviation=0.35):
    df = pd.DataFrame(data)
    data_median_pd = df.copy()
    data_diff_pd = df.copy()
    outlier_idx =  df.copy()
    data_despiked = df.copy()
    row,column = data.shape
    
    for n in range(1,column):
        data_median_pd[n] = rolling_median(df[n], window=window, center=True).fillna(method='bfill').fillna(method='ffill')
        data_diff_pd[n] = np.abs(df[n] - data_median_pd[n]) # deviation from the rolling median

        outlier_idx[n] = data_diff_pd[n] > n_deviation*df[n].std() # spike: if the deviation is much more than the global standard deviation

        data_despiked[n][outlier_idx[n]] = data_median_pd[n][outlier_idx[n]] # replaced the spikes with the rolling median values

        if outlier_idx[n].any(): # plot if any spike is detected
            # the following plots most of the relevant calculations
            #smart_plot([  df[0], df[0][outlier_idx[n] ], df[0], df[0], df[0],df[0] ], [ df[n], df[n][outlier_idx[n]], data_median_pd[n], data_diff_pd[n], df_mad, data_despiked[n] ], x_label='Energy (eV)', y_label='PL',
            #          label = ['data', 'spikes', 'median', 'data-median', 'deviation', 'despiked' ],lines=['-','x','--', '-','-','-'], ms=[1,15,1,1,1,1], annotate = 3, figsize=(16,10))


            smart_plot([ df[0], df[0]  ], [ df[n],  data_despiked[n] ], legend_title='Removing Spikes',label = [data_labels[n], 'After removal'], x_label='Energy (eV)', y_label='PL', lines=['-','-'], ms=[1,15], annotate = 0)

    plt.show()
    return data_despiked
Example #22
def create_preds(language):

    TRAIN_FILE = language + "_train.csv"
    TEST_FILE = language + "_test.csv"
    PREDS_FILE = language + "_preds.csv"

    TRAIN_PATH = os.path.join(os.getcwd(), "train_test", TRAIN_FILE)
    TEST_PATH = os.path.join(os.getcwd(), "train_test", TEST_FILE)
    PREDS_PATH = os.path.join(os.getcwd(), "preds", PREDS_FILE)

    train = pd.read_csv(TRAIN_PATH)
    test = pd.read_csv(TEST_PATH)

    train_flattened = pd.melt(train[list(train.columns[-61:-1]) + ['Page']],
                              id_vars='Page',
                              var_name='date',
                              value_name='Visits')
    grouped = train_flattened.groupby(['Page'])['Visits']
    rolling_meds = pd.rolling_median(grouped, window=7)

    test['Visits'] = rolling_meds.values
    test.loc[test.Visits.isnull(), 'Visits'] = 0.0

    test[['Id', 'Visits']].to_csv(PREDS_PATH, index=False)

    del train, test, train_flattened, grouped, rolling_meds
    gc.collect()
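Calling pd.rolling_median directly on a GroupBy object, as above, is not possible on modern pandas; a hedged sketch of the same per-page rolling median using the groupby-rolling chain (assuming the same train_flattened frame; note the result comes back ordered by group):

    rolling_meds = (train_flattened.groupby('Page')['Visits']
                    .rolling(window=7)
                    .median()
                    .reset_index(level='Page', drop=True))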
Example #23
def is_spike(series, window_size=3, threshold=3, scale=True):
    """
    Flags spikes in an array-like object using a median filter of `window_size`
    and a `threshold` for the median difference.  If `scale=False` the
    differences are not scaled by the data standard deviation and the masking
    is "aggressive."

    Examples
    --------
    >>> from pandas import Series, date_range
    >>> series = [33.43, 33.45, 34.45, 90.0, 35.67, 34.9, 43.5, 34.6, 33.7]
    >>> series = Series(series, index=date_range('1980-01-19',
    ...                 periods=len(series)))
    >>> series[is_spike(series, window_size=3, threshold=3, scale=False)]
    1980-01-22    90.0
    1980-01-25    43.5
    dtype: float64
    >>> series[is_spike(series, window_size=3, threshold=3, scale=True)]
    1980-01-22    90.0
    Freq: D, dtype: float64

    """
    # bfill+ffill needs a series and won't affect the median.
    series = Series(series)
    medians = rolling_median(series, window=window_size, center=True)
    medians = medians.fillna(method='bfill').fillna(method='ffill')
    difference = np.abs(series - medians).values
    if scale:
        return difference > (threshold * difference.std())
    return difference > threshold
Example #24
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    assert_eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    assert_eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    assert_eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    assert_eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    assert_eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    assert_eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    assert_eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    assert_eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    # see note around test_rolling_dataframe for logic concerning precision
    assert_eq(pd.rolling_skew(p, 3),
              dd.rolling_skew(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_kurt(p, 3),
              dd.rolling_kurt(d, 3),
              check_less_precise=True)
    assert_eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    assert_eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    assert_eq(pd.rolling_window(p, 3, win_type='boxcar'),
              dd.rolling_window(d, 3, win_type='boxcar'))
    # Test with edge-case window sizes
    assert_eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    assert_eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    assert_eq(pd.rolling_sum(p, 3, min_periods=3),
              dd.rolling_sum(d, 3, min_periods=3))
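For reference, each function exercised in these tests maps one-to-one onto the method-based API that replaced it in pandas 0.18. A sketch of the correspondence (the min_periods=0 on count mirrors the old rolling_count behaviour and is an assumption worth checking against your pandas version):

    # Function-based API (removed)        Method-based API (pandas >= 0.18)
    # pd.rolling_count(p, 3)           -> p.rolling(3, min_periods=0).count()
    # pd.rolling_sum(p, 3)             -> p.rolling(3).sum()
    # pd.rolling_mean(p, 3)            -> p.rolling(3).mean()
    # pd.rolling_median(p, 3)          -> p.rolling(3).median()
    # pd.rolling_std(p, 3)             -> p.rolling(3).std()
    # pd.rolling_var(p, 3)             -> p.rolling(3).var()
    # pd.rolling_quantile(p, 3, 0.5)   -> p.rolling(3).quantile(0.5)
    # pd.rolling_apply(p, 3, mad)      -> p.rolling(3).apply(mad)
    # pd.rolling_window(p, 3, 'boxcar') -> p.rolling(3, win_type='boxcar').mean()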
Example #25
 def _remove_outliers(self):
     print "Removing outliers for model fitting"
     temp = []
     for i in [0, 1]:
         median_norm = pd.rolling_median(self.fcArray[i], window=self.rollingMedianWindow, center=True)
         difference = np.abs(self.fcArray[i] - median_norm)
         temp.append(self.fcArray[i][difference < self.outlierThreshold])
     return np.asarray(temp)
Example #26
def cleanVWC(df, outDir):
    # Specify the threshold and windowsize for the filter. Thresh is in
    # units of % (VWC), and windowsize counts the number of 15 minute
    # intervals over which to assess the threshold.
    thresh = 30
    windowsize = 96

    # We're going to output the results of the outlier detection to a
    # summary figure for review. Setup the plot structure outside the loop.
    f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
    plt.subplots_adjust(hspace=0.5)
    kw = dict(marker='o', linestyle='none', color='r', markersize=10)
    idx = 0

    # Iterate over the VWC columns in the metstation DF
    for VWCvar in ['VWCC', 'VWCD']:

        # Create column identifiers for the threshold and filtered values
        filtThresh = VWCvar + '_fT'
        filteredVWC = VWCvar + '_f'

        # Rolling median filter with specified window size
        df[filtThresh] = pd.rolling_median(
            df[VWCvar], window=windowsize,
            center=True).fillna(method='bfill').fillna(method='ffill')

        # Test filtered values against the specified threshold
        difftest = np.abs(df[VWCvar] - df[filtThresh])

        # Boolean for values that do not pass the test
        outlier_pos = difftest > thresh

        # Replace filtered values with NaN, so long as there are identified outliers
        df[filteredVWC] = df[VWCvar]
        if outlier_pos[outlier_pos == True].size > 0:
            df[filteredVWC][outlier_pos] = np.nan

        # populate the plot
        df[VWCvar].plot(ax=f.axes[idx], color='gray')
        if outlier_pos[outlier_pos == True].size > 0:
            df[VWCvar][outlier_pos].plot(ax=f.axes[idx], **kw)
            f.axes[idx].set_ylim([0, 105])
        df[filteredVWC].plot(ax=f.axes[idx + 2], color='gray')
        f.axes[idx].set_title(VWCvar)
        idx += 1

    for ax in f.axes:
        ax.set_xlabel('')
    ax1.set_ylabel('Soil VWC (%)')
    ax3.set_ylabel('Soil VWC \nfiltered (%)')

    plt.tight_layout()
    sns.despine()

    plotStationName = df['Locale'][0] + '_' + str(df['LoggerID'][0]) + '_'
    plt.savefig(outDir + 'QAQC/' + plotStationName + 'VWC_Filt.tif')
    plt.close()
    return df
Example #27
    def rollingStats(self, selectCol=[], splitCol=None, sepCol=None, startTime=None, endTime=None, window=60, quantile=0.1, freq='10s', min_periods=5):
        
        df = self.dfSetup()
        
        ## Selects a list of columns to use and splits a column into single type if it contains more than one
        # eg. if a file contains multiple sensor readings 
        if (len(selectCol) > 0):
            dfSub = df[selectCol]
            
        else:
            dfSub = df
        
        if (splitCol and sepCol):
            dfSub = dfSub[dfSub[splitCol] == sepCol]
        
        ## Converts datetime column to datatime object index, then use it to create time slices
        # Time format '2015-10-17 09:00:00' May use the dfOther to use other data frames
        if (startTime and endTime):
            dfSub = dfSub[ startTime : endTime ]
        
        else:
            dfSub = dfSub
        
        if (splitCol):
            dfSub = dfSub.drop(splitCol, axis=1) # Remove columns used to split entries
        
        
        valueName = dfSub.columns.values[0]
        outList = []
        
        counts = pd.rolling_count(dfSub,window,freq=freq).rename(columns = {valueName:'rolling_counts'})
        outList.append(counts)
        
        means = pd.rolling_mean(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_mean'})
        outList.append(means)
        
        rms = np.sqrt(pd.rolling_mean(dfSub**2, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_rms'}) )
        outList.append(rms)
        
        medians = pd.rolling_median(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_median'})
        outList.append(medians)
        
        stds = pd.rolling_std(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_std'})
        outList.append(stds)
        
        mins = pd.rolling_min(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_min'})
        outList.append(mins)
        
        maxs = pd.rolling_max(dfSub, window, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_max'})
        outList.append(maxs)
        
        quants = pd.rolling_quantile(dfSub, window, quantile, min_periods=min_periods, freq=freq).rename(columns = {valueName:'rolling_quantile'})
        outList.append(quants)

        
        dfOut = pd.concat(outList, axis=1)

        return dfOut
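The freq= keyword accepted by these rolling_* calls was dropped along with the function API; under the method API the frequency conversion is an explicit resample step. A minimal sketch for the median statistic above, assuming dfSub, valueName, window, and min_periods as built inside rollingStats (the choice of .median() as the resampling aggregation is an assumption, since the old freq= conversion applied its own conforming rule):

        resampled = dfSub.resample('10s').median()
        medians = (resampled.rolling(window, min_periods=min_periods)
                   .median()
                   .rename(columns={valueName: 'rolling_median'}))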
Example #28
 def plot_trend(self, statistic):
     '''Plots trends of specified statistic over time in a specified facility'''
     self.df[statistic].plot()
     stat_smooth = pd.rolling_median(self.df[statistic], 10)
     #plt.plot(stat_smooth, label = 'rolling({k})'.format(k=statistic))
     stat_smooth.plot(label = 'rolling({k})'.format(k=statistic))
     plt.title(statistic + " in " + self.name)
     plt.legend(loc = "best")
     plt.show()
Example #29
 def thermal_correction(self, context):
     if not self.num_of_freqs_to_thermal_adjust or self.num_of_freqs_to_thermal_adjust > len(self.big_frequencies):
         return 0
     freqs = self.big_frequencies[-self.num_of_freqs_to_thermal_adjust:]
     spec = context.result.spec
     if spec.frequency not in freqs:
         return 0
     data_path = os.path.join(context.output_directory, 'daq', '{}.csv'.format(self.big_core))
     data = pd.read_csv(data_path)['power']
     return _adjust_for_thermal(data, filt_method=lambda x: pd.rolling_median(x, 1000), thresh=0.9, window=5000)
Example #30
def compare_window_sizes(series, sizeList):
    plots = [pd.rolling_median(series, size).plot() for size in sizeList]
    plt.title('Comparison of rolling_median() with different window sizes')
    plt.legend(sizeList)
    plt.show()

    plots = [pd.rolling_mean(series, size).plot() for size in sizeList]
    plt.title('Comparison of rolling_mean() with different window sizes')
    plt.legend(sizeList)
    plt.show()
Example #31
def rolling_median(x, width):
    """Rolling median with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
    with warnings.catch_warnings():
        # NB: in pandas 0.18+ this function is deprecated
        warnings.simplefilter("ignore", FutureWarning)
        rolled = pd.rolling_median(signal, 2 * wing + 1, center=True)
    return rolled[wing:-wing]
Example #32
def stepfilt(x, delta=3, window=7):
    assert window % 2 != 0, 'window size must be odd'
    n = window // 2  # integer half-window, needed for np.repeat and slicing
    v = np.r_[np.repeat(x[0], n), x, np.repeat(x[-1], n)] # expanded arr
    m = pd.rolling_median(v, window, center=True)         # filtered arr
    for i in range(len(m)-1):
        diff = m[i+1] - m[i]
        if np.abs(diff) > delta:
            v[i+1:] -= diff
    return v[n:-n], m[n:-n]
Example #33
 def median_detrend(self, window=75):
     #rolling median to normalise the flux and flux_err read from archive
     f = self._flux.copy()
     f[self.any_intransit] = np.nan
     f_median = pd.rolling_median(f, window, center=True,
                                  min_periods=1)
     self._detrended_flux = self._flux / f_median
     self._detrended_flux_err = self._flux_err / f_median
     nonan = [i for i in self._detrended_flux_err if not np.isnan(i)]
     self._detrended_flux_err_max = np.max(nonan)
Example #34
 def thermal_correction(self, context):
     if not self.num_of_freqs_to_thermal_adjust or self.num_of_freqs_to_thermal_adjust > len(self.big_frequencies):
         return 0
     freqs = self.big_frequencies[-self.num_of_freqs_to_thermal_adjust:]
     spec = context.result.spec
     if spec.frequency not in freqs:
         return 0
     data_path = os.path.join(context.output_directory, 'daq', '{}.csv'.format(self.big_core))
     data = pd.read_csv(data_path)['power']
     return _adjust_for_thermal(data, filt_method=lambda x: pd.rolling_median(x, 1000), thresh=0.9, window=5000)
Example #35
def averaged_median_arr(e):
    rolling_median_window = 3
    try:
        if len(e) >= rolling_median_window:
            e = pandas.rolling_median(
                e, window=rolling_median_window, min_periods=1)
        agg = numpy.mean(e)
    except ValueError:
        agg = None  
    return agg
Example #36
def stepfilt(x, delta=3, window=7):
    assert window % 2 != 0, 'window size must be odd'
    n = window // 2  # integer half-window, needed for np.repeat and slicing
    v = np.r_[np.repeat(x[0], n), x, np.repeat(x[-1], n)]  # expanded arr
    m = pd.rolling_median(v, window, center=True)  # filtered arr
    for i in range(len(m) - 1):
        diff = m[i + 1] - m[i]
        if np.abs(diff) > delta:
            v[i + 1:] -= diff
    return v[n:-n], m[n:-n]
Example #37
def rolling_median(x, width):
    """Rolling median with mirrored edges."""
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing - 1::-1], x, x[:-wing - 1:-1]))
    with warnings.catch_warnings():
        # NB: in pandas 0.18+ this function is deprecated
        warnings.simplefilter("ignore", FutureWarning)
        rolled = pd.rolling_median(signal, 2 * wing + 1, center=True)
    return rolled[wing:-wing]
Example #38
def get_outliers_multiple_filter(data_frame, component, filter_list):
    import pandas as pd
    from scipy import stats
    import math

    print('Started : get_outliers_multiple_filter')

    if filter_list.z_score is not None:
        data_frame[component + '_z'] = np.abs(stats.zscore(data_frame[component]))
        print('Z-score calculation done.')
    if filter_list.normal_disb is not None:
        mean = data_frame[component].mean()
        std = data_frame[component].std()
        print('Mean and std calculation done.')
    if filter_list.quantile is not None:
        q1 = data_frame[component].quantile(0.25)
        q3 = data_frame[component].quantile(0.75)
        iqr = q3-q1 #Interquartile range
        fence_low  = q1 - filter_list.quantile.interquartile_range_scale*iqr
        fence_high = q3 + filter_list.quantile.interquartile_range_scale*iqr
        print('quantile calculation done.')
    if filter_list.rolling_medians is not None:
        from pandas import rolling_median
        data_frame['r_median'] = rolling_median(data_frame[component], window=3, center=True).fillna(method='bfill').fillna(method='ffill')
        data_frame['r_median_diff'] = np.abs(data_frame[component] - data_frame['r_median'])
        print('Rolling calculation done.')

    index_list_to_drop = []
    for index, row in data_frame.iterrows():
        if filter_list.ab_ignore_min_max is not None:
            if row[component]< filter_list.ab_ignore_min_max.min or row[component]> filter_list.ab_ignore_min_max.max:
                index_list_to_drop.append(index)
        if filter_list.unreal_total_field is not None:
            if math.isnan(row['F']) or row['F'] < filter_list.unreal_total_field.total_field_min or row['F']> filter_list.unreal_total_field.total_field_max:
                index_list_to_drop.append(index)
        if filter_list.ab_ignore_sudden_inc is not None:
            int_index = data_frame.index.get_loc(index)
            if int_index > 1 and abs(row[component] - data_frame.iloc[int_index - 1][component]) > filter_list.ab_ignore_sudden_inc.threshold:
                index_list_to_drop.append(index)
        if filter_list.z_score is not None:
            if row[component + '_z'] > filter_list.z_score.threshold:
                index_list_to_drop.append(index)
        if filter_list.normal_disb is not None:
            if (row[component] < (mean - filter_list.normal_disb.SD_range_scalar * std)) or (row[component] > (mean + filter_list.normal_disb.SD_range_scalar * std)):
                index_list_to_drop.append(index)
        if filter_list.quantile is not None:
            if (row[component] < fence_low) or (row[component] > fence_high):
                index_list_to_drop.append(index)
        if filter_list.rolling_medians is not None:
            if row['r_median_diff'] > filter_list.rolling_medians.threshold:         
                index_list_to_drop.append(index)
            
    result = data_frame.loc[index_list_to_drop]
    print('Done : get_outliers_multiple_filter')
    return result
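from pandas import rolling_median fails on pandas 0.18+; the rolling-median columns above can be built with the accessor instead. A sketch, using the same frame and component:

    data_frame['r_median'] = (data_frame[component]
                              .rolling(window=3, center=True)
                              .median().bfill().ffill())
    data_frame['r_median_diff'] = np.abs(data_frame[component] - data_frame['r_median'])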
Example #39
def averaged_median_arr(e):
    rolling_median_window = 3
    try:
        if len(e) >= rolling_median_window:
            e = pandas.rolling_median(e,
                                      window=rolling_median_window,
                                      min_periods=1)
        agg = numpy.mean(e)
    except ValueError:
        agg = None
    return agg
Example #40
 def median_detrend(self, window=75):
     #rolling median to normalise the flux and flux_err read from archive
     f = self._flux.copy()
     f[self.any_intransit] = np.nan
     f_median = pd.rolling_median(f, window, center=True, min_periods=1)
     self._detrended_flux = self._flux / f_median
     self._detrended_flux_err = self._flux_err / f_median
     nonan = [i for i in self._detrended_flux_err if not np.isnan(i)]
     self._detrended_flux_err_max = np.max(nonan)
Example #41
def RemoveOutlier(values):
    threshold = values.mean() + 3 * values.std()
    ResEtemp = rolling_median(
        values, window=15,
        center=True).fillna(method='bfill').fillna(method='ffill')
    difference = np.abs(values - ResEtemp)
    outlier_idx = difference > threshold
    #outlier_idx = values > threshold
    values[outlier_idx] = threshold
    ##ResE = NormValues(ResEtemp)
    return values
Example #42
def phase_areas(lv_seg):
    '''
    determines approximate end-systole and end-diastole frame indices
    '''
    lv_segs = remove_periphery(lv_seg)
    lv_areas = extract_areas(lv_segs)
    lv_areas = rolling_median(pd.DataFrame(lv_areas)[0], window=3, center=True).fillna(method='bfill').fillna(method='ffill').tolist() 
    x, y = smooth_fft(lv_areas, 2500)
    frame10 = np.argsort(y)[np.int(0.10*len(y))]
    frame90 = np.argsort(y)[np.int(0.90*len(y))]
    return frame10, frame90
Example #43
def ProfilePlotVerificador(Data, H, surface,Data2=None, H2=None,scale='log', \
                           labelData='Variable', fecha=dt.datetime(1992,1,15),\
                           name='Prueba.png'):
    """
    Plot a CALIPSO single profile of data
    INPUTS:
    Data : array 1D of data to plot
    H    : Height of data [km]
    Data2: array 1D of data to plot of LIDAR ground based
    H2   : Height of data of LIDAR ground based [km]
    scale : Type of scale ["linear", "log", "symlog", "logit"]
    labelData: Name to show in the data plot
    fecha : datetime of the data
    name  : name of the outputfile

    OUTPUTS
    file saved in the Path_fig folder
    """
    plt.cla()
    plt.clf()
    plt.close('all')

    fig = plt.figure(figsize=(10, 16))
    ax1 = fig.add_axes([0, 0, 1, 1])
    idx = np.where(H >= surface)[0]
    ax1.plot(Data[idx], H[idx] - H[idx][0], color=AzulChimba, alpha=0.7)
    # ax.plot(Data,H, color=AzulChimba, alpha=0.7)
    # media movil
    a = pd.DataFrame(Data[idx], index=H[idx] - H[idx][0])
    # a = pd.DataFrame(Data, index=H)
    c = pd.rolling_median(a, window=10, min_periods=1, center=True)
    ax1.plot(c.values.ravel(), c.index.values, linewidth=2, color=Azulillo)
    ax1.set_xscale(scale)
    ax1.set_ylabel('Altitude [km]')
    ax1.set_xlabel(labelData)
    if Data2 is not None:
        ax1.plot(Data2[idx], H2[idx] - H2[idx][0], color=Naranja, alpha=0.7)
        a = pd.DataFrame(Data2[idx], index=H2[idx] - H2[idx][0])
        c = pd.rolling_median(a, window=10, min_periods=1, center=True)
        ax1.plot(c.values.ravel(), c.index.values, linewidth=2, color=Naranja)
    plt.savefig(Path_fig + name, transparent=True, bbox_inches='tight')
Example #44
    def estimate_diff(self, s1, s2):
        l1 = float(len(s1))
        l2 = float(len(s2))

        # Don't estimate time series that have huge difference in number of
        # elements
        if max(l1, l2) / min(l1, l2) > self.MAX_SIZE_DIFF:
            return -1

        # Don't estimate very small time series
        if min(l1, l2) < self.MIN_DATA_POINTS:
            return -1

        # Heuristic coefficient
        confidence = 0.0

        # Compare correlation coefficient
        if s1.corr(s2) < 0.9:
            confidence += 1.0
        # Compare mean values
        diff = abs(s1.mean() - s2.mean())
        if diff > self.ALLOWED_NOISE * s1.mean():
            confidence += 1.0
        # Compare 4 different percentiles
        for q in (0.5, 0.75, 0.9, 0.95):
            diff = abs(s1.quantile(q) - s2.quantile(q))
            if diff > self.ALLOWED_NOISE * s1.quantile(q):
                confidence += 0.5
        # Compare maximum values
        diff = abs(s1.max() - s2.max())
        if diff > self.ALLOWED_NOISE * s1.max():
            confidence += 1.0

        # Primary trend comparison
        t1 = pd.rolling_median(s1, window=5)
        t2 = pd.rolling_median(s2, window=5)
        if abs((t1 - t2).mean()) > self.ALLOWED_NOISE * t1.mean():
            confidence += 2.0

        # Return confidence as rounded percentage value
        return round(100 * confidence / self.MAX_CONFIDENCE)
Example #45
    def estimate_diff(self, s1, s2):
        l1 = float(len(s1))
        l2 = float(len(s2))

        # Don't estimate time series that have huge difference in number of
        # elements
        if max(l1, l2) / min(l1, l2) > self.MAX_SIZE_DIFF:
            return -1

        # Don't estimate very small time series
        if min(l1, l2) < self.MIN_DATA_POINTS:
            return -1

        # Heuristic coefficient
        confidence = 0.0

        # Compare correlation coefficient
        if s1.corr(s2) < 0.9:
            confidence += 1.0
        # Compare mean values
        diff = abs(s1.mean() - s2.mean())
        if diff > self.ALLOWED_NOISE * s1.mean():
            confidence += 1.0
        # Compare 4 different percentiles
        for q in (0.5, 0.75, 0.9, 0.95):
            diff = abs(s1.quantile(q) - s2.quantile(q))
            if diff > self.ALLOWED_NOISE * s1.quantile(q):
                confidence += 0.5
        # Compare maximum values
        diff = abs(s1.max() - s2.max())
        if diff > self.ALLOWED_NOISE * s1.max():
            confidence += 1.0

        # Primary trend comparison
        t1 = pd.rolling_median(s1, window=5)
        t2 = pd.rolling_median(s2, window=5)
        if abs((t1 - t2).mean()) > self.ALLOWED_NOISE * t1.mean():
            confidence += 2.0

        # Return confidence as rounded percentage value
        return round(100 * confidence / self.MAX_CONFIDENCE)
Example #46
def strat(M,g,j,X_code,Y_code,X_close,X_volume,Y_close,Y_volume):
  """
  This function creates a dataframe with results to a spread trading strategy
  (see HW2 of FINM 33150 - Quantitative Strategies and Regression)
  Inputs:
  M ~ return difference calculation time frame.  M cannot exceed the number of 
  trading days between 2013-12-02 and 2014-01-01
  g ~ entering threshold
  j ~ exiting threshold
  X_code ~ Quandl code for X
  Y_code ~ Quandl code for Y
  X_close ~ X column name for close
  X_volume ~ X column name for volume
  Y_close ~ Y column name for close
  Y_volume ~ Y column name for volume
  Example of calling function:
  strat(10,0.01,0.008,'GOOG/NYSE_XSD','YAHOO/SMH','GOOG.NYSE_XSD - Close',
  'GOOG.NYSE_XSD - Volume','YAHOO.SMH - Close','YAHOO.SMH - Volume')
  """
  # grab data using Quandl
  ETF_data = Quandl.get(list((X_code,Y_code)),authtoken=auth,trim_start=start_date,trim_end=end_date,returns="pandas")
  df = pd.DataFrame(ETF_data.ix[:,(X_close,X_volume,Y_close,Y_volume)]) #subset
  df.columns = ['XP','XV','YP','YV']
  df['XDDV'] = df.XP*df.XV # calculate daily dollar volumes
  df['Nt'] = pd.rolling_median(df.XDDV,15).shift(1)# 15 day rolling median
  K = np.max(2*df.Nt) # capital - set K now that we have Nt
  df['XR'] = np.log(df.XP) - np.log(df.XP.shift(1)) #logrets
  df['YR'] = np.log(df.YP) - np.log(df.YP.shift(1))
  df['Delta'] = df.XR-df.YR # difference of X and Y
  df['DeltaM'] = pd.rolling_sum(df.Delta,M).shift(1) #M day historical accumulated difference 
  df = df[df.index >= trade_begin] # drop unnecessary date range
  df['Signal'] = np.nan # add empty trade signal column
  df.Signal[df.DeltaM > g] = 1 # entering or maintaining trade
  df.Signal[df.DeltaM < -g] = -1 # entering or maintaining trade
  df.Signal[np.abs(df.DeltaM) < j] = 0 # exiting or out of trade
  df['EOM'] = np.nan # end of month
  df.EOM[(df.shift(1,freq='B').index.day <= 3) & (df.shift(1,freq='B').index.day-df.index.day < -1)] = 1 # day before 1st day
  df.Signal[(df.shift(1,freq='B').index.day <= 3) & (df.shift(1,freq='B').index.day-df.index.day < -1)] = 0
  df.Signal[((df.Signal == -1) & (df.DeltaM > j)) | (df.Signal == 1) & (df.DeltaM < j)] = 0
  for i in range(1,len(df)):
    if np.isnan(df.Signal[i]):# if between g and j
      df.Signal[i] = df.Signal[i-1] # fill in with current position
  df['Entry'] = 1*(((df.Signal == 1) | (df.Signal == -1)) & ((df.shift(1).Signal == 0) | (np.isnan(df.shift(1).Signal) == True))) # entry point
  df.Entry[((df.Signal == -1) & (df.shift(1).Signal == 1)) | ((df.Signal == 1) & (df.shift(1).Signal == -1))] = 1 # jumping g to -g or vice versa
  df['Exit'] = 1*((df.Signal == 0) & ((df.shift(1).Signal == 1) | ((df.shift(1).Signal == -1)))) # exit point
  df['Nx'] = np.round(-df.Signal*df.Nt/100/df.XP,0) # size of X trade
  df['Ny'] = np.round(df.Signal*df.Nt/100/df.YP,0) # size of Y trade
  df['Profit'] = pd.DataFrame((df.Nx.shift(1)*df.XP.shift(1)*df.XR)+df.Ny.shift(1)*df.YP.shift(1)*df.YR) # dollar profit(loss)
  df['Cum_Profit'] = np.cumsum(df.Profit) #cumulative profit
  df['K'] = np.round(K + df.Cum_Profit,0) # capital based on changes in profit  
  df['Return'] = 252*df.Profit/df.K.shift(1) # annualised returns  
  return df
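Chained assignments like df.Signal[cond] = 1 in strat are unreliable on modern pandas (SettingWithCopy territory); the .loc form expresses the same updates explicitly. Illustrative lines for the first three signal assignments, not a full rewrite:

    df.loc[df.DeltaM > g, 'Signal'] = 1
    df.loc[df.DeltaM < -g, 'Signal'] = -1
    df.loc[np.abs(df.DeltaM) < j, 'Signal'] = 0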
Example #47
 def get_time_series(self, observables):
     for ol in observables:
         for observable in ol:
             if observable:
                 raw_data = self.seriesly.query_data(observable)
                 if raw_data:
                     s = pd.Series(raw_data)
                     if len(s.unique()) == 1:
                         continue
                     s = pd.rolling_median(s, window=3)
                     title = Plotter.generate_title(observable)
                     yield title, s
Example #48
    def rolling_median(self, data_frame, periods):
        """
        rolling_median - Calculates the rolling median

        Parameters
        ----------
        data_frame : DataFrame
            contains time series
        periods : int
            number of periods in the median

        Returns
        -------
        DataFrame
        """
        return pandas.rolling_median(data_frame, periods)
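On pandas 0.18+ the wrapper body becomes a single method chain; a minimal sketch of the same method:

    def rolling_median(self, data_frame, periods):
        """rolling_median - Calculates the rolling median (pandas >= 0.18)."""
        return data_frame.rolling(periods).median()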
Example #49
def rolling_median(x, width):
    """Rolling median with mirrored edges.

    Contributed by Peter Otten to comp.lang.python.
    This is (somehow) faster than pandas' Cythonized skip-list implementation
    for arrays smaller than ~100,000 elements.

    Source:
    https://bitbucket.org/janto/snippets/src/tip/running_median.py
    https://groups.google.com/d/msg/comp.lang.python/0OARyHF0wtA/SEs-glW4t6gJ
    """
    x, wing = check_inputs(x, width)
    # Pad the edges of the original array with mirror copies
    signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
    rolled = pd.rolling_median(signal, 2 * wing + 1, center=True)
    return rolled[wing:-wing]
Example #50
def clean_data( df , var , window = 3, threshold = 0.5):
    from pandas import rolling_median
    from numpy import abs

    original_columns = df.columns

    rolled_median = rolling_median(
        df[var],
        window = window,
        center = True,
    )

    df['this_mean'] = ( df[var].mean() + rolled_median ) / 2.

    df = df[ abs( df[var] - df.this_mean ) <= threshold * df[var].std() ]

    return df[ original_columns ]
Example #51
def interpolate_outliers(angle, data, threshold=0.5, window=12, plot_me=False):
    """
    Function to smooth outliers from the data set. Applies rolling-median
    smoothing and cyclic boundary conditions. Threshold
    is set by:
    threshold - number of standard deviations from average which defines outliers
    window - number of points in each direction used for average
    """
    df = pd.DataFrame({"parameter": data}, index=angle)
    # mean_data = np.mean(df['parameter'])
    df["data_mean"] = (
        pd.rolling_median(df["parameter"].copy(), window=window, center=True)
        .fillna(method="bfill")
        .fillna(method="ffill")
    )
    difference = np.abs(df["parameter"] - df["data_mean"])  # mean_data)#
    outlier_idx = difference > threshold * df["parameter"].std()
    # df['data_mean'].plot()
    #    s = df['parameter'].copy()
    #    s[outlier_idx] = np.nan
    #    s.interpolate(method='spline', order=1, inplace=True)
    #    df['cleaned_parameter'] = s
    tst = np.array(outlier_idx)
    datamean = np.array(df["data_mean"])
    s = np.array(df["parameter"])
    itms = len(outlier_idx)
    for i in range(itms):
        if (
            tst[i] == True
            or tst[(i - 1) % itms] == True
            or tst[(i + 1) % itms] == True
            or tst[(i - 2) % itms] == True
            or tst[(i + 2) % itms] == True
        ):
            tmp = datamean[i]
            s[i] = tmp
    # print s
    df["cleaned_parameter"] = s

    if plot_me == True:
        figsize = (7, 2.75)
        fig, ax = plt.subplots(figsize=figsize)
        df["parameter"].plot(title="cleaned vs unclean Parameter")
        df["cleaned_parameter"].plot()
        ax.set_ylim(min(df["cleaned_parameter"]), max(df["cleaned_parameter"]))
    return np.array(df["cleaned_parameter"])
Example #52
 def noisyUser(df,by,col = 'Power',window = 9):
     """
     Define user with maximum signal noise.
     :param df: pd.DataFrame contains several user's and per one laccid  {pd.DataFrame}
     :param col: column to compute {'str'}
     :param window: rolling window length {'int'}
     :return: name of user {'str'}
     """
     maxNoise = 0
     noises = {}
     grouped = df.groupby(by)
     for user,gr in grouped:
         user_fltrd = pd.rolling_median(gr[col],window,center = True)
         noisy_part = gr[user_fltrd != gr[col]].shape[0]/float(gr.shape[0])
         if noisy_part > maxNoise:
             noisyUser = user
             maxNoise = noisy_part
         noises.update({user:noisy_part})
     return noisyUser,noises
Example #53
 def test_ts_median(self):
     self.env.add_operator('ts_median', {
         'operator': OperatorTSMedian,
         'arg1': {'value': [3, 5]},
         })
     string1 = 'ts_median(2, open1)'
     gene1 = self.env.parse_string(string1)
     self.assertFalse(gene1.validate())
     string2 = 'ts_median(3, open1)'
     gene2 = self.env.parse_string(string2)
     self.assertTrue(gene2.validate())
     self.assertEqual(gene2.dimension, 'CNY')
     self.assertRaises(IndexError, gene2.eval, self.env, self.date1, self.date2)
     date1 = self.env.shift_date(self.date1, 2)
     df = pd.rolling_median(self.env.get_data_value('open1'), 3).iloc[2:]
     self.assertTrue(
             frame_equal(
                 gene2.eval(self.env, date1, self.date2),
                 df)
             )
Example #54
def step_filt(x, delta=3, window=7):
    """Filter step-changes in a verctor.
    
    Detects level-shifts in a time series and corrects them by levelling both
    sides of the record. 
    
    Discriminates steps from peaks using a moving-median approach.
    """
    assert window % 2 != 0, 'window size must be odd'
    n = window // 2  # integer half-window, needed for np.repeat and slicing
    v = np.r_[np.repeat(x[0], n), x, np.repeat(x[-1], n)] # expanded arr
    m = pd.rolling_median(v, window, center=True)         # filtered arr
    for i in range(len(m)-1):
        diff = m[i+1] - m[i]
        if np.abs(diff) > delta:
            #plt.plot(v)
            v[i+1:] -= diff
            #plt.plot(v)
            #plt.show()
    return v[n:-n], m[n:-n]
Example #55
def rolling_tests(p, d):
    eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3))
    eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3))
    eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    mad = lambda x: np.fabs(x - x.mean()).mean()
    eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    eq(pd.rolling_window(p, 3, 'boxcar'), dd.rolling_window(d, 3, 'boxcar'))
    # Test with edge-case window sizes
    eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3, min_periods=3))
Example #56
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
    eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
    eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
    eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
    eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
    eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
    eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
    eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
    eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3))
    eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3))
    eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
    eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
    with ignoring(ImportError):
        eq(pd.rolling_window(p, 3, "boxcar"), dd.rolling_window(d, 3, "boxcar"))
    # Test with edge-case window sizes
    eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
    eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
    # Test with kwargs
    eq(pd.rolling_sum(p, 3, min_periods=3), dd.rolling_sum(d, 3, min_periods=3))
Example #57
def rolling_fn(x, w, fn):
  #print "Applying rolling fn %s with window size %d" % (fn, w)
  builtin = {
    np.mean: pandas.rolling_mean, 
    np.median: pandas.rolling_median, 
    np.min: pandas.rolling_min, 
    np.max: pandas.rolling_max, 
    np.var: rolling_var, # not sure why I get NaN from pandas functions 
    np.std: rolling_std, 
    crossing_rate: rolling_crossing_rate, 
  }.get(fn, None)
  if builtin:
    aggregated = builtin(x, w)
  elif fn == mad:
    medians = pandas.rolling_median(x, w)
    abs_diffs = np.abs(x - medians)
    aggregated = pandas.rolling_mean(abs_diffs, w)
  else:
    aggregated = pandas.rolling_apply(x, w, fn)
  n_bad = np.sum(~np.isfinite(aggregated[w:]))
  if n_bad > 0:
    print "[rolling_fn] Number bad entries:", n_bad
  return aggregated
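A sketch of the same dispatch on the method-based API; mad, rolling_var, rolling_std, and crossing_rate are assumed to be defined in the surrounding module as in the original, and the _modern name is illustrative:

def rolling_fn_modern(x, w, fn):
  # Map NumPy aggregates onto methods of a single Rolling object.
  s = pandas.Series(x)
  r = s.rolling(w)
  builtin = {np.mean: r.mean, np.median: r.median,
             np.min: r.min, np.max: r.max,
             np.var: r.var, np.std: r.std}.get(fn)
  if builtin is not None:
    return builtin()
  if fn is mad:
    # Rolling mean absolute deviation from the rolling median, as in the original fallback.
    abs_diffs = (s - r.median()).abs()
    return abs_diffs.rolling(w).mean()
  return r.apply(fn)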