Esempio n. 1
0
def check_spikes(data, chunk_size='2min',
                 detrend=True,
                 detrend_kw={'how':'linear'},
                 visualize=False, vis_col=1, max_consec_spikes=3,
                 cut_func = lambda x: (abs(x - x.mean()) > 5.*x.std()),
                 replace_with='interpolation',
                 max_percent=1.):
    """
    Applies spikes-check according to Vickers and Mahrt (1997)

    The data (optionally detrended) is split into chunks. In every chunk
    longer than ``max_consec_spikes`` the points flagged by ``cut_func`` are
    handed to ``algs.limitedSubs`` to be turned into NaNs, and the NaNs are
    then filled according to ``replace_with``. The chunks are finally
    concatenated and, if the data was detrended, the trend is added back.

    Parameters
    ----------
    data: pandas.dataframe
        data to de-spike
    chunk_size: str, int
        size of chunks to consider. If str should be pandas offset string. If int, number of lines.
    detrend: bool
        whether to detrend the data and work with the fluctuations or to work with the absolute series.
    detrend_kw: dict
        dict of keywords to pass to pymicra.trend in order to detrend data (if detrend==True).
        The default dict is only unpacked, never modified.
    visualize: bool
        whether or not to visualize the interpolation occurring
    vis_col: str, int or list
        the column(s) to visualize when seeing the interpolation (only effective if visualize==True)
    max_consec_spikes: int
        maximum number of consecutive spikes to actually be considered spikes and substituted
    cut_func: function
        function used to define spikes
    replace_with: str
        method to use when replacing spikes. Options are 'interpolation' or 'trend'.
    max_percent: float
        maximum percentage of spikes to allow.

    Returns
    -------
    fou: pandas.DataFrame
        de-spiked data (with the trend added back if detrend==True).
    valid: pandas.Series
        True for the columns whose fault count is strictly below
        int(len(data)*max_percent/100), False otherwise.
    fault_count: pandas.Series
        per-column number of points that were NaN after spike marking, i.e.
        len(data) minus the summed non-NaN count of all chunks.

    Raises
    ------
    ValueError
        if replace_with is neither 'trend' nor 'interpolation'.
    """
    # Fail fast: an unknown method used to surface only as a NameError inside
    # the chunk loop, after the detrending and splitting work was already done.
    if replace_with not in ('trend', 'interpolation'):
        raise ValueError("replace_with must be 'trend' or 'interpolation', "
                         "got {!r}".format(replace_with))

    import pandas as pd
    # NOTE(review): `algs` and `signal` are expected to be the project's own
    # modules (`signal` must provide `trend`). Under Python 3 a bare
    # `import signal` resolves to the standard library -- confirm.
    import algs
    import signal as pmdata

    #------------
    # Choose how the NaNs left by the spike marking are filled
    if replace_with == 'trend':
        def replace_nans(dframe):
            return dframe.fillna(pmdata.trend(dframe, how='linear'))
    else:
        def replace_nans(dframe):
            return dframe.interpolate(method='index', limit_direction='both')
    #------------

    original = data.copy()

    #------------
    # If detrend == True we save the trend for later and work with the detrended data
    if detrend:
        origtrend = pmdata.trend(data, **detrend_kw)
        dfs = algs.splitData(original - origtrend, rule=chunk_size)
    else:
        dfs = algs.splitData(original, rule=chunk_size)
    #------------

    max_count = int(len(original)*max_percent/100.)
    # Start from the total number of rows and subtract the valid (non-NaN)
    # points of every chunk; what is left is the number of faulty points.
    fault_count = pd.Series(len(original), index=original.columns)

    despiked_chunks = []
    for chunk in dfs:
        # Work on a copy so the chunk produced by the split is left untouched
        chunk = chunk.copy()

        #-------------------------------
        # This substitutes the spikes with NaNs so they can be replaced later.
        # Chunks too short to hold more than max_consec_spikes points are skipped.
        if len(chunk) > max_consec_spikes:
            chunk = algs.limitedSubs(chunk, max_interp=max_consec_spikes, func=cut_func)
        fault_count = fault_count - chunk.count()
        #-------------------------------

        #-------------------------------
        # Substitution of spikes happens here
        despiked_chunks.append(replace_nans(chunk))
        #-------------------------------

    #---------------------
    # Now we put the chunks back together and maybe restore the trend
    despiked = pd.concat(despiked_chunks)
    if detrend:
        fou = despiked + origtrend
    else:
        fou = despiked
    valid = fault_count < max_count
    #---------------------

    #---------------------
    # Visualize what you're doing to see if it's correct
    if visualize:
        import matplotlib.pyplot as plt
        print('Plotting de-spiking...')
        original[vis_col].plot(style='g-', label='original')
        fou[vis_col].plot(style='b-', label='final')
        plt.title('Column: {}'.format(vis_col))
        plt.legend()
        plt.show()
        plt.close()
    #---------------------

    return fou, valid, fault_count
Esempio n. 2
0
 def replace_nans(dframe):
     """Fill the NaNs of ``dframe`` with the result of ``pmdata.trend(..., how='linear')``."""
     return dframe.fillna(pmdata.trend(dframe, how='linear'))
Esempio n. 3
0
def check_limits(data, tables, max_percent=1., replace_with='interpolation'):
    """
    Checks dataframe for lower and upper limits. Points outside the limits are
    set to NaN and then replaced according to replace_with. The number of faulty
    points is also checked for each column against the maximum percentage of
    accepted faults max_percent

    Parameters
    ----------
    data: pandas dataframe
        dataframe to be checked (it is copied, not modified)
    tables: pandas.dataframe
        dataframe with the lower and upper limits for variables. The rows
        labelled 'lower_limits' and 'upper_limits' are used when present.
    max_percent: float
        number from 0 to 100 that represents the maximum percentage of faulty
        points accepted by this test.
    replace_with: str
        how to fill the faulty points: 'interpolation' (index-based
        interpolation) or 'trend' (linear trend of the run). Any other value
        leaves the faulty points as NaN.

    Returns
    -------
    df: pandas.DataFrame
        input data but with the faulty points substituted.
    valid: pandas.Series
        True for the columns that passed this test, False for the columns that didn't.
        A column passes when its fault count is strictly below
        int(len(data)*max_percent/100).
    fault_count: pandas.Series
        number of points found outside the limits, for each column.
    """
    import numpy as np
    import pandas as pd

    df = data.copy()
    max_count = int(len(df)*max_percent/100.)
    # Zero counts are used for whichever limit row is absent from `tables`
    low_count = pd.Series(0, index=tables.columns)
    upp_count = pd.Series(0, index=tables.columns)

    #-----------
    # First we check the lower values
    if 'lower_limits' in tables.index.values:
        faulty = df < tables.loc['lower_limits']
        low_count = df[faulty].count()
        df[faulty] = np.nan
    #-------------------------------

    #-------------------------------
    # Now we check the upper values. Points already set to NaN above compare
    # as False here, so no point is counted twice.
    if 'upper_limits' in tables.index.values:
        faulty = df > tables.loc['upper_limits']
        upp_count = df[faulty].count()
        df[faulty] = np.nan
    #-------------------------------

    fault_count = low_count + upp_count
    valid = fault_count < max_count

    #------------
    # Replace data with either its trend or by interpolating
    if replace_with == 'trend':
        # Imported lazily: the package is only needed for trend replacement.
        # NOTE(review): assumes the package-level `trend` is the detrending
        # function (the previous code called an undefined `pmdata.trend`
        # here, which raised NameError) -- confirm.
        from . import trend as pmtrend
        df = df.fillna(pmtrend(df, how='linear'))
    elif replace_with == 'interpolation':
        df = df.interpolate(method='index', limit_direction='both')
    #------------

    return df, valid, fault_count