Example #1
0
def dropout_rate_per_period(data, rule, window_start=None, window_end=None):
    """Compute, for each period, the fraction of expected samples missing.

    Parameters
    ----------
    data : pandas.DataFrame or Series or DatetimeIndex
        NaN rows are discarded first when `data` offers `dropna()`.

    rule : pandas Offset string (or whatever the `rule` parameter in
        pd.Series.resample accepts)
        Length of each period.

    window_start, window_end : pd.Timestamp
        The start and end of the window of interest.  Both are handed
        to `reframe_index` together with the index of `data`.  If this
        window is larger than the duration of `data` then gaps will be
        appended to the front / back as necessary.  If this window
        is shorter than the duration of `data` then data will be cropped.

    Returns
    -------
    pd.Series
        Index is a regular DatetimeIndex with freq=rule.
        Each value is `1 - (samples observed / samples expected)` for
        that period, so 0 means no dropout and 1 means no samples at all.

    Raises
    ------
    ValueError
        If fewer than one sample is expected per period, i.e. `rule`
        describes a period shorter than the sample period.
    """
    # TODO: this might be a rather nasty hack to fix the circular dependency
    from nilmtk.preprocessing.electricity.single import reframe_index

    # A DatetimeIndex has no `dropna()` method, so such input is used as-is.
    try:
        data = data.dropna()
    except AttributeError:
        pass

    sample_period = get_sample_period(data)
    expected_per_period = secs_per_period_alias(rule) / sample_period
    if expected_per_period < 1.0:
        raise ValueError('Date period specified by rule is shorter than'
                         ' sample period!')

    # Count samples per period by summing a column of ones; periods that
    # end up without a count are treated as containing zero samples.
    timestamps = reframe_index(_get_index(data), window_start, window_end)
    ones = pd.Series(1, index=timestamps)
    observed_per_period = ones.resample(rule=rule, how='sum').fillna(0)

    return 1 - (observed_per_period / expected_per_period)
Example #2
0
def _indicies_of_periods(datetime_index, freq, use_local_time=True):
    """Find which elements of `datetime_index` fall into each period
    of a regular sequence of periods with frequency `freq`.

    `datetime_index` must be sorted in ascending order: the boundaries
    are located with a binary search per period, which is much cheaper
    than selecting each period's data by label.

    Parameters
    ----------
    datetime_index : pd.DatetimeIndex
        Must be sorted and non-empty (the first and last elements define
        the range of periods).

    freq : str
        one of the following:
        'A' for yearly
        'M' for monthly
        'D' for daily
        'H' for hourly
        'T' for minutely

    use_local_time : boolean, optional, default=True
        If True then start and end each time period at appropriate local times.
        e.g. if `freq='D'` and:
            `use_local_time=True` then divide at midnight *local time* or if
            `use_local_time=False` then divide at midnight UTC
        When True, `datetime_index` is first passed through `_tz_to_naive`.

    Returns
    -------
    periods : pd.PeriodIndex
        Every period from the one containing the first element to the
        one containing the last element.

    boundaries : dict
        Each key is a pd.Period
        Each value is a tuple of ints:
        (<start index into `datetime_index` for period>, <end index>)
        The end index is exclusive, so the pair can be used as a slice.
        Periods for which no data exists will not have a key.

    Examples
    --------
    Say you have a pd.Series with data covering a month:

    >>> series.index
    <class 'pandas.tseries.index.DatetimeIndex'>
    [2011-04-18 09:22:13, ..., 2011-05-24 15:56:34]
    Length: 745878, Freq: None, Timezone: US/Eastern

    You want to divide it up into day-sized chunks, starting and ending each
    chunk at midnight local time:

    >>> periods, boundaries = _indicies_of_periods(series.index, freq='D')

    >>> periods
    <class 'pandas.tseries.period.PeriodIndex'>
    freq: D
    [2011-04-18, ..., 2011-05-24]
    length: 37

    >>> boundaries
    {Period('2011-04-18', 'D'): (0, 13652),
     Period('2011-04-19', 'D'): (13652, 34926),
     Period('2011-04-20', 'D'): (34926, 57310),
     ...
     Period('2011-05-23', 'D'): (710750, 732360),
     Period('2011-05-24', 'D'): (732360, 745878)}

    Now, say that we want to chomp through our data a day at a time:

    >>> for period in periods:
    >>>     start_i, end_i = boundaries[period]
    >>>     data_for_day = series.iloc[start_i:end_i]
    >>>     # do something with data_for_day

    """

    if use_local_time:
        datetime_index = _tz_to_naive(datetime_index)

    periods = pd.period_range(datetime_index[0], datetime_index[-1], freq=freq)

    # Because the index is sorted, the rows belonging to a period are exactly
    # those between the rows already consumed by earlier periods and the
    # first row at or after the period's end.  A binary search finds that
    # position exactly, without assuming any bound on the number of samples
    # per period (so dense or sub-second data is handled correctly).
    n_rows_processed = 0
    boundaries = {}
    for period in periods:
        # side='left' counts the rows strictly earlier than period.end_time.
        end_index = int(datetime_index.searchsorted(period.end_time,
                                                    side='left'))
        if end_index > n_rows_processed:
            boundaries[period] = (n_rows_processed, end_index)
            n_rows_processed = end_index

    return periods, boundaries
Example #3
0
def activity_distribution(series, on_power_threshold=DEFAULT_ON_POWER_THRESHOLD,
                          bin_size='T', timespan='D'):
    """Returns a distribution describing when this appliance was turned
    on over repeating timespans.  For example, if you want to see
    which times of day this appliance was used, on average, then use
    bin_size='T' (minutely) or bin_size='H' (hourly) and
    timespan='D' (daily).

    Parameters
    ----------
    series : pandas.Series
        If a DataFrame is passed then only its first column is used.

    on_power_threshold : float, optional,
        default=DEFAULT_ON_POWER_THRESHOLD
        Threshold which defines the difference between 'on' and 'off'. Watts.
        A bin counts as 'on' when its maximum value exceeds the threshold.

    bin_size, timespan : str
        offset alias (e.g. 'T' or 'D')
        For valid offset aliases, see:
        http://pandas.pydata.org/pandas-docs/dev/timeseries.html#offset-aliases

    Returns
    -------
    pandas.Series
        One row for each bin in a timespan.
        The values count the number of times this appliance has been on at
        that particular time of the timespan.
        Times are handled in local time.
        The index uses specific dates. For example, if `timespan='D'` then
        the index might be from '2012/1/1 00:00' to '2012/1/1 23:59'. In this
        example, ignore the '2012/1/1'.
    """

    # TODO: replace this evil hack to handle dataframes(!)
    if isinstance(series, pd.DataFrame):
        # `.iloc` replaces the long-deprecated `DataFrame.icol`.
        series = series.iloc[:, 0]

    # Create a boolean pd.Series with PeriodIndex: True where the appliance
    # was on at some point during the bin.
    # NOTE(review): `how=` is the legacy resample API that the rest of this
    # file relies on; newer pandas spells this `.resample(bin_size).max()`.
    binned_data = series.resample(bin_size, how='max').to_period()
    binned_data = binned_data > on_power_threshold

    timespans, boundaries = _indicies_of_periods(
        binned_data.index.to_timestamp(),
        freq=timespan)

    # The result is indexed by the bins of the first timespan; every later
    # timespan is shifted back onto those bins before being added.
    first_timespan = timespans[0]
    bins = pd.period_range(first_timespan.start_time,
                           first_timespan.end_time,
                           freq=bin_size)
    distribution = pd.Series(0, index=bins)

    bins_per_timespan = int(round(secs_per_period_alias(timespan) /
                                  secs_per_period_alias(bin_size)))

    for span in timespans:
        try:
            start_index, end_index = boundaries[span]
        except KeyError:
            print("No data for", span)
            continue

        # Boundaries are positions, so slice positionally.
        data_for_timespan = binned_data.iloc[start_index:end_index]

        # Use ordinals rather than `first_timespan - span`: subtracting two
        # Periods yields an int on old pandas but an offset object on newer
        # pandas, whereas the ordinal difference is always a plain int.
        timespans_since_first = first_timespan.ordinal - span.ordinal
        bins_since_first_timespan = timespans_since_first * bins_per_timespan
        data_shifted = data_for_timespan.shift(bins_since_first_timespan,
                                               bin_size)
        distribution = distribution.add(data_shifted, fill_value=0)

    return distribution
Example #4
0
def usage_per_period(series, freq,
                     on_power_threshold=DEFAULT_ON_POWER_THRESHOLD,
                     max_dropout_rate=DEFAULT_MAX_DROPOUT_RATE,
                     verbose=False,
                     energy_unit='kwh'):
    """Calculate the usage (hours on and kwh) per time period.

    If input data has gaps then pre-process data with `insert_zeros`
    before sending it to this function.

    Parameters
    ----------
    series : pd.Series
        If a DataFrame is passed then only its first column is used.

    freq : str
        see _indicies_of_periods() for acceptable values.

    on_power_threshold : float or int, optional,
        default = DEFAULT_ON_POWER_THRESHOLD
        Threshold which defines the distinction between "on" and "off".  Watts.

    max_dropout_rate : float in [0, 1], optional,
        default = DEFAULT_MAX_DROPOUT_RATE
        Periods with a worse (larger) dropout rate are left as NaN.

    verbose : boolean, optional, default = False
        if True then print why a period was skipped.

    energy_unit : {'kwh', 'joules'}, optional
        Passed to `energy()` and used as the name of the energy column.

    Returns
    -------
    usage : pd.DataFrame
        One row per period (as defined by `freq`).
        Index is the PeriodIndex returned by `_indicies_of_periods`, which
        is called here with its default `use_local_time=True`.
        Columns:
            hours_on
            <`energy_unit`>
        Periods with no data, or with too few samples, hold NaN.

    Examples
    --------
    Say we have loaded fridge data from house_1 in REDD into `fridge` and we
    want to see how it was used each day:

    >>> usage_per_period(fridge, 'D')

                 hours_on       kwh
    2011-04-18        NaN       NaN
    2011-04-19  23.999444  1.104083
    2011-04-20  23.998889  1.293223
    2011-04-21  23.998889  1.138540
    ...
    2011-05-22  23.832500  2.042271
    2011-05-23  23.931111  1.394619
    2011-05-24        NaN       NaN

    Hmmm... why does the fridge appear to be on for 24 hours per day?
    Inspecting the fridge.plot(), we find that the fridge rarely ever
    gets below this function's default on_power_threshold of 5 Watts,
    so let's specify a larger threshold:

    >>> usage_per_period(fridge, 'D', on_power_threshold=100)

                hours_on       kwh
    2011-04-18       NaN       NaN
    2011-04-19  5.036111  1.104083
    2011-04-20  5.756667  1.293223
    2011-04-21  4.931667  1.138540
    2011-04-22  4.926111  1.076958
    2011-04-23  6.099167  1.357812
    2011-04-24  6.373056  1.361579
    2011-04-25  6.496667  1.441966
    2011-04-26  6.381389  1.404637
    2011-04-27  5.558611  1.196464
    2011-04-28  6.668611  1.478141
    2011-04-29  6.493056  1.446713
    2011-04-30  5.885278  1.263918
    2011-05-01  5.983611  1.351419
    2011-05-02  5.398333  1.167111
    2011-05-03       NaN       NaN
    2011-05-04       NaN       NaN
    2011-05-05       NaN       NaN
    2011-05-06       NaN       NaN
    2011-05-07  5.112222  1.120848
    2011-05-08  6.349722  1.413897
    2011-05-09  7.270833  1.573199
    2011-05-10  5.997778  1.249120
    2011-05-11  5.685556  1.264841
    2011-05-12  7.153333  1.478244
    2011-05-13  5.949444  1.306350
    2011-05-14  6.446944  1.415302
    2011-05-15  5.958333  1.275853
    2011-05-16  6.801944  1.501816
    2011-05-17  5.836389  1.342787
    2011-05-18  5.254444  1.164683
    2011-05-19  6.234444  1.397851
    2011-05-20  5.814444  1.265143
    2011-05-21  6.738333  1.498687
    2011-05-22  9.308056  2.042271
    2011-05-23  6.127778  1.394619
    2011-05-24       NaN       NaN

    That looks sensible!  Now, let's find out the cause of the NaNs by
    setting verbose=True:

    >>> usage_per_period(fridge, 'D', on_power_threshold=100, verbose=True)

    Insufficient samples for 2011-04-18; n samples = 13652; dropout_rate = 52.60%
                     start = 2011-04-18 09:22:13-04:00
                       end = 2011-04-18 23:59:57-04:00
    Insufficient samples for 2011-05-03; n samples = 16502; dropout_rate = 42.70%
                     start = 2011-05-03 00:00:03-04:00
                       end = 2011-05-03 17:33:17-04:00
    No data available for    2011-05-04
    No data available for    2011-05-05
    Insufficient samples for 2011-05-06; n samples = 12465; dropout_rate = 56.72%
                     start = 2011-05-06 10:51:50-04:00
                       end = 2011-05-06 23:59:58-04:00
    Insufficient samples for 2011-05-24; n samples = 13518; dropout_rate = 53.06%
                     start = 2011-05-24 00:00:02-04:00
                       end = 2011-05-24 15:56:34-04:00
    Out[209]:
                hours_on       kwh
    2011-04-18       NaN       NaN
    2011-04-19  5.036111  1.104083
    2011-04-20  5.756667  1.293223
    ...

    Ah, OK, there are insufficient samples for the periods with NaNs.  We could
    set max_dropout_rate to a number closer to 1, but that would give us data
    for days where there isn't much data for that day.

    """

    # TODO: replace this evil hack to handle dataframes(!)
    if isinstance(series, pd.DataFrame):
        # `.iloc` replaces the long-deprecated `DataFrame.icol`.
        series = series.iloc[:, 0]

    assert 0 <= max_dropout_rate <= 1, "max_dropout_rate must be in [0, 1]"

    period_range, boundaries = _indicies_of_periods(series.index, freq)
    name = str(series.name)
    # Both result columns start out all-NaN; only periods with enough
    # samples are filled in below.  The builtin `float` is used because the
    # `np.float` alias for it has been removed from NumPy.
    hours_on_series = pd.Series(index=period_range, dtype=float,
                                name=name + ' hours on')
    energy_series = pd.Series(index=period_range, dtype=float,
                              name=name + ' ' + energy_unit)

    MAX_SAMPLES_PER_PERIOD = (secs_per_period_alias(freq) /
                              get_sample_period(series))
    MIN_SAMPLES_PER_PERIOD = (MAX_SAMPLES_PER_PERIOD *
                              (1 - max_dropout_rate))

    for period in period_range:
        try:
            period_start_i, period_end_i = boundaries[period]
        except KeyError:
            if verbose:
                print("No data available for   ",
                      period.strftime('%Y-%m-%d'))
            continue

        # Boundaries are positions, so slice positionally.
        data_for_period = series.iloc[period_start_i:period_end_i]
        if data_for_period.size < MIN_SAMPLES_PER_PERIOD:
            if verbose:
                dropout_rate = (1 - (data_for_period.size /
                                     MAX_SAMPLES_PER_PERIOD))
                print("Insufficient samples for ",
                      period.strftime('%Y-%m-%d'),
                      "; n samples = ", data_for_period.size,
                      "; dropout_rate = {:.2%}".format(dropout_rate), sep='')
                print("                 start =", data_for_period.index[0])
                print("                   end =", data_for_period.index[-1])
            continue

        hours_on_series[period] = hours_on(data_for_period,
                                           on_power_threshold=on_power_threshold)
        energy_series[period] = energy(data_for_period, unit=energy_unit)

    return pd.DataFrame({'hours_on': hours_on_series,
                         energy_unit: energy_series})