Example #1
def hours_on(series, on_power_threshold=DEFAULT_ON_POWER_THRESHOLD):
    """Returns a float representing the number of hours this channel
    has been above threshold.

    If input data has gaps then pre-process data with `insert_zeros`
    before sending it to this function.

    Parameters
    ----------
    series : pandas.Series

    on_power_threshold : float or int, optional, default = 5
        Threshold which defines the distinction between "on" and "off".  Watts.

    Returns
    -------
    hours_above_threshold : float

    See Also
    --------
    kwh
    joules
    """

    i_above_threshold = np.where(series[:-1] >= on_power_threshold)[0]
    # now calculate timedelta ('td') above threshold...
    td_above_thresh = (series.index[i_above_threshold + 1].values -
                       series.index[i_above_threshold].values)
    secs_on = timedelta64_to_secs(td_above_thresh.sum())
    return secs_on / SEC_PER_HOUR
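
For context, these examples all rely on a `timedelta64_to_secs` helper and a couple of constants that are never shown. Below is a minimal sketch of what those assumed pieces could look like, plus a toy call to `hours_on`; the stand-ins would need to be defined before `hours_on` itself, and the real nilmtk implementations may differ.

import numpy as np
import pandas as pd

# Assumed constants and helper (their real definitions live elsewhere in
# nilmtk and are not shown in these examples).
SEC_PER_HOUR = 3600
DEFAULT_ON_POWER_THRESHOLD = 5  # watts, matching the docstring default

def timedelta64_to_secs(timedelta):
    # Assumed implementation: numpy timedelta64 (scalar or array) -> seconds.
    return timedelta / np.timedelta64(1, 's')

# Toy usage: one reading every 6 seconds, above the 5 W threshold for the
# first five intervals only.
index = pd.date_range('2014-01-01', periods=10, freq='6s')
watts = pd.Series([100, 100, 100, 100, 100, 0, 0, 0, 0, 0], index=index)
print(hours_on(watts))  # 5 intervals * 6 s = 30 s -> 30 / 3600 ~= 0.0083 hours
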
Example #2
def contiguous_blocks(datetimeindex):
    """Split `datetimeindex` into contiguous blocks.

    Returns a list of (start, end) timestamp tuples, one tuple per run of
    samples separated by no more than the index's sample period.
    """
    sample_period = get_sample_period(datetimeindex)
    time_delta = timedelta64_to_secs(np.diff(datetimeindex.values))
    breaks = time_delta > sample_period
    if np.sum(breaks) == 0:
        # All contiguous data
        contiguous_time_tuples = [(datetimeindex[0], datetimeindex[-1])]
    else:
        # Data has breaks
        break_indices_int = np.where(breaks)[0]
        contiguous_time_tuples = []
        start = 0
        for end in break_indices_int:
            contiguous_time_tuples.append((datetimeindex[start], datetimeindex[end]))
            start = end + 1
        # Appending last block
        contiguous_time_tuples.append((datetimeindex[start], datetimeindex[-1]))
    return contiguous_time_tuples
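
A quick sanity check for `contiguous_blocks` is a DatetimeIndex with one deliberate gap. `get_sample_period` is not shown in these examples, so the version below is only a guess (median inter-sample gap); the real nilmtk helper may differ.

import numpy as np
import pandas as pd

def timedelta64_to_secs(timedelta):
    # Assumed helper: numpy timedelta64 -> seconds (float).
    return timedelta / np.timedelta64(1, 's')

def get_sample_period(index):
    # Guessed helper: estimate the sample period (seconds) as the median gap.
    return float(np.median(timedelta64_to_secs(np.diff(index.values))))

# Two 10-second-sampled runs separated by a ten-minute gap.
a = pd.date_range('2014-01-01 00:00:00', periods=5, freq='10s')
b = pd.date_range('2014-01-01 00:10:00', periods=5, freq='10s')

print(contiguous_blocks(a.append(b)))
# Expected: two (start, end) tuples, one per run.
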
Example #3
def energy(series, unit='kwh'):
    """Returns a float representing the quantity of energy this 
    channel consumed.

    If input data has gaps then pre-process data with `insert_zeros`
    before sending it to this function.

    Parameters
    ----------
    series : pd.Series or pd.DataFrame

    unit : {'kwh', 'joules'}

    Returns
    -------
    _energy : float

    See Also
    --------
    hours_on
    """

    # TODO: replace this evil hack to handle dataframes(!)
    if isinstance(series, pd.DataFrame):
        series = series.iloc[:, 0]

    timedelta = np.diff(series.index.values)
    timedelta_secs = timedelta64_to_secs(timedelta)
    joules = (timedelta_secs * series.values[:-1]).sum()

    if unit == 'kwh':
        JOULES_PER_KWH = 3600000
        _energy = joules / JOULES_PER_KWH
    elif unit == 'joules':
        _energy = joules
    else:
        raise ValueError('unrecognised value for `unit`.')

    return _energy
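
The unit conversion is easy to verify by hand: 1000 W held for one hour is 3,600,000 J, which is exactly 1 kWh. A small worked example, reusing the assumed `timedelta64_to_secs` stand-in from above:

import numpy as np
import pandas as pd

def timedelta64_to_secs(timedelta):
    # Assumed helper: numpy timedelta64 -> seconds (float).
    return timedelta / np.timedelta64(1, 's')

# A constant 1000 W sampled once a minute for one hour.
index = pd.date_range('2014-01-01', periods=61, freq='1min')
watts = pd.Series(1000.0, index=index)

print(energy(watts, unit='joules'))  # 3600000.0
print(energy(watts, unit='kwh'))     # 1.0
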
Example #4
def get_gap_starts_and_gap_ends(data, max_sample_period, 
                                window_start=None, window_end=None):
    """
    Parameters
    ----------
    data : pandas.DataFrame or Series or DatetimeIndex

    max_sample_period : int or float
        Maximum allowed sample period in seconds.  This defines what
        counts as a 'gap'.

    window_start, window_end : pd.Timestamp
        The start and end of the window of interest.  If this window
        is larger than the duration of `data` then gaps will be
        appended to the front / back as necessary.  If this window
        is shorter than the duration of `data` then `data` will be cropped.

    Returns
    -------
    gap_starts, gap_ends: DatetimeIndex
    """
    # TODO: this might be a rather nasty hack to fix the circular dependency
    from nilmtk.preprocessing.electricity.single import reframe_index

    try:
        data = data.dropna()
    except AttributeError:
        # if data is DatetimeIndex then it has no `dropna()` method
        pass
    
    index = _get_index(data)
    index = reframe_index(index, window_start, window_end)
    timedeltas_sec = timedelta64_to_secs(np.diff(index.values))
    overlong_timedeltas = timedeltas_sec > max_sample_period
    gap_starts = index[:-1][overlong_timedeltas]
    gap_ends = index[1:][overlong_timedeltas]        

    return gap_starts, gap_ends
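
The core of this function is indexing `index[:-1]` and `index[1:]` with the same boolean mask: for each overlong timedelta, the reading before it marks a gap start and the reading after it marks a gap end. A standalone sketch of just that indexing, omitting nilmtk's `reframe_index` windowing:

import numpy as np
import pandas as pd

# One reading every 10 seconds with a hole of just over two minutes.
a = pd.date_range('2014-01-01 00:00:00', periods=6, freq='10s')
b = pd.date_range('2014-01-01 00:03:00', periods=6, freq='10s')
index = a.append(b)

max_sample_period = 20  # seconds
timedeltas_sec = np.diff(index.values) / np.timedelta64(1, 's')
overlong_timedeltas = timedeltas_sec > max_sample_period

gap_starts = index[:-1][overlong_timedeltas]  # last reading before each gap
gap_ends = index[1:][overlong_timedeltas]     # first reading after each gap
print(gap_starts[0], gap_ends[0])  # 00:00:50 and 00:03:00
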
Example #5
def insert_zeros(single_appliance_dataframe, sample_period_multiplier=4, round_sample_period=True):
    """Find all gaps in `single_appliance_dataframe` longer than
    `max_sample_period` and insert a zero 1 sample period after
    the start of the gap and insert a second zero 1 sample period
    before the end of the gap.

    In other words: "book-end" the gap with a zero at each end.

    Zeros are only inserted at the start of the gap if the gap
    starts with a reading above zero; and likewise for insertion
    of zeros at the end of the gap.

    Note that this function does not fill the entire gap with zeros;
    if you want that then try pandas.DataFrame.fillna.

    What is `insert_zeros` useful for?

    There are two possible reasons for lost samples in individual
    appliance data: 

    1) a broken IAM (hence we do not have any information about the appliance)
    2) the IAM and appliance have been unplugged (hence we can infer that the
       appliance is off)

    Only the user can decide which of these two assumptions best
    fits their data.  `insert_zeros` is applicable only in case 2.

    For example, say a hoover's IAM is permanently attached to the
    hoover's power cord, even when the hoover is unplugged and put
    away in the cupboard.

    Say the hoover was switched on when both the hoover and the
    hoover's IAM were unplugged.  This would result in the dataset
    having a gap immediately after an on-segment.  This combination of
    an on-segment followed (without any zeros) by a gap might confuse
    downstream statistics and disaggregation functions which assume
    that the power drawn by an appliance between reading[i] and
    reading[i+1] is held constant at reading[i] watts.

    TODO: a smarter version of this function might use information from
    the aggregate data to do a better job of estimating exactly when
    the appliance was turned off.

    Parameters
    ----------
    single_appliance_dataframe : pandas.DataFrame
        Data from a single appliance.

    sample_period_multiplier : float or int, optional 
        default = 4.  Must be 4 or larger (to ensure we do not add zeros
        less than sample_period seconds apart).
        max_sample_period = sample_period x sample_period_multiplier.
        max_sample_period is the maximum permissible sample period (in
        seconds). Any gap longer than `max_sample_period` is assumed
        to imply that the IAM and appliance are off.

    round_sample_period : bool, optional
        default = True. Whether or not to round sample_period to the 
        nearest int.

    Returns
    -------
    df_with_zeros : pandas.DataFrame
        A copy of `single_appliance_dataframe` with a zero inserted
        `sample_period` seconds after the last sample before each gap
        and `sample_period` seconds before the first sample after each gap.

    """
    sample_period = get_sample_period(single_appliance_dataframe)
    if round_sample_period:
        sample_period = int(round(sample_period))

    max_sample_period = sample_period * sample_period_multiplier

    # Drop NaNs (because we want those to be gaps in the index)
    df = single_appliance_dataframe.dropna()

    # Get the length of time between each pair of consecutive samples. Seconds.
    timedeltas = np.diff(df.index.values) / np.timedelta64(1, "s")
    gaps_mask = timedeltas > max_sample_period
    readings_before_gaps = df[:-1][gaps_mask]
    readings_after_gaps = df[1:][gaps_mask]

    # we only add a 0 if the recorded value just before the gap is > 0
    readings_before_gaps = readings_before_gaps[readings_before_gaps.sum(axis=1) > 0]

    readings_after_gaps = readings_after_gaps[readings_after_gaps.sum(axis=1) > 0]

    # Find dates to insert zeros
    dates_to_insert_zeros_before_gaps = readings_before_gaps.index + pd.DateOffset(seconds=sample_period)

    dates_to_insert_zeros_after_gaps = readings_after_gaps.index - pd.DateOffset(seconds=sample_period)

    dates_to_insert_zeros = dates_to_insert_zeros_before_gaps.append(dates_to_insert_zeros_after_gaps)

    # Columns containing power
    power_columns = []
    non_power_columns = []
    for col in df.columns:
        try:
            physical_quantity = col.physical_quantity
        except AttributeError:  # DualSupply
            physical_quantity = col.measurement.physical_quantity
        if physical_quantity == "power":
            power_columns.append(col)
        else:
            non_power_columns.append(col)

    # Don't insert duplicate indices
    # TODO: remove this assert when we're confident the code is correct
    assert (dates_to_insert_zeros & df.index).size == 0

    # Create new dataframe of zeros at new indices ready for insertion
    zeros = pd.DataFrame(data=0, index=dates_to_insert_zeros, columns=power_columns, dtype=np.float32)

    # Check no zeros are closer than sample_period
    # TODO: remove this assert when we're confident the code is correct
    # also remove the sort_index().
    if len(zeros) > 1:
        zeros = zeros.sort_index()
        assert timedelta64_to_secs(np.diff(zeros.index.values).min()) > sample_period

    # Now, take median of non-power columns (like voltage)
    for measurement in non_power_columns:
        zeros[measurement] = single_appliance_dataframe[measurement].median()

    # Insert the dataframe of zeros into the data.
    df_with_zeros = deepcopy(single_appliance_dataframe)
    df_with_zeros = df_with_zeros.append(zeros)
    df_with_zeros = df_with_zeros.sort_index()

    # If input data had a regular frequency then resample
    # because appending turns off the regular frequency.
    original_freq = single_appliance_dataframe.index.freq
    if original_freq is not None:
        df_with_zeros = df_with_zeros.resample(rule=original_freq)

    return df_with_zeros
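
The "book-end the gap" idea is easiest to see on a toy series. The sketch below is not `insert_zeros` itself (the real function also handles nilmtk measurement columns and resampling); it only illustrates where the two zeros land, one `sample_period` seconds after the last reading before the gap and one `sample_period` seconds before the first reading after it.

import numpy as np
import pandas as pd

sample_period = 6  # seconds (assumed known here; nilmtk estimates it)
max_sample_period = sample_period * 4

# An on-segment, a long gap, then another on-segment.
a = pd.date_range('2014-01-01 00:00:00', periods=5, freq='6s')
b = pd.date_range('2014-01-01 00:05:00', periods=5, freq='6s')
watts = pd.Series(100.0, index=a.append(b))

timedeltas = np.diff(watts.index.values) / np.timedelta64(1, 's')
gaps_mask = timedeltas > max_sample_period
readings_before_gaps = watts[:-1][gaps_mask]
readings_after_gaps = watts[1:][gaps_mask]

zeros_after_gap_starts = readings_before_gaps.index + pd.DateOffset(seconds=sample_period)
zeros_before_gap_ends = readings_after_gaps.index - pd.DateOffset(seconds=sample_period)
dates_to_insert_zeros = zeros_after_gap_starts.append(zeros_before_gap_ends)

zeros = pd.Series(0.0, index=dates_to_insert_zeros)
print(pd.concat([watts, zeros]).sort_index())
# The gap is now book-ended: a zero at 00:00:30 and a zero at 00:04:54.
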
Example #6
def _get_good_sections(df, sample_period):
    """
    Code copied from nilmtk[1]/nilmtk/stats/goodsections.py
    
    [1] https://github.com/nilmtk/nilmtk/
    """
    index = df.dropna().sort_index().index
    df_time_end = df.index[-1] + pd.Timedelta(seconds=sample_period)
    del df

    if len(index) < 2:
        return []

    timedeltas_sec = timedelta64_to_secs(np.diff(index.values))
    timedeltas_check = timedeltas_sec <= sample_period

    # Memory management
    del timedeltas_sec
    gc.collect()

    timedeltas_check = np.concatenate(
        [[False],
         timedeltas_check])
    transitions = np.diff(timedeltas_check.astype(int))

    # Memory management
    last_timedeltas_check = timedeltas_check[-1]
    del timedeltas_check
    gc.collect()

    good_sect_starts = list(index[:-1][transitions ==  1])
    good_sect_ends   = list(index[:-1][transitions == -1])

    # Memory management
    last_index = index[-1]
    del index
    gc.collect()

    # Work out if this chunk ends with an open ended good section
    if len(good_sect_ends) == 0:
        ends_with_open_ended_good_section = (
            len(good_sect_starts) > 0)
    elif len(good_sect_starts) > 0:
        # We have good_sect_ends and good_sect_starts
        ends_with_open_ended_good_section = (
            good_sect_ends[-1] < good_sect_starts[-1])
    else:
        # We have good_sect_ends but no good_sect_starts
        ends_with_open_ended_good_section = False

    if ends_with_open_ended_good_section:
        good_sect_ends += [df_time_end]

    assert len(good_sect_starts) == len(good_sect_ends)

    sections = [TimeFrame(start, end)
                for start, end in zip(good_sect_starts, good_sect_ends)
                if not (start == end and start is not None)]

    # Memory management
    del good_sect_starts
    del good_sect_ends
    gc.collect()

    return sections
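
A rough way to exercise `_get_good_sections` is a DataFrame whose index has a single hole. `TimeFrame` below is a minimal stand-in for nilmtk.timeframe.TimeFrame, and `timedelta64_to_secs` is the same assumed helper as in the earlier sketches.

import gc
from collections import namedtuple

import numpy as np
import pandas as pd

# Minimal stand-in for nilmtk.timeframe.TimeFrame, just for this sketch.
TimeFrame = namedtuple('TimeFrame', ['start', 'end'])

def timedelta64_to_secs(timedelta):
    # Assumed helper: numpy timedelta64 -> seconds (float).
    return timedelta / np.timedelta64(1, 's')

# 10-second sampling with a one-minute hole between two runs.
a = pd.date_range('2014-01-01 00:00:00', periods=6, freq='10s')
b = pd.date_range('2014-01-01 00:02:00', periods=6, freq='10s')
df = pd.DataFrame({'power': 100.0}, index=a.append(b))

for section in _get_good_sections(df, sample_period=10):
    print(section.start, section.end)
# Expected: one section per run; the second is open-ended, so it ends
# sample_period seconds after the final reading.
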