def hours_on(series, on_power_threshold=DEFAULT_ON_POWER_THRESHOLD):
    """Returns a float representing the number of hours this channel
    has been above threshold.

    If the input data has gaps then pre-process the data with
    `insert_zeros` before sending it to this function.

    Parameters
    ----------
    series : pandas.Series

    on_power_threshold : float or int, optional, default = 5
        Threshold which defines the distinction between "on" and "off".
        Watts.

    Returns
    -------
    hours_above_threshold : float

    See Also
    --------
    kwh
    joules
    """
    i_above_threshold = np.where(series[:-1] >= on_power_threshold)[0]
    # now calculate timedelta ('td') above threshold...
    td_above_thresh = (series.index[i_above_threshold + 1].values -
                       series.index[i_above_threshold].values)
    secs_on = timedelta64_to_secs(td_above_thresh.sum())
    return secs_on / SEC_PER_HOUR
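# Illustrative usage sketch for `hours_on` (not part of the original module).
# `fridge_power` is a hypothetical pandas.Series of watts on a regular
# 6-second DatetimeIndex:
#
#     >>> import pandas as pd
#     >>> idx = pd.date_range('2014-01-01', periods=4, freq='6S')
#     >>> fridge_power = pd.Series([0.0, 90.0, 85.0, 0.0], index=idx)
#     >>> hours_on(fridge_power, on_power_threshold=5)  # 12 s above threshold
#     0.00333...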
def contiguous_blocks(datetimeindex):
    """Returns a list of (start, end) tuples, one per contiguous block of
    `datetimeindex`.  A block ends wherever the gap between consecutive
    timestamps exceeds the sample period."""
    sample_period = get_sample_period(datetimeindex)
    time_delta = timedelta64_to_secs(np.diff(datetimeindex.values))
    breaks = time_delta > sample_period
    if np.sum(breaks) == 0:
        # All contiguous data
        contiguous_time_tuples = [(datetimeindex[0], datetimeindex[-1])]
    else:
        # Data has breaks
        break_indices_int = np.where(breaks)[0]
        contiguous_time_tuples = []
        start = 0
        for end in break_indices_int:
            contiguous_time_tuples.append(
                (datetimeindex[start], datetimeindex[end]))
            start = end + 1
        # Append the last block
        contiguous_time_tuples.append(
            (datetimeindex[start], datetimeindex[-1]))
    return contiguous_time_tuples
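# Illustrative usage sketch for `contiguous_blocks` (not part of the original
# module).  Assuming `get_sample_period` returns the typical sample period of
# the index (6 seconds for the hypothetical index below), the ~5-minute jump
# is treated as a break and the index is split into two blocks:
#
#     >>> import pandas as pd
#     >>> idx = pd.DatetimeIndex(['2014-01-01 10:00:00', '2014-01-01 10:00:06',
#     ...                         '2014-01-01 10:00:12', '2014-01-01 10:05:00',
#     ...                         '2014-01-01 10:05:06'])
#     >>> contiguous_blocks(idx)
#     [(Timestamp('2014-01-01 10:00:00'), Timestamp('2014-01-01 10:00:12')),
#      (Timestamp('2014-01-01 10:05:00'), Timestamp('2014-01-01 10:05:06'))]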
def energy(series, unit='kwh'):
    """Returns a float representing the quantity of energy this
    channel consumed.

    If the input data has gaps then pre-process the data with
    `insert_zeros` before sending it to this function.

    Parameters
    ----------
    series : pd.Series or pd.DataFrame

    unit : {'kwh', 'joules'}

    Returns
    -------
    _energy : float

    See Also
    --------
    hours_on
    """
    # TODO: replace this evil hack to handle dataframes(!)
    if isinstance(series, pd.DataFrame):
        # use the first column (DataFrame.icol was removed from pandas)
        series = series.iloc[:, 0]

    timedelta = np.diff(series.index.values)
    timedelta_secs = timedelta64_to_secs(timedelta)
    joules = (timedelta_secs * series.values[:-1]).sum()

    if unit == 'kwh':
        JOULES_PER_KWH = 3600000
        _energy = joules / JOULES_PER_KWH
    elif unit == 'joules':
        _energy = joules
    else:
        raise ValueError('unrecognised value for `unit`.')

    return _energy
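# Illustrative usage sketch for `energy` (not part of the original module).
# A hypothetical channel drawing a constant 100 W, sampled once a minute;
# energy is integrated as power[i] * (t[i+1] - t[i]):
#
#     >>> import pandas as pd
#     >>> idx = pd.date_range('2014-01-01', periods=3, freq='60S')
#     >>> power = pd.Series([100.0, 100.0, 100.0], index=idx)
#     >>> energy(power, unit='joules')  # 2 intervals x 60 s x 100 W
#     12000.0
#     >>> energy(power, unit='kwh')     # 12000 J / 3,600,000 J per kWh
#     0.00333...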
def get_gap_starts_and_gap_ends(data, max_sample_period,
                                window_start=None, window_end=None):
    """
    Parameters
    ----------
    data : pandas.DataFrame or Series or DatetimeIndex

    max_sample_period : int or float
        Maximum allowed sample period in seconds.  This defines what
        counts as a 'gap'.

    window_start, window_end : pd.Timestamp
        The start and end of the window of interest.  If this window is
        larger than the duration of `data` then gaps will be appended to
        the front / back as necessary.  If this window is shorter than
        the duration of `data` then `data` will be cropped.

    Returns
    -------
    gap_starts, gap_ends : DatetimeIndex
    """
    # TODO: this might be a rather nasty hack to fix the circular dependency
    from nilmtk.preprocessing.electricity.single import reframe_index

    try:
        data = data.dropna()
    except AttributeError:
        # if data is a DatetimeIndex then it has no `dropna()` method
        pass

    index = _get_index(data)
    index = reframe_index(index, window_start, window_end)
    timedeltas_sec = timedelta64_to_secs(np.diff(index.values))
    overlong_timedeltas = timedeltas_sec > max_sample_period
    gap_starts = index[:-1][overlong_timedeltas]
    gap_ends = index[1:][overlong_timedeltas]

    return gap_starts, gap_ends
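# Illustrative usage sketch for `get_gap_starts_and_gap_ends` (not part of the
# original module; assumes `_get_index` returns the DataFrame's index and that
# `reframe_index` leaves it unchanged when no window is given).  With a
# hypothetical 6-second channel that stops reporting for ~5 minutes and
# max_sample_period=20 seconds, the gap is delimited by the last reading
# before it and the first reading after it:
#
#     >>> import pandas as pd
#     >>> idx = pd.DatetimeIndex(['2014-01-01 10:00:00', '2014-01-01 10:00:06',
#     ...                         '2014-01-01 10:05:00', '2014-01-01 10:05:06'])
#     >>> df = pd.DataFrame({'power': [100.0, 95.0, 90.0, 85.0]}, index=idx)
#     >>> gap_starts, gap_ends = get_gap_starts_and_gap_ends(df, max_sample_period=20)
#     >>> gap_starts[0], gap_ends[0]
#     (Timestamp('2014-01-01 10:00:06'), Timestamp('2014-01-01 10:05:00'))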
def insert_zeros(single_appliance_dataframe, sample_period_multiplier=4,
                 round_sample_period=True):
    """Find all gaps in `single_appliance_dataframe` longer than
    `max_sample_period` and insert a zero 1 sample period after the start
    of the gap and a second zero 1 sample period before the end of the
    gap.  In other words: "book-end" the gap with a zero at each end.

    Zeros are only inserted at the start of the gap if the gap starts
    with a reading above zero; and likewise for insertion of zeros at the
    end of the gap.

    Note that this function does not fill the entire gap with zeros.
    If you want that then try pandas.DataFrame.fillna.

    What is `insert_zeros` useful for?

    There are two possible reasons for lost samples in individual
    appliance data:

    1) a broken IAM (hence we do not have any information about the
       appliance)
    2) the IAM and appliance have been unplugged (hence we can infer
       that the appliance is off)

    Only the user can decide which of these two assumptions best fits
    their data.  insert_zeros is applicable only in case 2.

    For example, say a hoover's IAM is permanently attached to the
    hoover's power cord, even when the hoover is unplugged and put away
    in the cupboard.  Say the hoover was switched on when both the hoover
    and the hoover's IAM were unplugged.  This would result in the
    dataset having a gap immediately after an on-segment.  This
    combination of an on-segment followed (without any zeros) by a gap
    might confuse downstream statistics and disaggregation functions
    which assume that the power drawn by an appliance between reading[i]
    and reading[i+1] is held constant at reading[i] watts.

    TODO: a smarter version of this function might use information from
    the aggregate data to do a better job of estimating exactly when the
    appliance was turned off.

    Parameters
    ----------
    single_appliance_dataframe : pandas.DataFrame
        Data from a single appliance.

    sample_period_multiplier : float or int, optional
        default = 4.  Must be 4 or larger (to ensure we do not add zeros
        less than sample_period seconds apart).
        max_sample_period = sample_period x sample_period_multiplier.
        max_sample_period is the maximum permissible sample period (in
        seconds).  Any gap longer than `max_sample_period` is assumed to
        imply that the IAM and appliance are off.

    round_sample_period : bool, optional
        default = True.  Whether or not to round sample_period to the
        nearest int.

    Returns
    -------
    df_with_zeros : pandas.DataFrame
        A copy of `single_appliance_dataframe` with zeros inserted
        `max_sample_period` seconds after the last sample of each
        on-segment.
    """
    sample_period = get_sample_period(single_appliance_dataframe)
    if round_sample_period:
        sample_period = int(round(sample_period))
    max_sample_period = sample_period * sample_period_multiplier

    # Drop NaNs (because we want those to be gaps in the index)
    df = single_appliance_dataframe.dropna()

    # Get the length of time between each pair of consecutive samples. Seconds.
    timedeltas = np.diff(df.index.values) / np.timedelta64(1, 's')
    gaps_mask = timedeltas > max_sample_period
    readings_before_gaps = df[:-1][gaps_mask]
    readings_after_gaps = df[1:][gaps_mask]

    # We only add a 0 if the recorded value just before the gap is > 0
    readings_before_gaps = readings_before_gaps[
        readings_before_gaps.sum(axis=1) > 0]
    readings_after_gaps = readings_after_gaps[
        readings_after_gaps.sum(axis=1) > 0]

    # Find dates to insert zeros
    dates_to_insert_zeros_before_gaps = (
        readings_before_gaps.index + pd.DateOffset(seconds=sample_period))
    dates_to_insert_zeros_after_gaps = (
        readings_after_gaps.index - pd.DateOffset(seconds=sample_period))
    dates_to_insert_zeros = dates_to_insert_zeros_before_gaps.append(
        dates_to_insert_zeros_after_gaps)

    # Columns containing power
    power_columns = []
    non_power_columns = []
    for col in df.columns:
        try:
            physical_quantity = col.physical_quantity
        except AttributeError:
            # DualSupply
            physical_quantity = col.measurement.physical_quantity
        if physical_quantity == 'power':
            power_columns.append(col)
        else:
            non_power_columns.append(col)

    # Don't insert duplicate indices
    # TODO: remove this assert when we're confident the code is correct
    assert dates_to_insert_zeros.intersection(df.index).size == 0

    # Create new dataframe of zeros at new indices ready for insertion
    zeros = pd.DataFrame(data=0,
                         index=dates_to_insert_zeros,
                         columns=power_columns,
                         dtype=np.float32)

    # Check no zeros are closer than sample_period
    # TODO: remove this assert when we're confident the code is correct,
    # and also remove the sort_index().
    if len(zeros) > 1:
        zeros = zeros.sort_index()
        assert timedelta64_to_secs(np.diff(zeros.index.values).min()) > sample_period

    # Now take the median of non-power columns (like voltage)
    for measurement in non_power_columns:
        zeros[measurement] = single_appliance_dataframe[measurement].median()

    # Insert the dataframe of zeros into the data.
    # (DataFrame.append was removed from pandas, so use pd.concat.)
    df_with_zeros = deepcopy(single_appliance_dataframe)
    df_with_zeros = pd.concat([df_with_zeros, zeros])
    df_with_zeros = df_with_zeros.sort_index()

    # If the input data had a regular frequency then resample (taking the
    # mean, as the old `resample(rule)` default did), because appending
    # turns off the regular frequency.
    original_freq = single_appliance_dataframe.index.freq
    if original_freq is not None:
        df_with_zeros = df_with_zeros.resample(rule=original_freq).mean()

    return df_with_zeros
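# Schematic sketch of the intended effect of `insert_zeros` (not runnable as
# written: in real nilmtk data the column labels are Measurement objects that
# expose `physical_quantity`).  Suppose a hypothetical hoover and its IAM are
# unplugged together at 10:01:00 while drawing 1200 W and plugged back in at
# 10:30:00, with a 6-second sample period and sample_period_multiplier=4
# (so max_sample_period = 24 s):
#
#     before insert_zeros          after insert_zeros
#     10:00:54  1200 W             10:00:54  1200 W
#     10:01:00  1200 W             10:01:00  1200 W
#     <~29 minute gap>             10:01:06     0 W  <- zero 1 sample period after gap start
#     10:30:00   800 W             10:29:54     0 W  <- zero 1 sample period before gap end
#                                  10:30:00   800 W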
def _get_good_sections(df, sample_period):
    """
    Code copied from nilmtk[1]/nilmtk/stats/goodsections.py

    [1] https://github.com/nilmtk/nilmtk/
    """
    index = df.dropna().sort_index().index
    df_time_end = df.index[-1] + pd.Timedelta(seconds=sample_period)
    del df

    if len(index) < 2:
        return []

    timedeltas_sec = timedelta64_to_secs(np.diff(index.values))
    timedeltas_check = timedeltas_sec <= sample_period

    # Memory management
    del timedeltas_sec
    gc.collect()

    timedeltas_check = np.concatenate(
        [[False], timedeltas_check])
    # use the builtin `int` (np.int was removed from numpy)
    transitions = np.diff(timedeltas_check.astype(int))

    # Memory management
    last_timedeltas_check = timedeltas_check[-1]
    del timedeltas_check
    gc.collect()

    good_sect_starts = list(index[:-1][transitions == 1])
    good_sect_ends = list(index[:-1][transitions == -1])

    # Memory management
    last_index = index[-1]
    del index
    gc.collect()

    # Work out if this chunk ends with an open-ended good section
    if len(good_sect_ends) == 0:
        ends_with_open_ended_good_section = (
            len(good_sect_starts) > 0)
    elif len(good_sect_starts) > 0:
        # We have good_sect_ends and good_sect_starts
        ends_with_open_ended_good_section = (
            good_sect_ends[-1] < good_sect_starts[-1])
    else:
        # We have good_sect_ends but no good_sect_starts
        ends_with_open_ended_good_section = False

    if ends_with_open_ended_good_section:
        good_sect_ends += [df_time_end]

    assert len(good_sect_starts) == len(good_sect_ends)

    sections = [TimeFrame(start, end)
                for start, end in zip(good_sect_starts, good_sect_ends)
                if not (start == end and start is not None)]

    # Memory management
    del good_sect_starts
    del good_sect_ends
    gc.collect()

    return sections
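# Illustrative usage sketch for `_get_good_sections` (not part of the original
# module; `TimeFrame` comes from nilmtk and its repr is shown schematically).
# With a hypothetical 6-second channel containing a ~5-minute hole, the data
# splits into two good sections; the open-ended final section is closed one
# sample period after the last reading:
#
#     >>> import pandas as pd
#     >>> idx = pd.DatetimeIndex(['2014-01-01 10:00:00', '2014-01-01 10:00:06',
#     ...                         '2014-01-01 10:00:12', '2014-01-01 10:05:00',
#     ...                         '2014-01-01 10:05:06'])
#     >>> df = pd.DataFrame({'power': [100.0, 95.0, 90.0, 85.0, 80.0]}, index=idx)
#     >>> _get_good_sections(df, sample_period=6)
#     [TimeFrame('2014-01-01 10:00:00', '2014-01-01 10:00:12'),
#      TimeFrame('2014-01-01 10:05:00', '2014-01-01 10:05:12')]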