def mask_appliances_with_mains(electricity, sample_period_multiplier=4):
    """Find gaps in the first mains channel and blank out the same
    periods in all appliance data.

    The assumption is that if the mains channel is dead for any timeslice
    then we should ignore this timeslice for all appliance channels too.

    Parameters
    ----------
    electricity : Electricity object
        Must expose a dict-like ``mains`` attribute; only its first value
        is inspected for gaps.
    sample_period_multiplier : int, optional
        Default = 4
        max_sample_period = sample_period x sample_period_multiplier
        max_sample_period defines a 'gap'.

    Returns
    -------
    The result of ``apply_func_to_values_of_dicts`` applied to the
    ``'appliances'`` entry of `electricity` (described by the original
    author as a copy of electricity).

    .. warning:: currently only uses gaps from first mains dataframe and
       ignores all other mains dataframes.

    Notes
    -----
    NOTE(review): each appliance frame is assigned to in place unless it
    first has to be converted to float32; whether the caller's frames are
    protected depends on ``apply_func_to_values_of_dicts`` -- confirm.
    """
    # TODO: handle multiple mains channels and take intersection of gaps

    print("Masking appliances with mains... may take a little while...", end='')
    sys.stdout.flush()

    # `dict.values()` is a non-indexable view on Python 3, so take the
    # first value through an iterator (works on Python 2 as well).
    mains = next(iter(electricity.mains.values()))

    mains_sample_period = get_sample_period(mains)
    max_sample_period = mains_sample_period * sample_period_multiplier
    print("Mains sample period = {:.1f}, max_sample_period = {:.1f}"
          .format(mains_sample_period, max_sample_period))

    print("Getting gap starts and ends...")
    gap_starts, gap_ends = get_gap_starts_and_gap_ends(mains, max_sample_period)
    print("Found {:d} gap starts and {:d} gap ends.".format(len(gap_starts),
                                                            len(gap_ends)))

    def mask_appliances(appliance_df):
        """For each appliance dataframe, insert NaNs for any reading
        inside mains gaps.
        """
        print(".", end='')
        sys.stdout.flush()

        # Build one row mask covering every gap (both ends inclusive) so
        # the frame is assigned to once instead of once per gap.
        index = appliance_df.index
        in_gap = np.zeros(len(index), dtype=bool)
        for gap_start, gap_end in zip(gap_starts, gap_ends):
            in_gap |= np.asarray((index >= gap_start) & (index <= gap_end))

        # Nothing to mask: leave the frame (and its dtype) untouched.
        if not in_gap.any():
            return appliance_df

        try:
            appliance_df[in_gap] = np.nan
        except ValueError:
            # some DFs are int32, which can't accept NaNs, so convert to
            # float32:
            # TODO: remove this once #105 is fixed
            appliance_df = appliance_df.astype(np.float32)
            appliance_df[in_gap] = np.nan
        return appliance_df

    masked = apply_func_to_values_of_dicts(electricity, mask_appliances,
                                           ['appliances'])
    print("done")
    return masked
def train(self, building, aggregate='mains', submetered='appliances',
          disagg_features=None, environmental=None):
    """Train using 1d FHMM.

    Fits one 2-state Gaussian HMM per appliance, sorts each learnt model's
    parameters with `sort_learnt_parameters`, and combines the sorted
    models with `create_combined_hmm`.

    Parameters
    ----------
    building : object
        Must provide `utility.electric.get_dataframe_of_appliances` and
        `utility.electric.get_dataframe_of_mains`.
    aggregate : str, optional
        Not used by this implementation.
    submetered : str, optional
        Not used by this implementation.
    disagg_features : list of Measurement, optional
        Only the first element is used, since the algorithm is 1D.
        Defaults to ``[Measurement('power', 'active')]``.
    environmental : optional
        Not used by this implementation.

    Side effects
    ------------
    Sets `self.freq` (mains sample period as a string such as ``'60s'``),
    `self.individual` (per-appliance models) and `self.model` (the
    combined model).
    """
    # Build the default per call rather than sharing one mutable list
    # object across every invocation.
    if disagg_features is None:
        disagg_features = [Measurement('power', 'active')]

    # Since the algorithm is 1D, we need only the first Measurement
    measurement = disagg_features[0]
    train_appliances = building.utility.electric.get_dataframe_of_appliances(
        measurement=measurement)
    train_mains = building.utility.electric.get_dataframe_of_mains(
        measurement=measurement)

    # Setting frequency
    self.freq = str(int(get_sample_period(train_mains.index))) + 's'

    # The contiguous blocks depend only on the mains index, so compute
    # them once instead of once per appliance.
    blocks = contiguous_blocks(train_mains.index)

    learnt_model = OrderedDict()
    for appliance in train_appliances:
        print(appliance)
        learnt_model[appliance] = hmm.GaussianHMM(2, "full")

        # Data to fit: one (length, 1) array per contiguous block
        X = []
        for start, end in blocks:
            block_values = train_appliances[appliance][start:end].values
            length = block_values.size
            # Ignore small sequences
            if length > 50:
                X.append(block_values.reshape(length, 1))

        # Fit
        learnt_model[appliance].fit(X)

    # Combining to make a AFHMM
    new_learnt_models = OrderedDict()
    for appliance in learnt_model:
        fitted = learnt_model[appliance]
        startprob, means, covars, transmat = sort_learnt_parameters(
            fitted.startprob_, fitted.means_,
            fitted.covars_, fitted.transmat_)
        new_learnt_models[appliance] = hmm.GaussianHMM(
            startprob.size, "full", startprob, transmat)
        new_learnt_models[appliance].means_ = means
        new_learnt_models[appliance].covars_ = covars

    learnt_model_combined = create_combined_hmm(new_learnt_models)
    self.individual = new_learnt_models
    self.model = learnt_model_combined
def contiguous_blocks(datetimeindex):
    """Split `datetimeindex` into runs with no oversized time step.

    A break is any pair of consecutive timestamps whose spacing in
    seconds exceeds the value returned by `get_sample_period`.

    Parameters
    ----------
    datetimeindex : index of timestamps exposing `.values`

    Returns
    -------
    list of (start, end) tuples
        First and last timestamp of each contiguous run, in order.  When
        there are no breaks the list holds the single tuple
        ``(datetimeindex[0], datetimeindex[-1])``.
    """
    period = get_sample_period(datetimeindex)
    step_secs = timedelta64_to_secs(np.diff(datetimeindex.values))

    # Position i is listed when the step from sample i to sample i + 1 is
    # too long, i.e. sample i closes a block and sample i + 1 opens one.
    last_positions = np.where(step_secs > period)[0]

    blocks = []
    first_position = 0
    for last_position in last_positions:
        blocks.append((datetimeindex[first_position],
                       datetimeindex[last_position]))
        first_position = last_position + 1

    # Final block; this is also the only block when there are no breaks.
    blocks.append((datetimeindex[first_position], datetimeindex[-1]))
    return blocks
def insert_zeros(single_appliance_dataframe, sample_period_multiplier=4,
                 round_sample_period=True):
    """Find all gaps in `single_appliance_dataframe` longer than
    `max_sample_period` and insert a zero 1 sample period after the start
    of the gap and insert a second zero 1 sample period before the end of
    the gap.  In other words: "book-end" the gap with a zero at each end.

    Zeros are only inserted at the start of the gap if the gap starts with
    a reading above zero; and likewise for insertion of zeros at the end
    of the gap.

    Note that this function does not fill the entire gap with zeros, if
    you want that then try pandas.DataFrame.fillna

    What is `insert_zeros` useful for?  There are two possible reasons for
    lost samples in individual appliance data:

    1) a broken IAM (hence we do not have any information about the
       appliance)
    2) the IAM and appliance have been unplugged (hence we can infer that
       the appliance is off)

    Only the user can decide which of these two assumptions best fits
    their data.  insert_zeros is applicable only in case 2.

    For example, say a hoover's IAM is permanently attached to the
    hoover's power cord, even when the hoover is unplugged and put away in
    the cupboard.  Say the hoover was switched on when both the hoover and
    the hoover's IAM were unplugged.  This would result in the dataset
    having a gap immediately after an on-segment.  This combination of an
    on-segment followed (without any zeros) by a gap might confuse
    downstream statistics and disaggregation functions which assume that
    the power drawn by an appliance between reading[i] and reading[i+1] is
    held constant at reading[i] watts.

    TODO: a smarter version of this function might use information from
    the aggregate data to do a better job of estimating exactly when the
    appliance was turned off.

    Parameters
    ----------
    single_appliance_dataframe : pandas.DataFrame
        Data from a single appliance.

    sample_period_multiplier : float or int, optional
        default = 4.  Must be 4 or larger (to ensure we do not add zeros
        less than sample_period seconds apart).
        max_sample_period = sample_period x sample_period_multiplier.
        max_sample_period is the maximum permissible sample period (in
        seconds).  Any gap longer than `max_sample_period` is assumed to
        imply that the IAM and appliance are off.

    round_sample_period : bool, optional
        default = True.  Whether or not to round sample_period to the
        nearest int.

    Returns
    -------
    df_with_zeros : pandas.DataFrame
        A copy of `single_appliance_dataframe` with a zero inserted
        `sample_period` seconds after the last sample before each gap and
        `sample_period` seconds before the first sample after each gap.
        Non-power columns of the inserted rows hold that column's median.
        If the input index had a regular frequency the result is
        resampled (mean) back to that frequency.
    """
    sample_period = get_sample_period(single_appliance_dataframe)
    if round_sample_period:
        sample_period = int(round(sample_period))

    max_sample_period = sample_period * sample_period_multiplier

    # Drop NaNs (because we want those to be gaps in the index)
    df = single_appliance_dataframe.dropna()

    # Get the length of time between each pair of consecutive samples.
    # Seconds.
    timedeltas = np.diff(df.index.values) / np.timedelta64(1, "s")
    gaps_mask = timedeltas > max_sample_period
    readings_before_gaps = df[:-1][gaps_mask]
    readings_after_gaps = df[1:][gaps_mask]

    # we only add a 0 if the recorded value just before the gap is > 0
    # NOTE(review): the row sum runs over every column, including any
    # non-power columns -- confirm that is intended.
    readings_before_gaps = readings_before_gaps[
        readings_before_gaps.sum(axis=1) > 0]
    readings_after_gaps = readings_after_gaps[
        readings_after_gaps.sum(axis=1) > 0]

    # Find dates to insert zeros
    one_sample_period = pd.DateOffset(seconds=sample_period)
    dates_to_insert_zeros_before_gaps = (
        readings_before_gaps.index + one_sample_period)
    dates_to_insert_zeros_after_gaps = (
        readings_after_gaps.index - one_sample_period)
    dates_to_insert_zeros = dates_to_insert_zeros_before_gaps.append(
        dates_to_insert_zeros_after_gaps)

    # Columns containing power
    power_columns = []
    non_power_columns = []
    for col in df.columns:
        try:
            physical_quantity = col.physical_quantity
        except AttributeError:  # DualSupply
            physical_quantity = col.measurement.physical_quantity
        if physical_quantity == "power":
            power_columns.append(col)
        else:
            non_power_columns.append(col)

    # Don't insert duplicate indicies
    # TODO: remove this assert when we're confident the code is correct
    # `Index.intersection` is used because `&` between indexes is no
    # longer a set operation in recent pandas.
    assert dates_to_insert_zeros.intersection(df.index).size == 0

    # Create new dataframe of zeros at new indicies ready for insertion
    zeros = pd.DataFrame(data=0, index=dates_to_insert_zeros,
                         columns=power_columns, dtype=np.float32)

    # Check no zeros are closer than sample_period
    # TODO: remove this assert when we're confident the code is correct
    #       also remove the sort_index().
    if len(zeros) > 1:
        zeros = zeros.sort_index()
        assert timedelta64_to_secs(
            np.diff(zeros.index.values).min()) > sample_period

    # Now, take median of non-power columns (like voltage)
    for measurement in non_power_columns:
        zeros[measurement] = single_appliance_dataframe[measurement].median()

    # Insert the dataframe of zeros into the data.  `pd.concat` returns a
    # new frame (the input is left untouched) and replaces
    # `DataFrame.append`, which was removed in pandas 2.0.
    df_with_zeros = pd.concat([single_appliance_dataframe, zeros])
    df_with_zeros = df_with_zeros.sort_index()

    # If input data had a regular frequency then resample
    # because appending turns off the regular frequency.
    original_freq = single_appliance_dataframe.index.freq
    if original_freq is not None:
        resampled = df_with_zeros.resample(rule=original_freq)
        # Very old pandas returns the mean-resampled frame directly;
        # pandas >= 0.18 returns a deferred Resampler that must be
        # aggregated explicitly.
        if not isinstance(resampled, pd.DataFrame):
            resampled = resampled.mean()
        df_with_zeros = resampled

    return df_with_zeros