def main():
    """Query electric-sensor readings for the IDEAL "gold" homes and store
    them locally, then re-read the stored data as a sanity check.

    Side effects: writes 'store_data_locally.log' in the working directory
    and stores metadata plus per-sensor readings under ``--data_path``.
    """
    # set up logging to a dedicated file
    logger = logging.getLogger('store_data_locally')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler('store_data_locally.log')
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # get arguments
    parser = argparse.ArgumentParser(
        # BUG FIX: the original implicit string concatenation was missing a
        # space and rendered "...databaseand store locally."
        description='Query sensor readings from the IDEAL database '
                    'and store locally.')
    parser.add_argument('--dataset_path',
                        help='directory of the original IDEAL dataset')
    parser.add_argument('--data_path', default=LOCAL_DATA_DIR,
                        help='directory to store data')
    args = parser.parse_args()

    # store metadata locally
    converter = IdealCSV2Hdf5(args.dataset_path, data_dir=args.data_path)
    converter.store_metadata()

    # select sensthe relevant sensorids: electric sensors in gold homes
    with MetaDataStore(data_dir=args.data_path) as s:
        metadata = MetaData(s)
        sensors = metadata.sensor_merged()
        # boolean mask directly; the original built a redundant all-False
        # Series first and OR-ed into it
        indices = sensors.sensorid.isin(metadata.electric_sensors()) \
            & sensors.homeid.astype(int).isin(metadata.gold_homes())
        sensorids_to_store = sensors.sensorid[indices]

    print('Query and store readings from {0} sensors'.format(
        len(sensorids_to_store)))
    # reuse the converter built above; the original re-constructed an
    # identical IdealCSV2Hdf5 on every iteration
    for idx, sensorid in enumerate(sensorids_to_store):
        logger.info('(%d/%d) Sensorid: %s',
                    idx + 1, len(sensorids_to_store), sensorid)
        converter.store_readings(sensorid)

    # try and read the stored data back; always release the store,
    # even if a read fails (the original leaked the open store)
    readings_store = ReadingDataStore(data_dir=args.data_path)
    try:
        readings_count = 0
        for sensorid in sensorids_to_store:
            readings = readings_store.get_sensor_readings(sensorid)
            readings_count += len(readings)
        logger.info('Total readings : {0}'.format(readings_count))
    finally:
        readings_store.close()
def get_home_readings(self, homeid, merge_mains_clamps=True, oem_mains_readings=True, unusable_sensors=None, appliance_readings=True, cutoff_date=None):
    """Get processed and merged readings from locally stored reading data.

    Must run store_gold_elec_data_locally.py before calling this method.

    :param homeid: int, homeid of the home for which to retrieve readings
    :param merge_mains_clamps: bool, merge 30A/100A clamp readings into one
        mains column (passed through to process_mains_clamp)
    :param oem_mains_readings: bool, also join OEM real-power mains readings
    :param unusable_sensors: optional iterable of sensorids to exclude; when
        None they are derived from 'anomalous_sensors.csv' (rows whose
        starttime AND endtime are both missing, i.e. unusable for all time)
    :param appliance_readings: bool, also join OEM/zwave appliance readings
    :param cutoff_date: optional; drop readings at or before this time
    :return: DataFrame, processed readings for electrical mains and appliances
    """
    anomalous_sensors = None
    if unusable_sensors is None:
        anomalous_sensors = pd.read_csv(
            'anomalous_sensors.csv',
            dtype={
                'homeid': np.int32,
                'sensorid': np.int32,
                'notes': str
            },
            parse_dates=['starttime', 'endtime'])
        # BUG FIX: `== pd.NaT` is always False (NaT never compares equal),
        # so the original selection was always empty; use isna() to pick
        # sensors with neither a start nor an end time.
        unusable_sensors = anomalous_sensors[
            anomalous_sensors.starttime.isna()
            & anomalous_sensors.endtime.isna()].sensorid.values

    # get metadata and readings store
    with MetaDataStore() as s:
        metadata = MetaData(s)
    reading_store = ReadingDataStore()
    try:
        # sensors whose readings are already folded into a merged sensor
        duplicated_sensors = [
            u for v in self.sensors_to_merge.values() for u in v
        ]
        sensors = metadata.sensor_merged()
        indices = sensors['sensorid'].isin(reading_store.get_sensorids())\
            & (sensors['homeid'] == homeid)\
            & ~sensors.sensorid.isin(duplicated_sensors)
        indices = indices & ~sensors.sensorid.isin(unusable_sensors)
        sensors = sensors.loc[indices]

        # get sensorids of the two mains clamps (each Series has 0 or 1 rows)
        mains_30A_sensorid, mains_100A_sensorid = [
            sensors.sensorid[sensors.sensorid.isin(ids)] for ids in [
                metadata.mains_30A_rms_sensors(),
                metadata.mains_100A_rms_sensors()
            ]
        ]

        # empty stand-in used when a clamp sensor is absent
        dummy_readings = pd.DataFrame(
            columns=['time', 'value', 'tenths_seconds_since_last_reading'])
        dummy_readings['time'] = dummy_readings['time'].astype(
            'datetime64[ns]')

        # get apparent power readings
        mains_30A_readings, mains_100A_readings = [
            self.get_sensor_readings(
                int(sid), reading_store.get_sensor_readings,
                anomalous_sensors)
            if (sid.shape[0] == 1) else dummy_readings.copy()
            for sid in [mains_30A_sensorid, mains_100A_sensorid]
        ]
        if cutoff_date is not None:
            mains_30A_readings, mains_100A_readings = [
                readings[readings.time > cutoff_date]
                for readings in [mains_30A_readings, mains_100A_readings]
            ]
        readings_processed = self.process_mains_clamp(
            mains_30A_readings,
            mains_100A_readings,
            merge=merge_mains_clamps)
        del mains_30A_readings, mains_100A_readings

        # columns subject to OEM flatline/NaN masking below
        oem_sensors = []
        if appliance_readings:
            # get oem and zwave appliance readings
            oem_appliances = metadata.appliance_oem_sensors()
            indices = oem_appliances.sensorid.isin(sensors.sensorid)
            oem_appliances = oem_appliances[indices]
            oem_sensors.extend(list(oem_appliances.appliancetype.values))
            zwave_appliances = metadata.appliance_zwave_sensors()
            indices = zwave_appliances.sensorid.isin(sensors.sensorid)
            zwave_appliances = zwave_appliances[indices]
            for appliances, readings_processor in zip(
                    [oem_appliances, zwave_appliances], [
                        self.process_oem_appliance_readings,
                        self.process_zwave_readings
                    ]):
                for index, row in appliances.iterrows():
                    readings = self.get_sensor_readings(
                        int(row.sensorid),
                        reading_store.get_sensor_readings,
                        anomalous_sensors)
                    if cutoff_date is not None:
                        readings = readings[readings.time > cutoff_date]
                    readings = readings_processor(readings)
                    readings.rename(columns={'power': row.appliancetype},
                                    inplace=True)
                    # merge multiple appliances of same type: sum where both
                    # have data, fall back to the new readings where the sum
                    # is NaN
                    if row.appliancetype in readings_processed.columns:
                        summed = readings_processed[row.appliancetype] \
                            + readings[row.appliancetype]
                        # BUG FIX: the original called fillna() and discarded
                        # the result; assign it back so the backfill happens.
                        readings_processed[row.appliancetype] = summed.fillna(
                            readings[row.appliancetype])
                    else:
                        readings_processed = readings_processed.join(
                            readings, how='left')
                    del readings
                    gc.collect()

        if oem_mains_readings:
            # get oem mains (real power) readings
            mains_oem_sensorid = sensors.sensorid[sensors.sensorid.isin(
                metadata.mains_oem_sensors())]
            if len(mains_oem_sensorid) == 1:
                # (the original repeated the shape == 1 check in a dead
                # conditional here; it is already guaranteed by the guard)
                mains_oem_readings = self.get_sensor_readings(
                    int(mains_oem_sensorid),
                    reading_store.get_sensor_readings, anomalous_sensors)
                if cutoff_date is not None:
                    mains_oem_readings = mains_oem_readings[
                        mains_oem_readings.time > cutoff_date]
                mains_oem_readings = self.process_oem_mains_readings(
                    mains_oem_readings)
                readings_processed = readings_processed.join(
                    mains_oem_readings, how='left')
                del mains_oem_readings
                readings_processed.rename(columns={'power': 'mains_real'},
                                          inplace=True)
                oem_sensors.append('mains_real')

                # replace OEM flatlines with NaN across all OEM-derived
                # columns (np.nan: the np.NaN alias is gone in NumPy 2.0)
                if readings_processed.shape[0] > 0:
                    oem_flat_periods = self.find_oem_flatline(
                        readings_processed.mains_real)
                    for start_time, period in oem_flat_periods.iterrows():
                        end_time = start_time + period.duration
                        readings_processed.loc[start_time:end_time,
                                               oem_sensors] = np.nan
                    readings_processed.loc[
                        readings_processed.mains_real.isnull(),
                        oem_sensors] = np.nan
    finally:
        # BUG FIX: close the store even when an exception is raised; the
        # original leaked the open store on any failure above.
        reading_store.close()

    return readings_processed