def _convert_one_sec_data(ukdale_path, store, ac_type_map):
    ids_of_one_sec_data = [
        identifier for identifier, ac_types in ac_type_map.items()
        if ac_types == ['active', 'apparent']
    ]
    if not ids_of_one_sec_data:
        return

    for identifier in ids_of_one_sec_data:
        key = Key(building=identifier[0], meter=identifier[1])
        print("Loading 1-second data for", key, "...")
        house_path = 'house_{:d}'.format(key.building)
        filename = join(ukdale_path, house_path, 'mains.dat')
        df = _load_csv(filename, ONE_SEC_COLUMNS, TZ)
        store.put(str(key), df)

        # Set 'disabled' metadata attributes.
        # TODO: needs to use `nilmtk.DataStore` API rather than grabbing
        # the `pd.HDFStore` directly.
        group = store.store._handle.get_node('/building{:d}'.format(
            key.building))
        metadata = group._f_getattr('metadata')
        metadata['elec_meters'][key.meter]['disabled'] = True
        group._f_setattr('metadata', metadata)
        store.store.flush()

    store.close()
def _convert(input_path, data_store, measurement_mapping_func,
             sort_index=True, drop_duplicates=False):
    meter_to_machine = {
        1: "MainTerminal",
        2: "ChipPress",
        3: "ChipSaw",
        4: "HighTemperatureOven",
        5: "PickAndPlaceUnit",
        6: "ScreenPrinter",
        7: "SolderingOven",
        8: "VacuumOven",
        9: "VacuumPump1",
        10: "VacuumPump2",
        11: "WashingMachine",
    }

    check_directory_exists(input_path)

    print("Loading factory 1...", end="... ")
    chans = _find_all_channels(input_path, meter_to_machine)
    for chan_id, filename in chans.items():
        print(chan_id, end=" ")
        stdout.flush()
        key = Key(building=1, meter=chan_id)
        measurements = measurement_mapping_func(chan_id)
        df = _load_csv(filename, measurements,
                       sort_index=sort_index,
                       drop_duplicates=drop_duplicates)
        data_store.put(str(key), df)
    print()
def convert_ampds(input_path, output_filename, format='HDF'):
    """
    Convert AMPds R2013 as seen on Dataverse. Download the files as CSVs
    and put them in the `input_path` folder for conversion.

    Download URL:
    https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO

    Parameters
    ----------
    input_path : str
        The path of the directory where all the CSV files are stored.
    output_filename : str
        The path of the h5 file where all the standardized data should go.
        The path should refer to a particular file, not just a directory,
        in order for this to work.
    format : str
        Defaults to 'HDF'.

    Example usage
    -------------
    convert_ampds('/AMPds/electricity', 'store.h5')
    """
    check_directory_exists(input_path)
    files = [
        f for f in listdir(input_path)
        if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f
    ]
    # Sort lexicographically
    files.sort()

    # Remove Whole Home (WHE) and put it at the top so it becomes meter 1
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format, mode='w')
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...')
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True)
        df = df.drop(columns=[TIMESTAMP_COLUMN_NAME])
        df = df.tz_convert(TIMEZONE)
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))

    store.close()
    metadata_path = join(_get_module_directory(), 'metadata')
    print('Processing metadata...')
    convert_yaml_to_hdf5(metadata_path, output_filename)
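# A minimal usage sketch for convert_ampds(), assuming nilmtk is installed and
# the AMPds R2013 CSVs (WHE.csv, B1E.csv, ...) have already been downloaded.
# The paths below are hypothetical placeholders.
from nilmtk import DataSet

convert_ampds('/data/AMPds/electricity', '/data/ampds.h5')

ampds = DataSet('/data/ampds.h5')       # re-open the converted store
elec = ampds.buildings[1].elec          # MeterGroup for the single building
print(elec.mains())                     # WHE.csv is inserted first, so it becomes meter 1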
def _convert(input_path, store, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REFIT dataset.
    store : DataStore
        The NILMTK DataStore object.
    tz : str
        Timezone e.g. 'US/Eastern'
    sort_index : bool
    """
    check_directory_exists(input_path)

    # Iterate though all houses and channels
    # house 14 is missing!
    houses = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19,
              20, 21]
    nilmtk_house_id = 0
    for house_id in houses:
        nilmtk_house_id += 1
        print("Loading house", house_id, end="... ")
        stdout.flush()
        csv_filename = join(input_path, 'House_' + str(house_id) + '.csv')
        # The clean version already includes the header, so we just skip the
        # text version of the timestamp
        usecols = ['Unix', 'Aggregate', 'Appliance1', 'Appliance2',
                   'Appliance3', 'Appliance4', 'Appliance5', 'Appliance6',
                   'Appliance7', 'Appliance8', 'Appliance9']

        df = _load_csv(csv_filename, usecols, tz)
        if sort_index:
            df = df.sort_index()  # might not be sorted...
        chan_id = 0
        for col in df.columns:
            chan_id += 1
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=nilmtk_house_id, meter=chan_id)

            chan_df = pd.DataFrame(df[col])
            chan_df.columns = pd.MultiIndex.from_tuples([('power', 'active')])

            # Modify the column labels to reflect the power measurements recorded.
            chan_df.columns.set_names(LEVEL_NAMES, inplace=True)

            store.put(str(key), chan_df)
        print('')
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') any_file_converted = False for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) if not os.path.isfile(filename_attribute): # File not found directly in the combed_path provided # Try adding 'iiitd' to it filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns]) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) any_file_converted = True if not any_file_converted: raise RuntimeError('No files converted, did you specify the correct path?') convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def _process_meter_in_chunk(nilmtk_house_id, meter_id, chunk, store, appliance_code):

    data = chunk['data'].values
    index = chunk['datetime']
    df = pd.DataFrame(data=data, index=index)
    df.columns = pd.MultiIndex.from_tuples([('power', 'active')])

    # Modify the column labels to reflect the power measurements recorded.
    df.columns.set_names(LEVEL_NAMES, inplace=True)
    df = df.sort_index()

    key = Key(building=nilmtk_house_id, meter=meter_id)
    store.append(str(key), df)
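# A hedged sketch of how _process_meter_in_chunk() might be driven by a chunked
# CSV read, so very large meter files never have to fit in memory at once.
# The file name, column names and chunk size are illustrative assumptions only.
def _convert_meter_file(csv_filename, store, nilmtk_house_id, meter_id,
                        appliance_code, chunksize=1_000_000):
    reader = pd.read_csv(csv_filename,
                         usecols=['datetime', 'data'],
                         parse_dates=['datetime'],
                         chunksize=chunksize)
    for chunk in reader:
        # Each chunk is appended to the same key, which is why the helper
        # uses store.append() rather than store.put().
        _process_meter_in_chunk(nilmtk_house_id, meter_id, chunk, store,
                                appliance_code)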
def _convert(csv_filename, store, tz, sort_index=True):
    """
    Parameters
    ----------
    csv_filename : str
        The csv_filename that will be loaded. Must end with .csv
    store : DataStore
        The NILMTK DataStore object.
    tz : str
        Timezone e.g. 'Europe/Amsterdam'
    sort_index : bool
    """
    # Iterate though all houses and channels
    houses = [1]
    nilmtk_house_id = 0
    for house_id in houses:
        nilmtk_house_id += 1
        print("Loading house", house_id, end="... ")
        stdout.flush()
        usecols = ['Timestamp', 'mains', 'television', 'fan', 'fridge',
                   'laptop computer', 'electric heating element', 'oven',
                   'unknown', 'washing machine', 'microwave', 'toaster',
                   'sockets', 'cooker']

        df = _load_csv(csv_filename, usecols, 3, tz)
        if sort_index:
            df = df.sort_index()  # might not be sorted...
        chan_id = 0
        for col in df.columns:
            chan_id += 1
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=nilmtk_house_id, meter=chan_id)

            chan_df = pd.DataFrame(df[col])
            chan_df.columns = pd.MultiIndex.from_tuples([('power', 'apparent')])

            # Modify the column labels to reflect the power measurements recorded.
            chan_df.columns.set_names(LEVEL_NAMES, inplace=True)

            store.put(str(key), chan_df)
        print('')
def _convert(input_path, store, measurement_mapping_func, tz, sort_index=True, drop_duplicates=False): """ Parameters ---------- input_path : str The root path of the DEPS dataset. store : DataStore The NILMTK DataStore object. measurement_mapping_func : function Must take these parameters: - classroom_id - chan_id Function should return a list of tuples e.g. [('power', 'active')] tz : str Timezone e.g. 'US/Eastern' sort_index : bool Defaults to True drop_duplicates : bool Remove entries with duplicated timestamp (keeps the first value) Defaults to False for backwards compatibility. """ check_directory_exists(input_path) # Iterate though all classrooms and channels classrooms = _find_all_classrooms(input_path) for classroom_id in classrooms: print("Loading data from 'Aula 2.2 Bis' to classroom N°", classroom_id, end=" ... Loading channels ") stdout.flush() chans = _find_all_chans(input_path, classroom_id) for chan_id in chans: print(chan_id, end=" ") stdout.flush() key = Key(building=classroom_id, meter=chan_id) measurements = measurement_mapping_func(classroom_id, chan_id) csv_filename = _get_csv_filename(input_path, key) df = _load_csv(csv_filename, measurements, tz, sort_index=sort_index, drop_duplicates=drop_duplicates) store.put(str(key), df) print()
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name, load_name,
                                              load_mapping_path, "%s.csv" % attribute)
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute, header=0,
                                         names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop(columns=["timestamp"])
                        dfs.append(df)
                    else:
                        exists = False
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.rename(columns=lambda x: column_mapping[x], inplace=True)
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)

    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting COMBED to HDF5!")
def convert_greend(greend_path, hdf_filename): """ Parameters ---------- greend_path : str The root path of the greend dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') houses = sorted(__get_houses(greend_path)) print(houses) h = 1 for house in houses: print('loading '+house+"'s house...") abs_house = join(greend_path, house) dates = [d for d in listdir(abs_house) if d.startswith('dataset')] house_data = pd.DataFrame() for date in dates: print('-----------------------',date) tmp_pandas = pd.DataFrame.from_csv(join(abs_house, date)) tmp_pandas = tmp_pandas[tmp_pandas.index != 'timestamp'] tmp_pandas = tmp_pandas.sort_index() c = 0 tmp_pandas.index = [__timestamp(t) for t in tmp_pandas.index] house_data = house_data.append(tmp_pandas) #for testing metadata files: #break m = 1 for meter in house_data: print("meter" + str(m)+': ') key = Key(building = h, meter=m) print("Putting into store...") store.put(str(key), house_data[meter], format = 'table') m += 1 print('Flushing store...') store.flush() h += 1 store.close() #needs to be edited convert_yaml_to_hdf5('/path/to/metadata', hdf_filename)
def _convert_one_sec_data(ukdale_path, store, ac_type_map, drop_duplicates):
    ids_of_one_sec_data = [
        identifier for identifier, ac_types in iteritems(ac_type_map)
        if ac_types == ['active', 'apparent']]

    if not ids_of_one_sec_data:
        return

    for identifier in ids_of_one_sec_data:
        key = Key(building=identifier[0], meter=identifier[1])
        print("Loading 1-second data for", key, "...")
        house_path = 'house_{:d}'.format(key.building)
        filename = join(ukdale_path, house_path, 'mains.dat')
        df = _load_csv(filename, ONE_SEC_COLUMNS, TZ,
                       drop_duplicates=drop_duplicates)
        store.put(str(key), df)

    store.close()
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """
    check_directory_exists(iawe_path)
    idx = pd.date_range(start=START_DATETIME, end=END_DATETIME, freq=FREQ)
    idx = idx.tz_localize('GMT').tz_convert(TIMEZONE)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 12):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename, dtype=np.float64, na_values='\\N')
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(columns=[TIMESTAMP_COLUMN_NAME])
        df.columns = pd.MultiIndex.from_tuples(
            [column_mapping[x] for x in df.columns],
            names=LEVEL_NAMES)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        df = df.resample("1T").mean()
        df = reindex_fill_na(df, idx)
        assert df.isnull().sum().sum() == 0
        store.put(str(key), df)
    store.close()

    metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata')
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    print("Done converting iAWE to HDF5!")
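# reindex_fill_na() is used above but not defined in this snippet. A plausible
# sketch, assuming the intent is to snap every meter onto the common minute
# grid `idx`, zero-fill missing power readings, and median-fill the other
# physical quantities (voltage, current, ...), is:
def reindex_fill_na(df, idx):
    df = df.reindex(idx)
    for col in df.columns:
        if col[0] == 'power':
            df[col] = df[col].fillna(0)   # gaps in power are treated as "off"
        else:
            df[col] = df[col].fillna(df[col].median())
    return df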
def main(): parser = argparse.ArgumentParser() parser.add_argument('inpath', help='input directory (ANTgen output)', nargs='?', default='../output') parser.add_argument('outfile', help='output file (HDF5 file)', nargs='?', default='../output/ANTgen.h5') args = parser.parse_args() if not os.path.exists('metadata') or not os.path.isfile('metadata/building1.yaml'): print("No metadata found. Please run 'generate_metadata.py' before using this tool...") exit(1) print("Converting ANTgen output from '{}' to file '{}'".format(args.inpath, args.outfile)) with open('metadata/building1.yaml', 'r') as f: yaml_dict = yaml.load(f, Loader=yaml.FullLoader) channel_list = ['total'] # pre-populate with aggregate data (total.csv) for app in yaml_dict['appliances']: channel_list.append(app['original_name']) store = get_datastore(args.outfile, 'HDF', mode='w') for i, app_name in enumerate(channel_list): print("Adding virtual meter ID {:02d}: {}".format(1+i, app_name)) key = Key(building=1, meter=(i + 1)) csvfile = os.path.join(args.inpath, str(app_name)+'.csv') try: df = pd.read_csv(csvfile, sep=';', encoding='utf-8', index_col=0) df.columns = pd.MultiIndex.from_tuples([('power', 'active') for x in df.columns], names=LEVEL_NAMES) df.index = pd.to_datetime(df.index) tz_naive = df.index tz_aware = tz_naive.tz_localize(tz='Europe/Vienna', ambiguous=True, nonexistent=pd.Timedelta('1H')) df.index = tz_aware df = df.tz_convert('Europe/Vienna') store.put(str(key), df) except FileNotFoundError: print("Input file '{}' not found - your HDF5 file will be incomplete!".format(csvfile)) continue print('Adding metadata...') convert_yaml_to_hdf5('metadata/', args.outfile)
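# The ANTgen converter above only defines main(); a conventional entry point,
# assuming the script is run directly (e.g.
# `python convert_antgen.py ../output ../output/ANTgen.h5`), would be:
if __name__ == '__main__':
    main()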
def convert_caxe(file_path):
    '''
    Parameters
    ----------
    file_path : str
        Path of the CSV file to be converted. Its data columns should contain
        the following values, in order:
        timestamp, reactive_power, apparent_power, current, frequency,
        voltage, active_power.

    Converts it into HDF5 format and saves it as test.h5.
    '''
    df = pd.read_csv(f'{file_path}',
                     names=['timestamp', 'R', 'A', 'C', 'F', 'V', 'T'])
    column_mapping = {
        'F': ('frequency', ""),
        'V': ('voltage', ""),
        'T': ('power', 'active'),
        'C': ('current', ''),
        'R': ('power', 'reactive'),
        'A': ('power', 'apparent'),
    }

    output_filename = 'test.h5'

    # Open data store
    store = get_datastore(output_filename, format='HDF', mode='w')
    key = Key(building=1, meter=1)
    print('Loading ', 1)
    df.index = pd.to_datetime(df.timestamp.values)
    # Naive timestamps must be localized; tz-aware ones only need converting.
    if df.index.tz is None:
        df = df.tz_localize(TIMEZONE)
    else:
        df = df.tz_convert(TIMEZONE)
    df = df.drop(columns=[TIMESTAMP_COLUMN_NAME])
    df.columns = pd.MultiIndex.from_tuples(
        [column_mapping[x] for x in df.columns],
        names=LEVEL_NAMES)
    df = df.apply(pd.to_numeric, errors='ignore')
    df = df.dropna()
    df = df.astype(np.float32)
    df = df.sort_index()
    df = df.resample("1T").mean()
    assert df.isnull().sum().sum() == 0
    store.put(str(key), df)
    store.close()

    convert_yaml_to_hdf5('./metadata', output_filename)
    print("Done converting test data to HDF5!")
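# A hedged usage sketch for convert_caxe(), assuming a headerless CSV laid out
# as described in the docstring. The file name below is a placeholder and the
# module-level TIMEZONE constant is expected to be set (e.g. 'Asia/Kolkata').
convert_caxe('caxe_meter_dump.csv')   # writes test.h5 next to the script

from nilmtk import DataSet
caxe = DataSet('test.h5')
print(caxe.buildings[1].elec)         # single building, single meter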
def _convert(input_path, store, tz): """ Parameters ---------- input_path : str The root path of the ENERTALK dataset. store : DataStore The NILMTK DataStore object. tz : str Timezone e.g. 'Asia/Seoul' """ house_list = [ fname for fname in listdir(input_path) if not fname.startswith('.') ] date_count = 0 for house in house_list: date_list = sorted(listdir(join(input_path, house))) date_count += len(date_list) with tqdm(total=date_count) as pbar: for house in house_list: date_list = sorted(listdir(join(input_path, house))) for date in date_list: fname_list = sorted(listdir(join(input_path, house, date))) for fname in fname_list: file_path = join(input_path, house, date, fname) chan_df = _load_parquet(file_path) house_id = int(house) + 1 chan_id = int(fname.split('_')[0]) + 1 key = Key(building=house_id, meter=chan_id) chan_df.columns = pd.MultiIndex.from_tuples([ ('power', 'active'), ('power', 'reactive') ]) chan_df.columns.set_names(LEVEL_NAMES, inplace=True) if str(key) in store._keys(): store.append(str(key), chan_df) else: store.put(str(key), chan_df) pbar.update(1)
def convert(inputPath, hdfFilename, metadataPath='/'):
    '''
    Parameters
    ----------
    inputPath : str
        The path of the directory where all the csv files are stored.
    hdfFilename : str
        The path of the h5 file where all the standardized data should go.
        The path should refer to a particular file and not just a directory
        in order for this to work.
    metadataPath : str
        The path of the directory where the metadata is stored.
        By default, it is the root directory.
    '''
    # This function contains the bulk of the code. The test() function can
    # simply be ignored for now.
    # To do: Complete the metadata set. Then the convert_yaml_to_hdf5()
    # function will stop throwing random errors.
    files = [
        f for f in listdir(inputPath)
        if isfile(join(inputPath, f)) and '.csv' in f and '.swp' not in f
    ]
    print(files)
    assert isdir(inputPath)
    store = HDFStore(hdfFilename)

    for i in range(len(files)):
        sent = files[i]
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), '. Please wait...')
        fp = pd.read_csv(join(inputPath, sent))
        fp.TS = fp.TS.astype('int')
        fp.index = pd.to_datetime((fp.TS.values * 1e9).astype(int))
        # fp = fp.tz_convert('Asia/Kolkata')
        fp = fp.drop(columns=['TS'])
        fp.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        fp.columns.set_names(LEVEL_NAMES, inplace=True)
        # convert_objects() was removed from pandas; coerce to numeric instead.
        fp = fp.apply(pd.to_numeric, errors='coerce')
        fp = fp.dropna()
        fp = fp.astype(np.float32)
        store.put(str(key), fp, format='Table')
        store.flush()
        print("Done with file #", (i + 1))
    store.close()
def convert_combed(combed_path, hdf_filename):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    assert isdir(combed_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    chan = 1
    for building, meter_array in SUBMETER_PATHS.items():
        for meter in meter_array:
            key = Key(building=1, meter=chan)
            dfs = []
            total = pd.DataFrame()
            for attribute in column_mapping.keys():
                filename_attribute = join(combed_path, building, str(meter),
                                          "%s.csv" % attribute)
                print(filename_attribute)
                dfs.append(pd.read_csv(filename_attribute, parse_dates=True,
                                       index_col=0, header=0,
                                       names=[attribute]))
            total = pd.concat(dfs, axis=1)
            total.rename(columns=lambda x: column_mapping[x], inplace=True)
            total.columns.set_names(LEVEL_NAMES, inplace=True)
            store.put(str(key), total, format='table')
            store.flush()
            chan = chan + 1

    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)
    print("Done converting COMBED to HDF5!")
def _convert(connection, dest_file, start_date, end_date, tz, sort_index=True):
    """
    Parameters
    ----------
    connection : Connection
        Connection to the DEDDIAG database
    dest_file : DataStore
        The NILMTK DataStore object
    start_date : str
        Start of the date range to request from the database
    end_date : str
        End of the date range to request from the database
    tz : str
        Timezone e.g. 'Europe/Berlin'
    sort_index : bool
        Defaults to True
    """
    # NOTE: `house_nr`, `channels` and `measurements_conf` are assumed to be
    # provided by the enclosing converter; this snippet does not define them.
    print(f"Loading house {house_nr}", end="... ")
    stdout.flush()

    # Find all houses and channels
    for channel in channels:
        print(f"{channel}", end=" ")
        stdout.flush()

        measurements = MeasurementsExpanded(channel, start_date,
                                            end_date).request(connection)
        measurements.drop(columns='item_id', inplace=True)
        measurements['time'] = pd.to_datetime(measurements['time'], utc=True, unit='s')
        measurements.set_index('time', inplace=True)

        # Set index and columns as LEVEL_NAMES
        measurements = measurements.tz_convert(tz)
        # measurements_conf = [['power'], ['active']]
        measurements.columns = pd.MultiIndex.from_arrays(measurements_conf,
                                                         names=LEVEL_NAMES)

        if sort_index:
            measurements.sort_index(inplace=True)

        key = Key(building=house_nr, meter=channel)

        # Write data
        dest_file.put(str(key), measurements)
def convert_iawe(iawe_path, hdf_filename): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ assert isdir(iawe_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 13): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename) df.index = pd.to_datetime((df.timestamp.values * 1E9).astype(int), utc=True) df = df.tz_convert('Asia/Kolkata') df = df.drop('timestamp', 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.convert_objects(convert_numeric=True) df = df.dropna() df = df.astype(np.float32) df = df.sort_index() store.put(str(key), df, format='table') store.flush() store.close() convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename) print("Done converting iAWE to HDF5!")
def _convert(input_path, hdf_filename, measurement_mapping_func, tz): """ Parameters ---------- input_path : str The root path of the REDD low_freq dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). measurement_mapping_func : function Must take these parameters: - house_id - chan_id Function should return a list of tuples e.g. [('power', 'active')] tz : str Timezone e.g. 'US/Eastern' """ assert isdir(input_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') # Iterate though all houses and channels houses = _find_all_houses(input_path) for house_id in houses: print("Loading house", house_id, end="... ") stdout.flush() chans = _find_all_chans(input_path, house_id) for chan_id in chans: print(chan_id, end=" ") stdout.flush() key = Key(building=house_id, meter=chan_id) measurements = measurement_mapping_func(house_id, chan_id) df = _load_chan(input_path, key, measurements, tz) store.put(str(key), df, format='table') store.flush() print() store.close()
def convert_redd(redd_path, hdf_filename): """ Parameters ---------- redd_path : str The root path of the REDD low_freq dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ assert isdir(redd_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') # Iterate though all houses and channels houses = _find_all_houses(redd_path) for house_id in houses: print("Loading house", house_id, end="... ") stdout.flush() chans = _find_all_chans(redd_path, house_id) for chan_id in chans: print(chan_id, end=" ") stdout.flush() key = Key(building=house_id, meter=chan_id) ac_type = 'apparent' if chan_id <= 2 else 'active' df = _load_chan(redd_path, key, [('power', ac_type)]) store.put(str(key), df, format='table') store.flush() print() store.close() # Add metadata convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename) print("Done converting REDD to HDF5!")
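# A minimal usage sketch for convert_redd(), assuming the REDD low_freq archive
# has been extracted locally (paths are hypothetical placeholders). Mains
# channels 1-2 are stored as apparent power and all submeters as active power,
# which is what the `ac_type` switch above encodes.
convert_redd('/data/redd/low_freq', '/data/redd.h5')

from nilmtk import DataSet
redd = DataSet('/data/redd.h5')
print(redd.buildings[1].elec.mains())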
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """
    check_directory_exists(iawe_path)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 13):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename)
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(columns=[TIMESTAMP_COLUMN_NAME])
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        # convert_objects() was removed from pandas; coerce to numeric instead.
        df = df.apply(pd.to_numeric, errors='coerce')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        store.put(str(key), df)
    store.close()

    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)
    print("Done converting iAWE to HDF5!")
def _convert(input_path, store, measurement_mapping_func, tz, sort_index=True): """ Parameters ---------- input_path : str The root path of the REDD low_freq dataset. store : DataStore The NILMTK DataStore object. measurement_mapping_func : function Must take these parameters: - house_id - chan_id Function should return a list of tuples e.g. [('power', 'active')] tz : str Timezone e.g. 'US/Eastern' sort_index : bool """ check_directory_exists(input_path) # Iterate though all houses and channels houses = _find_all_houses(input_path) for house_id in houses: print("Loading house", house_id, end="... ") stdout.flush() chans = _find_all_chans(input_path, house_id) for chan_id in chans: print(chan_id, end=" ") stdout.flush() key = Key(building=house_id, meter=chan_id) measurements = measurement_mapping_func(house_id, chan_id) csv_filename = _get_csv_filename(input_path, key) df = _load_csv(csv_filename, measurements, tz) if sort_index: df = df.sort_index() # raw REDD data isn't always sorted store.put(str(key), df) print()
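# The `measurement_mapping_func` argument above is only described by its
# contract: (house_id, chan_id) -> list of (physical_quantity, ac_type) tuples.
# A hedged example for a REDD-style layout, where the first two channels of
# every house are mains recorded as apparent power, might look like this:
def example_measurement_mapping(house_id, chan_id):
    ac_type = 'apparent' if chan_id <= 2 else 'active'
    return [('power', ac_type)]

# _convert(input_path, store, example_measurement_mapping, 'US/Eastern')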
timeFiller = np.setdiff1d(np.copy(timeStamps), timeFiltered) wattsFiller = np.zeros(timeFiller.shape) # combine the on and off data timeAll = np.append(timeFiller, timeFiltered) wattsAll = np.append(wattsFiller, wattsFiltered) # format dataframe data structure and save in nilmtk format df = pd.DataFrame({('power', 'apparent'): wattsAll}, dtype=float) df.index = pd.to_datetime(timeAll, format='%Y-%m-%d %H:%M:%S', exact=False, utc=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.tz_convert('US/Eastern') key = Key(building=1, meter=instance + 1) store.put(str(key), df) ## create the metadata files in accordance with nilmtk guidelines # building metatdata if not os.path.exists(pJoin(modelDir, 'train')): os.makedirs(pJoin(modelDir, 'train')) f = open(pJoin(modelDir, 'train', 'building1.yaml'), 'w') f.write('instance: 1\n') f.write('elec_meters:\n') for instance, app in enumerate(np.unique(appliances)): if instance == 0: f.write(' ' + '1: &generic\n') f.write(' ' + 'submeter_of: 0\n') f.write(' ' + 'device_model: generic\n')
def convert_gjw(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix),
        will default if not specified.

    Directory and file structure:
        nilm_gjw_data
            building<1>
                elec
                    4-POWER_REAL_FINE <date> Dump.csv
                    5-POWER_REACTIVE_STANDARD <date> Dump.csv
                    ...
            ...
            building<n>
            HDF5
                nilm_gjw_data.hdf5
            metadata
                building1.yaml
                dataset.yaml
                meter_devices.yaml
            other files
    """
    if gjw_path is None:
        gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')

    # Open data store
    print('opening datastore', output_filename)
    store = get_datastore(output_filename, 'HDF', mode='w')

    # Walk the directory tree from the dataset home directory.
    # Clear dataframe & add column headers
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, _, files in os.walk(gjw_path):  # dirs_in_current_dir is unused
        if current_dir.find('.git') != -1 or current_dir.find('.ipynb') != -1:
            # print('Skipping ', current_dir)
            continue
        print('checking', current_dir)
        m = bld_re.search(current_dir)
        if m:  # the csv files may be further down the tree so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
            key = Key(building=building_nbr, meter=meter_nbr)
        for items in fnmatch.filter(files, "4*.csv"):
            # Process any .CSV files found
            found = True
            ds = iso_date_re.search(items).group()
            # print('found files for date:', ds, end=" ")
            df1 = _read_file_pair(current_dir, ds)  # read two csv files into a dataframe
            df = pd.concat([df, df1])  # concatenate the results into one long dataframe
        if found:
            found = False
            df = _prepare_data_for_toolkit(df)
            _summarise_dataframe(df, 'Prepared for tool kit')
            store.put(str(key), df)
            # Clear dataframe & add column headers
            # df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
            break  # only 1 folder with .csv files at present

    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done converting gjw to HDF5!")
def convert_sortd(input_path, output_filename, format='HDF'): """Converts the dataset to NILMTK HDF5 format. For more information about the SOR test dataset, contact Samuel Marisa. Parameters ---------- input_path : str The root path of the dataset. It is assumed that the YAML metadata is in 'input_path/metadata'. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' Example usage: -------------- convert('/sortd', 'store.h5') """ print( 'Attempting to convert the SORTD dataset at %s into %s in NILMTK %s format...' % (input_path, output_filename, format)) # Ensure that the input directory exists check_directory_exists(input_path) # Load the dataset metadata with open(join(input_path, 'metadata/dataset.yaml'), 'r') as stream: dataset_metadata = yaml.load(stream) # Open the datastore store = get_datastore(output_filename, format, mode='w') # Iterate through all building metadata files found in the dataset for metadata_file in glob.glob( join(input_path, 'metadata/building[0-9]*.yaml')): # Load the building metadata with open(metadata_file, 'r') as stream: metadata = yaml.load(stream) building_id = int(metadata['instance']) print('==> Loading building %d defined at %s. Please wait...' % (building_id, metadata_file)) for meter_id, meter_data in metadata['elec_meters'].items(): meter_id = int(meter_id) key = Key(building=building_id, meter=meter_id) # Load the raw data from the data location print(' - Loading meter %s from %s...' % (meter_id, meter_data['data_location'])) columns = [('power', 'active')] df = pd.read_csv(join(input_path, meter_data['data_location']), sep=',', names=columns, dtype={m: np.float32 for m in columns}) # Convert the timestamp index column to timezone-aware datetime df.index = pd.to_datetime(df.index.values, unit='s', utc=True) df = df.tz_convert(dataset_metadata['timezone']) #df = pd.read_csv(join(input_path, db_file), sep=';', names=('Datetime', 'P1', 'P2', 'P3'), dtype={'P1': np.float64, 'P2': np.float64, 'P3': np.float64}, parse_dates=[1]) print(df.info()) print(df.head()) #print(df.tail()) print(" - Storing data under key %s in the datastore..." % (str(key))) store.put(str(key), df) print(" - Building %s loaded!" % (building_id)) print("Adding the metadata into the store...") save_yaml_to_datastore(join(input_path, 'metadata'), store) print("Closing the store...") store.close() print("Done converting SORTD dataset to HDF5!")
def _dataport_dataframe_to_hdf(dataport_dataframe, store, nilmtk_building_id, dataport_building_id): local_dataframe = dataport_dataframe.copy() # remove timezone information to avoid append errors local_dataframe['localminute'] = pd.DatetimeIndex([i.replace(tzinfo=None) for i in local_dataframe['localminute']]) # set timestamp as frame index local_dataframe = local_dataframe.set_index('localminute') # set timezone local_dataframe = local_dataframe.tz_localize('US/Central') # remove timestamp column from dataframe feeds_dataframe = local_dataframe.drop('dataid', axis=1) # Column names for dataframe column_names = [('power', 'active')] # convert from kW to W feeds_dataframe = feeds_dataframe.mul(1000) # building metadata building_metadata = {} building_metadata['instance'] = nilmtk_building_id building_metadata['original_name'] = int(dataport_building_id) # use python int building_metadata['elec_meters'] = {} building_metadata['appliances'] = [] # initialise dict of instances of each appliance type instance_counter = {} meter_id = 1 for column in feeds_dataframe.columns: if feeds_dataframe[column].notnull().sum() > 0 and not column in feed_ignore: # convert timeseries into dataframe feed_dataframe = pd.DataFrame(feeds_dataframe[column]) # set column names feed_dataframe.columns = pd.MultiIndex.from_tuples(column_names) # Modify the column labels to reflect the power measurements recorded. feed_dataframe.columns.set_names(LEVEL_NAMES, inplace=True) key = Key(building=nilmtk_building_id, meter=meter_id) # store dataframe store.put(str(key), feed_dataframe, format='table', append=True) store.flush() # elec_meter metadata if column == 'use': meter_metadata = {'device_model': 'eGauge', 'site_meter': True} else: meter_metadata = {'device_model': 'eGauge', 'submeter_of': 0} building_metadata['elec_meters'][meter_id] = meter_metadata # appliance metadata if column != 'use': # original name and meter id appliance_metadata = {'original_name': column, 'meters': [meter_id] } # appliance type and room if available appliance_metadata.update(feed_mapping[column]) # appliance instance number if instance_counter.get(appliance_metadata['type']) == None: instance_counter[appliance_metadata['type']] = 0 instance_counter[appliance_metadata['type']] += 1 appliance_metadata['instance'] = instance_counter[appliance_metadata['type']] building_metadata['appliances'].append(appliance_metadata) meter_id += 1 # write building yaml to file building = 'building{:d}'.format(nilmtk_building_id) yaml_full_filename = join(_get_module_directory(), 'metadata', building + '.yaml') with open(yaml_full_filename, 'w') as outfile: outfile.write(yaml.dump(building_metadata)) return 0
def _dataport_dataframe_to_hdf(dataport_dataframe, store, nilmtk_building_id, dataport_building_id): local_dataframe = dataport_dataframe.copy() # WK: using local time zone seems problematic with NotExistenceTimeError and AmbiguousError, let's stick to UTC # WK: After thorough examination, I GUESS that dataport will return UTC time if possible. When there is ambiguous # dst time, the returned data would be tz=None. In this case, we should manually convert the 'localminute' # column back to UTC time by handling the ambiguity. if pd.DatetimeIndex(local_dataframe['localminute']).tzinfo != pytz.UTC: print('NOT UTC encountered, localminute dtype is: %s' % local_dataframe['localminute'].dtype) local_dataframe['localminute'] = pd.DatetimeIndex( local_dataframe['localminute']).tz_localize('UTC') # WK: the following line is commented by WK # remove timezone information to avoid append errors # local_dataframe['localminute'] = pd.DatetimeIndex([i.replace(tzinfo=None) # for i in local_dataframe['localminute']]) # set timestamp as frame index local_dataframe = local_dataframe.set_index('localminute') # WK: the following line is commented by WK # set timezone # local_dataframe = local_dataframe.tz_localize('US/Central', ambiguous='infer') # remove timestamp column from dataframe feeds_dataframe = local_dataframe.drop('dataid', axis=1) # Column names for dataframe column_names = [('power', 'active')] # convert from kW to W feeds_dataframe = feeds_dataframe.mul(1000) # building metadata building_metadata = {} building_metadata['instance'] = nilmtk_building_id building_metadata['original_name'] = int( dataport_building_id) # use python int building_metadata['elec_meters'] = {} building_metadata['appliances'] = [] # initialise dict of instances of each appliance type instance_counter = {} meter_id = 1 for column in feeds_dataframe.columns: if feeds_dataframe[column].notnull().sum( ) > 0 and not column in feed_ignore: # convert timeseries into dataframe feed_dataframe = pd.DataFrame(feeds_dataframe[column]) # set column names feed_dataframe.columns = pd.MultiIndex.from_tuples(column_names) # Modify the column labels to reflect the power measurements recorded. feed_dataframe.columns.set_names(LEVEL_NAMES, inplace=True) key = Key(building=nilmtk_building_id, meter=meter_id) if feed_dataframe.index.tzinfo != pytz.UTC: print('NOT UTC timezone!!! double check!') # store dataframe store.put(str(key), feed_dataframe, format='table', append=True) # WK: before store, should check tzinfo!! 
store.flush() # elec_meter metadata if column == 'use': meter_metadata = {'device_model': 'eGauge', 'site_meter': True} else: meter_metadata = {'device_model': 'eGauge', 'submeter_of': 0} building_metadata['elec_meters'][meter_id] = meter_metadata # appliance metadata if column != 'use': # original name and meter id appliance_metadata = { 'original_name': column, 'meters': [meter_id] } # appliance type and room if available appliance_metadata.update(feed_mapping[column]) # appliance instance number if instance_counter.get(appliance_metadata['type']) == None: instance_counter[appliance_metadata['type']] = 0 instance_counter[appliance_metadata['type']] += 1 appliance_metadata['instance'] = instance_counter[ appliance_metadata['type']] building_metadata['appliances'].append(appliance_metadata) meter_id += 1 # write building yaml to file building = 'building{:d}'.format(nilmtk_building_id) yaml_full_filename = join(_get_module_directory(), 'metadata', building + '.yaml') with open(yaml_full_filename, 'w') as outfile: outfile.write(yaml.dump(building_metadata)) return 0
def convert_greend(greend_path, hdf_filename):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    houses = sorted(__get_houses(greend_path))
    print(houses)
    h = 1  # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1

    for house in houses:
        print('loading ' + house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        house_data = []
        for date in dates:
            print('-----------------------', date)
            try:
                tmp_pandas = pd.read_csv(join(abs_house, date), na_values=['na'],
                                         error_bad_lines=False)
            except (pd.errors.ParserError, ValueError):
                # A ParserError/ValueError is returned for malformed files
                # (irregular column number); skip the file rather than silently
                # re-using the previous iteration's dataframe.
                continue

            # for building0 either remove the first days (with less nodes) or use __preprocess_file
            # import StringIO as sio
            # tmp_pandas = pd.DataFrame.from_csv(sio.StringIO(__preprocess_file(abs_house, date)))

            # If the timestamp is not correctly parsed then it's an object dtype (string), else a float64.
            if tmp_pandas.timestamp.dtype != np.float64:
                tmp_pandas = tmp_pandas[tmp_pandas.timestamp != 'timestamp']  # remove all error rows

            # Use the cleaned column as the index.
            tmp_pandas.index = tmp_pandas["timestamp"].apply(pd.to_numeric,
                                                             errors='ignore').values
            tmp_pandas = tmp_pandas.drop(columns=['timestamp'])  # remove timestamp from the columns (it's the index already)
            tmp_pandas = tmp_pandas.astype("float32")  # convert everything back to float32

            # Convert the index to datetime.
            tmp_pandas.index = pd.to_datetime(tmp_pandas.index, unit='s')
            tmp_pandas = tmp_pandas.tz_localize("UTC").tz_convert("CET")
            tmp_pandas = tmp_pandas.drop_duplicates()
            # tmp_pandas = tmp_pandas.sort_index()

            house_data.append(tmp_pandas)

        overall_df = pd.concat(house_data)
        overall_df = overall_df.drop_duplicates()
        overall_df = overall_df.sort_index()

        m = 1
        for column in overall_df.columns:
            print("meter" + str(m) + ': ' + column)
            key = Key(building=h, meter=m)
            print("Putting into store...")
            store.put(str(key), overall_df[column], format='table')
            m += 1

        print('Flushing store...')
        store.flush()
        h += 1

    store.close()

    # Retrieve the dataset metadata from the metadata subfolder.
    import inspect
    convert_yaml_to_hdf5(dirname(inspect.getfile(convert_greend)) + '/metadata/',
                         hdf_filename)
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print(directory_list) found_any_sm = False found_any_plug = False # Traversing every folder for folder in directory_list: if folder[0] == '.' or folder[-3:] == '.h5': print('Skipping ', folder) continue #Building number and meter_flag building_no = int(folder[:2]) meter_flag = None if 'sm_csv' in folder: meter_flag = 'sm' elif 'plugs' in folder: meter_flag = 'plugs' else: print('Skipping folder', folder) continue print('Computing for folder', folder) dir_list = [ i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc, folder, i)) ] dir_list.sort() if meter_flag == 'plugs' and len(dir_list) < 3: # Try harder to find the subfolders folder = join(folder, folder[:2]) dir_list = [ i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc, folder, i)) ] print('Current dir list:', dir_list) for fl in dir_list: print('Computing for folder ', fl) fl_dir_list = [ i for i in listdir(join(dataset_loc, folder, fl)) if '.csv' in i ] fl_dir_list.sort() if meter_flag == 'sm': for fi in fl_dir_list: found_any_sm = True df = pd.read_csv(join(dataset_loc, folder, fl, fi), names=[i for i in range(1, 17)], dtype=np.float32) # SmartMeter for phase in range(1, 4): key = str(Key(building=building_no, meter=phase)) df_phase = df.loc[:, [ 1 + phase, 5 + phase, 8 + phase, 13 + phase ]] # get reactive power power = df_phase.loc[:, (1 + phase, 13 + phase)].values reactive = power[:, 0] * np.tan( power[:, 1] * np.pi / 180) df_phase['Q'] = reactive df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df_phase = df_phase.tz_convert(timezone) sm_column_name = { 1 + phase: ('power', 'active'), 5 + phase: ('current', ''), 8 + phase: ('voltage', ''), 13 + phase: ('phase_angle', ''), 'Q': ('power', 'reactive'), } df_phase.columns = pd.MultiIndex.from_tuples( sm_column_name[col] for col in df_phase.columns) power_active = df_phase['power', 'active'] tmp_before = np.size(power_active) df_phase = df_phase[power_active != -1] power_active = df_phase['power', 'active'] tmp_after = np.size(power_active) if tmp_before != tmp_after: print( 'Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) df_phase.columns.set_names(LEVEL_NAMES, inplace=True) if not key in store: store.put(key, df_phase, format='Table') else: store.append(key, df_phase, format='Table') store.flush() print('Building', building_no, ', Meter no.', phase, '=> Done for ', fi[:-4]) # Plugs werden auch in Meter uebersetzt und dann aber direkt mit Appliances ergaenzt else: #Meter number to be used in key meter_num = int(fl) + 3 key = str(Key(building=building_no, meter=meter_num)) current_folder = join(dataset_loc, folder, fl) if not fl_dir_list: raise RuntimeError("No CSV file found in " + current_folder) #Getting dataframe for each csv file seperately for fi in fl_dir_list: found_any_plug = True df = pd.read_csv(join(current_folder, fi), names=[1], dtype=np.float64) df.index = 
pd.DatetimeIndex(start=fi[:-4].replace( '.', ':'), freq='s', periods=86400, tz='GMT') df.columns = pd.MultiIndex.from_tuples( plugs_column_name.values()) df = df.tz_convert(timezone) df.columns.set_names(LEVEL_NAMES, inplace=True) # Check whether measurements removed tmp_before = np.size(df.power.active) df = df[df.power.active != -1] tmp_after = np.size(df.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(key, df, format='Table') print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4]) else: store.append(key, df, format='Table') store.flush() print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4]) if not found_any_plug or not found_any_sm: raise RuntimeError( 'The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)' ) print("Data storage completed.") store.close() # Adding the metadata to the HDF5file print("Proceeding to Metadata conversion...") meta_path = join(get_module_directory(), 'dataset_converters', 'eco', 'metadata') convert_yaml_to_hdf5(meta_path, hdf_filename) print("Completed Metadata conversion.")