def get_csv( path="/Users/rishi/Documents/Master_folder/IIITD/6th_semester/BTP/NIOMTK", output_filename="occupancy", format='HDF'): csv_files = [i for i in listdir(path) if '_csv' in i] csv_list = [] for i in csv_files: directory = join(path, i) file_list = [j for j in listdir(directory) if ".csv" in j] for j in file_list: csv_list.append(join(directory, j)) print(csv_list) print(len(csv_list)) store = get_datastore(output_filename, format, mode='w') for i in range(len(csv_list)): dataframe = pd.read_csv(csv_list[i]) out = [] print "The current file is: ", csv_list[i] df_new = [] for j in range(len(dataframe)): out.append(dataframe.ix[j].values[1:]) out_1d = list(chain.from_iterable(out)) index = pd.DatetimeIndex(start=dataframe.values[0][0], periods=len(out_1d), freq="1s") df = pd.DataFrame(out_1d, index) # key = Key(building=1, meter=(i + 1)) key = "/building" + str(i) + "/elec/meter" + str(i + 1) if "summer" in csv_list[i]: key = join(str(key), "summer") else: key = join(str(key), "winter") store.put(key, df)
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') for building_name, building_mapping in overall_dataset_mapping.items(): for load_name, load_mapping in building_mapping.items(): for load_mapping_path, meter_number in load_mapping.items(): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" % attribute) print(filename_attribute) dfs.append(pd.read_csv(filename_attribute, parse_dates=True, index_col=0, header=0, names=[attribute])) total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.rename(columns=lambda x: column_mapping[x], inplace=True) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def convert_lab(lab_path, output_filename, format='HDF'): """ Parameters ---------- lab_path : str The root path of the LAB dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ # Originally this was chan_id <= 2 with the first ac_type set to 'apparent'; every channel is now treated as active power. def _lab_measurement_mapping_func(house_id, chan_id): ac_type = 'active' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(lab_path, store, _lab_measurement_mapping_func, 'America/Fortaleza') # Add metadata save_yaml_to_datastore( join(get_module_directory(), 'dataset_converters', 'lab', 'metadata'), store) store.close() print("Done converting LAB to HDF5!")
def convert_redd(redd_path, output_filename, format='HDF'): """ Parameters ---------- redd_path : str The root path of the III dataset (stored in the REDD low_freq layout). output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _redd_measurement_mapping_func(house_id, chan_id): return [('power', 'active')] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(redd_path, store, _redd_measurement_mapping_func, 'Asia/Taipei') s = join(redd_path, 'metadata') # Add metadata save_yaml_to_datastore(s, store) store.close() print("Done converting III to HDF5!")
def convert_redd(redd_path, output_filename, format="HDF"): """ Parameters ---------- redd_path : str The root path of the REDD low_freq dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _redd_measurement_mapping_func(house_id, chan_id): ac_type = "apparent" if chan_id <= 2 else "active" return [("power", ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode="w") # Convert raw data to DataStore _convert(redd_path, store, _redd_measurement_mapping_func, "US/Eastern") # Add metadata save_yaml_to_datastore(join(get_module_directory(), "dataset_converters", "redd", "metadata"), store) store.close() print("Done converting REDD to HDF5!")
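# A minimal usage sketch for the REDD converter above (paths are hypothetical):
# convert the extracted low_freq archive, then inspect the result with nilmtk's
# DataSet class.
from nilmtk import DataSet

convert_redd('/data/redd/low_freq', '/data/redd.h5')
redd = DataSet('/data/redd.h5')
print(redd.buildings[1].elec)  # list the meters found for building 1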
def convert_hipe(hipe_path, output_filename, format="HDF"): """Convert the HIPE data set to the NILMTK-format. This method works with the 1 week and the 3 month data. Parameters ---------- hipe_path : str The root path of the HIPE dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either "HDF" or "CSV". Defaults to "HDF". """ datastore = get_datastore(output_filename, format, mode="w") _convert(hipe_path, datastore, _hipe_measurement_mapping_func, "Europe/Berlin") metadata_path = "metadata" save_yaml_to_datastore(metadata_path, datastore) datastore.close() print("Done converting HIPE!")
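# _hipe_measurement_mapping_func is referenced above but not defined in this
# snippet. A minimal sketch of what such a mapping function could look like;
# treating every channel as active power is an assumption, not necessarily what
# the HIPE converter actually does.
def _hipe_measurement_mapping_func(house_id, chan_id):
    return [("power", "active")]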
def convert_enertalk(input_path, output_filename, format='HDF', tz='Asia/Seoul'): """ Parameters ---------- input_path : str The root path of the ENERTALK dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' tz : str Timezone e.g. 'Asia/Seoul' """ # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(input_path, store, tz=tz) # Add metadata save_yaml_to_datastore('metadata/', store) store.close()
def convert_refit(input_path, output_filename, format='HDF'): """ Parameters ---------- input_path : str The root path of the CSV files, e.g. House1.csv output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(input_path, store, 'Europe/London') # Add metadata save_yaml_to_datastore( join(get_module_directory(), 'dataset_converters', 'refit', 'metadata'), store) store.close() print("Done converting REFIT to HDF5!")
def convert_alva(alva_path, output_filename, format='HDF'): """ Parameters ---------- alva_path : str The root path of the alva low_freq dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _alva_measurement_mapping_func(house_id, chan_id): ac_type = 'apparent' if chan_id <= 2 else 'active' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(alva_path, store, _alva_measurement_mapping_func, 'US/Eastern') # Add metadata save_yaml_to_datastore(join(get_module_directory(), 'dataset_converters', 'alva', 'metadata'), store) store.close() print("Done converting alva to HDF5!")
def convert_deddiag(connection, output_filename, format='HDF', start_date=DEFAULT_START_DATE, end_date=DEFAULT_END_DATE, tz=DEFAULT_TZ): """ Parameters ---------- connection: Connection Connection to the DEDDIAG database Example: connection = Connection(host="localhost", port="5432", db_name="postgres", user="******", password="******") output_filename : str The destination filename including path and suffix Example: ./data/deddiag.h5 format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ # Open DataStore # todo try catch dest_file = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(connection, dest_file, start_date, end_date, tz) path_to_metadata = join(get_module_directory(), 'dataset_converters', 'deddiag', 'metadata') # Add metadata save_yaml_to_datastore(path_to_metadata, dest_file) print("Done converting DEDDIAG to HDF5!")
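# Example call following the docstring above. Host, port, database name and
# credentials are placeholders, and Connection is assumed to be provided by the
# DEDDIAG loader package this converter builds on.
connection = Connection(host="localhost", port="5432", db_name="postgres",
                        user="<user>", password="<password>")
convert_deddiag(connection, "./data/deddiag.h5")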
def convert_lab(lab_path, output_filename, format='HDF'): """ Parameters ---------- lab_path : str The root path of the LAB dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ # Originally this was chan_id <= 2 with the first ac_type set to 'apparent'; every channel is now treated as active power. def _lab_measurement_mapping_func(house_id, chan_id): ac_type = 'active' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(lab_path, store, _lab_measurement_mapping_func, 'America/Fortaleza') # Add metadata save_yaml_to_datastore(join(get_module_directory(), 'dataset_converters', 'lab', 'metadata'), store) store.close() print("Done converting LAB to HDF5!")
def convert_ampds(input_path, output_filename, format='HDF'): """ Convert AMPds R2013 as seen on Dataverse. Download the files as CSVs and put them in the `input_path` folder for conversion. Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to HDF5 Example usage: -------------- convert('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [ f for f in listdir(input_path) if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f ] # Sorting Lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode='w') for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...') df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(_get_module_directory(), 'metadata') print('Processing metadata...') convert_yaml_to_hdf5(metadata_path, output_filename)
def convert_redd(redd_path, output_filename, format='HDF'): """ Parameters ---------- redd_path : str The root path of the REDD low_freq dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _redd_measurement_mapping_func(house_id, chan_id): ac_type = 'apparent' if chan_id <= 2 else 'active' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(redd_path, store, _redd_measurement_mapping_func, 'US/Eastern') s = join(get_module_directory(), 'dataset_converters', 'redd', 'metadata') # Add metadata save_yaml_to_datastore( join(get_module_directory(), 'dataset_converters', 'redd', 'metadata'), store) store.close() print("Done converting REDD to HDF5!")
def convert_refit(input_path, output_filename, format='HDF'): """ Parameters ---------- input_path : str The root path of the CSV files, e.g. House1.csv output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(input_path, store, 'Europe/London') # Add metadata save_yaml_to_datastore(join(get_module_directory(), 'dataset_converters', 'refit', 'metadata'), store) store.close() print("Done converting REFIT to HDF5!")
def convert_ideal(ideal_path, output_filename, format='HDF'): """ Convert the IDEAL dataset to NILMTK HDF5 format. From https://datashare.ed.ac.uk/handle/10283/3647 download the zips below: - household_sensors.zip (14.77Gb). - room_and_appliance_sensors.zip (9.317Gb). Both zips contain a folder called "sensorsdata". Create a new folder, e.g. called "ideal_dataset", and into it - Extract the folder "household_sensors.zip/sensordata" with the name household_sensordata - Extract the folder "room_and_appliance_sensors/sensordata" with the name rooms_appliance_sensordata Then run the function convert_ideal with ideal_path="ideal_dataset". Parameters ---------- ideal_path : str The root path of the ideal low_freq dataset. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _ideal_measurement_mapping_func(house_id, chan_id, category_id): if (category_id == "electric-appliance"): ac_type = 'active' return [('power', ac_type)] else: ac_type = 'apparent' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # household_sensordata contains the mains readings # rooms_appliance_sensordata contains the appliance readings folders = [] for root, dirs, files in os.walk(ideal_path): for folder in dirs: if (folder == "household_sensordata" or folder == "rooms_appliance_sensordata"): folders.append(folder) # valid_home_id holds the home ids which contain both mains and appliance readings valid_home_id = mains_plus_appliance_home_id(ideal_path, folders) for folder in folders: input_path = join(ideal_path, folder) # Convert raw data to DataStore _convert(input_path, store, _ideal_measurement_mapping_func, 'Europe/London', valid_home_id) metadata_path = join(get_module_directory(), 'dataset_converters', 'ideal', 'metadata') # Add metadata save_yaml_to_datastore(metadata_path, store) store.close() print("Done converting ideal to HDF5!")
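# Example call following the docstring above: after extracting both folders
# into a directory called "ideal_dataset", convert it with (the output path is
# hypothetical):
convert_ideal('ideal_dataset', 'ideal.h5')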
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') any_file_converted = False for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) if not os.path.isfile(filename_attribute): # File not found directly in the combed_path provided # Try adding 'iiitd' to it filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns]) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) any_file_converted = True if not any_file_converted: raise RuntimeError('No files converted, did you specify the correct path?') convert_yaml_to_hdf5( join(get_module_directory(), 'dataset_converters', 'combed', 'metadata'), output_filename ) print("Done converting COMBED to HDF5!")
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') any_file_converted = False for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) if not os.path.isfile(filename_attribute): # File not found directly in the combed_path provided # Try adding 'iiitd' to it filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns]) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) any_file_converted = True if not any_file_converted: raise RuntimeError('No files converted, did you specify the correct path?') convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def convert_ampds(input_path, output_filename, format='HDF'): """ Convert AMPds R2013 as seen on Dataverse. Download the files as CSVs and put them in the `input_path` folder for conversion. Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to HDF5 Example usage: -------------- convert('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f] # Sorting Lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode='w') for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...') df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(get_module_directory(), 'dataset_converters', 'ampds', 'metadata') print('Processing metadata...') convert_yaml_to_hdf5(metadata_path, output_filename)
def convert_ukdale(ukdale_path, output_filename, format='HDF', drop_duplicates=True): """Converts the UK-DALE dataset to NILMTK HDF5 format. For more information about the UK-DALE dataset, and to download it, please see http://www.doc.ic.ac.uk/~dk3810/data/ Parameters ---------- ukdale_path : str The root path of the UK-DALE dataset. It is assumed that the YAML metadata is in 'ukdale_path/metadata'. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' drop_duplicates : bool Remove entries with duplicated timestamp (keeps the first value) Defaults to True. """ ac_type_map = _get_ac_type_map(ukdale_path) def _ukdale_measurement_mapping_func(house_id, chan_id): ac_type = ac_type_map[(house_id, chan_id)][0] return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert 6-second data _convert(ukdale_path, store, _ukdale_measurement_mapping_func, TZ, sort_index=False, drop_duplicates=drop_duplicates) store.close() # Add metadata if format == 'HDF': convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), output_filename) # Convert 1-second data store.open(mode='a') _convert_one_sec_data(ukdale_path, store, ac_type_map, drop_duplicates) store.close() print("Done converting UK-DALE to HDF5!")
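# A minimal usage sketch (paths are hypothetical). The second call shows the
# drop_duplicates flag documented above being switched off:
convert_ukdale('/data/ukdale', '/data/ukdale.h5')
convert_ukdale('/data/ukdale', '/data/ukdale_with_dups.h5', drop_duplicates=False)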
def convert_ampds(input_path, output_filename, format="HDF"): """ Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to HDF5 Example usage: -------------- convert('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and ".csv" in f and ".swp" not in f] # Sorting Lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode="w") for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print("Loading file #", (i + 1), " : ", csv_file, ". Please wait...") df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit="s", utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors="ignore") df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(_get_module_directory(), "metadata") print("Processing metadata...") convert_yaml_to_hdf5(metadata_path, output_filename)
def convert_ps(ps_path, output_path, out_format="HDF"): # open datastore store = get_datastore(output_path, out_format, mode="w") # TODO: check 'US/Central' data_path = join(ps_path, "data") _convert_to_datastore(data_path, store, 'US/Central') # add metadata meta_path = join(ps_path, "meta") save_yaml_to_datastore(meta_path, store) store.close() print ("Done converting Pecan Street to HDF5")
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" % attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, header=0, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.rename(columns=lambda x: column_mapping[x], inplace=True) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) idx = pd.date_range(start=START_DATETIME, end=END_DATETIME, freq=FREQ) idx = idx.tz_localize('GMT').tz_convert(TIMEZONE) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains and appliance data for chan in range(1, 12): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename, dtype=np.float64, na_values='\\N') df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.columns = pd.MultiIndex.from_tuples( [column_mapping[x] for x in df.columns], names=LEVEL_NAMES) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() df = reindex_fill_na(df, idx) assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata') convert_yaml_to_hdf5(metadata_dir, output_filename) print("Done converting iAWE to HDF5!")
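# reindex_fill_na is used above but not defined in this snippet. A plausible
# minimal sketch: reindex onto the fixed 1-minute index, fill gaps in power
# columns with 0 and gaps in other measurements with the column median. The
# exact fill strategy is an assumption about the helper, not a given.
def reindex_fill_na(df, idx):
    df = df.reindex(idx)
    for col in df.columns:
        # col is a (physical_quantity, type) tuple on the MultiIndex columns
        fill_value = 0 if col[0] == 'power' else df[col].median()
        df[col] = df[col].fillna(fill_value)
    return df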
def main(): parser = argparse.ArgumentParser() parser.add_argument('inpath', help='input directory (ANTgen output)', nargs='?', default='../output') parser.add_argument('outfile', help='output file (HDF5 file)', nargs='?', default='../output/ANTgen.h5') args = parser.parse_args() if not os.path.exists('metadata') or not os.path.isfile('metadata/building1.yaml'): print("No metadata found. Please run 'generate_metadata.py' before using this tool...") exit(1) print("Converting ANTgen output from '{}' to file '{}'".format(args.inpath, args.outfile)) with open('metadata/building1.yaml', 'r') as f: yaml_dict = yaml.load(f, Loader=yaml.FullLoader) channel_list = ['total'] # pre-populate with aggregate data (total.csv) for app in yaml_dict['appliances']: channel_list.append(app['original_name']) store = get_datastore(args.outfile, 'HDF', mode='w') for i, app_name in enumerate(channel_list): print("Adding virtual meter ID {:02d}: {}".format(1+i, app_name)) key = Key(building=1, meter=(i + 1)) csvfile = os.path.join(args.inpath, str(app_name)+'.csv') try: df = pd.read_csv(csvfile, sep=';', encoding='utf-8', index_col=0) df.columns = pd.MultiIndex.from_tuples([('power', 'active') for x in df.columns], names=LEVEL_NAMES) df.index = pd.to_datetime(df.index) tz_naive = df.index tz_aware = tz_naive.tz_localize(tz='Europe/Vienna', ambiguous=True, nonexistent=pd.Timedelta('1H')) df.index = tz_aware df = df.tz_convert('Europe/Vienna') store.put(str(key), df) except FileNotFoundError: print("Input file '{}' not found - your HDF5 file will be incomplete!".format(csvfile)) continue print('Adding metadata...') convert_yaml_to_hdf5('metadata/', args.outfile)
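# Usage sketch: assuming this converter is saved as convert_antgen.py next to
# the 'metadata' folder produced by generate_metadata.py (the filename is a
# guess), it would be invoked as:
#
#   python convert_antgen.py ../output ../output/ANTgen.h5
#
# Both arguments are optional thanks to the argparse defaults above.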
def convert_caxe(file_path): ''' Parameters ------------ file_path : str Path of the input CSV file to be converted. The CSV columns should contain the following values, in order: timestamp, reactive_power, apparent_power, current, frequency, voltage, active_power. Converts the data into HDF5 format and saves it as test.h5. ''' df = pd.read_csv(file_path, names=['timestamp', 'R', 'A', 'C', 'F', 'V', 'T']) column_mapping = { 'F': ('frequency', ""), 'V': ('voltage', ""), 'T': ('power', 'active'), 'C': ('current', ''), 'R': ('power', 'reactive'), 'A': ('power', 'apparent'), } output_filename = 'test.h5' # Open data store store = get_datastore(output_filename, format='HDF', mode='w') key = Key(building=1, meter=1) print('Loading ', 1) df.index = pd.to_datetime(df.timestamp.values) df = df.tz_convert( TIMEZONE) # if error occurs use tz_localize for tz naive timestamps df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.index = pd.to_datetime(df.index.values) df.columns = pd.MultiIndex.from_tuples( [column_mapping[x] for x in df.columns], names=LEVEL_NAMES) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() convert_yaml_to_hdf5('./metadata', output_filename) print("Done converting test data to HDF5!")
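# A minimal usage sketch ('readings.csv' is a hypothetical input file laid out
# as described in the docstring); the converted data is written to test.h5:
convert_caxe('readings.csv')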
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) idx = pd.date_range(start=START_DATETIME, end=END_DATETIME, freq=FREQ) idx = idx.tz_localize('GMT').tz_convert(TIMEZONE) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains and appliance data for chan in range(1, 12): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename, dtype=np.float64, na_values='\\N') df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() df = reindex_fill_na(df, idx) assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata') convert_yaml_to_hdf5(metadata_dir, output_filename) print("Done converting iAWE to HDF5!")
def convert_ukdale(ukdale_path, output_filename, format='HDF'): """Converts the UK-DALE dataset to NILMTK HDF5 format. For more information about the UK-DALE dataset, and to download it, please see http://www.doc.ic.ac.uk/~dk3810/data/ Parameters ---------- ukdale_path : str The root path of the UK-DALE dataset. It is assumed that the YAML metadata is in 'ukdale_path/metadata'. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ ac_type_map = _get_ac_type_map(ukdale_path) def _ukdale_measurement_mapping_func(house_id, chan_id): ac_type = ac_type_map[(house_id, chan_id)][0] return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert 6-second data _convert(ukdale_path, store, _ukdale_measurement_mapping_func, TZ, sort_index=False) store.close() # Add metadata if format == 'HDF': convert_yaml_to_hdf5(join(ukdale_path, 'metadata'), output_filename) # Convert 1-second data store.open(mode='a') _convert_one_sec_data(ukdale_path, store, ac_type_map) store.close() print("Done converting UK-DALE to HDF5!")
def convert_unifei(redd_path, output_filename, format='HDF'): """ Parameters ---------- redd_path : str The root path of the UNIFEI dataset (stored in the REDD low_freq layout). output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' """ def _redd_measurement_mapping_func(house_id, chan_id): ac_type = 'active' return [('power', ac_type)] # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(redd_path, store, _redd_measurement_mapping_func, 'America/Sao_Paulo') print("Done converting raw data...") # The path to the metadata directory must be set here print(get_module_directory()) s = join(get_module_directory(), 'dataset_converters', 'unifei', 'metadata') print(s) # Add metadata (the correct metadata path must also be given here) save_yaml_to_datastore(join(get_module_directory(), 'dataset_converters', 'unifei', 'metadata'), store) store.close() print("Done converting UNIFEI to HDF5!")
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains and appliance data for chan in range(1, 13): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename) df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() store.put(str(key), df) store.close() convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting iAWE to HDF5!")
def convert_gjw(gjw_path, output_filename): """ Parameters ---------- gjw_path : str The root path of the gjw dataset. output_filename : str The destination filename (including path and suffix), will default if not specified directory and file structure nilm_gjw_data building<1> elec 4-POWER_REAL_FINE <date> Dump.csv 5-POWER_REACTIVE_STANDARD <date> Dump.csv ... ... building<n> HDF5 nilm_gjw_data.hdf5 metadata building1.yaml dataset.yaml meter_devices.yaml other files """ if gjw_path is None: gjw_path = home_dir check_directory_exists(gjw_path) os.chdir(gjw_path) gjw_path = os.getcwd() # sort out potential issue with slashes or backslashes if output_filename is None: output_filename =join(home_dir,'HDF5','nilm_gjw_data.hdf5') # Open data store print( 'opening datastore', output_filename) store = get_datastore(output_filename, format, mode='w') # walk the directory tree from the dataset home directory #clear dataframe & add column headers df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME]) found = False for current_dir, _, files in os.walk(gjw_path): #unused second parameter of for dirs_in_current_dir if current_dir.find('.git')!=-1 or current_dir.find('.ipynb') != -1: #print( 'Skipping ', current_dir) continue print( 'checking', current_dir) m = bld_re.search(current_dir) if m: #The csv files may be further down the tree so this section may be repeated building_name = m.group() building_nbr = int(bld_nbr_re.search(building_name).group()) meter_nbr = 1 key = Key(building=building_nbr, meter=meter_nbr) for items in fnmatch.filter(files, "4*.csv"): # process any .CSV files found found = True ds = iso_date_re.search(items).group() # print( 'found files for date:', ds,end=" ") # found files to process df1 = _read_file_pair(current_dir,ds) # read two csv files into a dataframe df = pd.concat([df,df1]) # concatenate the results into one long dataframe if found: found = False df = _prepare_data_for_toolkit(df) _summarise_dataframe(df,'Prepared for tool kit') store.put(str(key), df) #clear dataframe & add column headers #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME]) break # only 1 folder with .csv files at present store.close() convert_yaml_to_hdf5(join(gjw_path, 'metadata'),output_filename) print("Done converting gjw to HDF5!")
('power', 'reactive') ]) meter.set_index(('physical_quantity', 'type'), inplace=True, drop=True) meter.columns.set_names(('physical_quantity', 'type'), inplace=True) meter = meter.apply(pd.to_numeric, errors='ignore') meter = meter.dropna() meter = meter.astype(float) meter = meter.sort_index() # meter = meter.resample("1S") # meter = reindex_fill_na(meter, idx) assert meter.isnull().sum().sum() == 0 return meter pqenergy = get_datastore('pqenergy.h5', 'HDF', mode='w') pqenergy.put('/building1/elec/meter1', convert_pq('aggr_p', 'aggr_q')) pqenergy.put('/building1/elec/meter2', convert_pq('aircon_p', 'aircon_q')) pqenergy.put('/building1/elec/meter3', convert_pq('hdryer_p', 'hdryer_q')) pqenergy.put('/building1/elec/meter4', convert_pq('wboiler_p', 'wboiler_q')) pqenergy.put('/building1/elec/meter5', convert_pq('ecooker_p', 'ecooker_q')) pqenergy.put('/building1/elec/meter6', convert_pq('dehumid_p', 'dehumid_q')) pqenergy.put('/building1/elec/meter7', convert_pq('fridge_p', 'fridge_q')) pqenergy.put('/building1/elec/meter8', convert_pq('aheater_p', 'aheater_q')) pqenergy.put('/building1/elec/meter9', convert_pq('ciron_p', 'ciron_q')) pqenergy.put('/building1/elec/meter10', convert_pq('rcooker_p', 'rcooker_q')) pqenergy.put('/building1/elec/meter11', convert_pq('tv_p', 'tv_q')) pqenergy.put('/building1/elec/meter12', convert_pq('vhood_p', 'vhood_q')) pqenergy.put('/building1/elec/meter13', convert_pq('washer_p', 'washer_q')) save_yaml_to_datastore('metadata_pq/', pqenergy) pqenergy.close()
if isMedal: START_DATE = np.datetime64( '2017-05-12T11:08:28') - np.timedelta64('2', 'h') else: START_DATE = np.datetime64( '2017-05-12T11:08:46') - np.timedelta64('2', 'h') time_indices = [START_DATE] for i in range(1, num_rows): time_indices.append(time_indices[i-1] + np.timedelta64('1', 's')) return time_indices if not os.path.exists('../data/'): os.makedirs('../data/') store = get_datastore("../data/converted_sum.hdf5", 'HDF', mode='w') """ Gets CLEAR and MEDAL data and puts them into the store with the right key and instance numbers. """ frames = get_clear_data() for phase in range(1, 4): key = Key(building=1, meter=phase) print('Adding phase {}'.format(phase)) store.put(str(key), frames[phase-1]) for medal_id in range(1, 16): frames = get_summary_data(medal_id) for i in range(1, 7):
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None): metadata = { 'name': 'HES', 'geographic_coordinates': (51.464462, -0.076544), # London 'timezone': 'Europe/London' } # Open DataStore store = get_datastore(output_filename, format, mode='w') # load list of appliances hes_to_nilmtk_appliance_lookup = pd.read_csv( join(get_module_directory(), 'dataset_converters', 'hes', 'hes_to_nilmtk_appliance_lookup.csv')) # load list of houses hes_house_ids = load_list_of_house_ids(data_dir) nilmtk_house_ids = np.arange(1, len(hes_house_ids) + 1) hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids)) # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code) house_codes = [] # map house_appliance_codes = dict() # Create a temporary metadata dir original_metadata_dir = join(get_module_directory(), 'dataset_converters', 'hes', 'metadata') tmp_dir = tempfile.mkdtemp() metadata_dir = join(tmp_dir, 'metadata') shutil.copytree(original_metadata_dir, metadata_dir) print("Using temporary dir for metadata:", metadata_dir) # Iterate over files for filename in FILENAMES: # Load appliance energy data chunk-by-chunk full_filename = join(data_dir, filename) print('Loading', full_filename) try: reader = pd.read_csv(full_filename, names=COL_NAMES, index_col=False, chunksize=CHUNKSIZE) except IOError as e: print(e, file=stderr) continue # Iterate over chunks in file chunk_i = 0 for chunk in reader: if max_chunks is not None and chunk_i >= max_chunks: break print(' processing chunk', chunk_i, 'of', filename) # Convert date and time columns to np.datetime64 objects dt = chunk['date'] + ' ' + chunk['time'] del chunk['date'] del chunk['time'] chunk['datetime'] = pd.to_datetime(dt, format='%Y-%m-%d %H:%M:%S', utc=True) # Data is either tenths of a Wh or tenths of a degree chunk['data'] *= 10 chunk['data'] = chunk['data'].astype(np.float32) # Iterate over houses in chunk for hes_house_id, hes_house_id_df in chunk.groupby('house id'): if hes_house_id not in house_codes: house_codes.append(hes_house_id) if hes_house_id not in house_appliance_codes.keys(): house_appliance_codes[hes_house_id] = [] nilmtk_house_id = house_codes.index(hes_house_id) + 1 # Iterate over appliances in house for appliance_code, appliance_df in chunk.groupby( 'appliance code'): if appliance_code not in house_appliance_codes[ hes_house_id]: house_appliance_codes[hes_house_id].append( appliance_code) nilmtk_meter_id = house_appliance_codes[ hes_house_id].index(appliance_code) + 1 _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id, hes_house_id_df, store, appliance_code) chunk_i += 1 print('houses with some data loaded:', house_appliance_codes.keys()) store.close() # generate building yaml metadata for hes_house_id in house_codes: nilmtk_building_id = house_codes.index(hes_house_id) + 1 building_metadata = {} building_metadata['instance'] = nilmtk_building_id building_metadata['original_name'] = int( hes_house_id) # use python int building_metadata['elec_meters'] = {} building_metadata['appliances'] = [] # initialise dict of instances of each appliance type instance_counter = {} for appliance_code in house_appliance_codes[hes_house_id]: nilmtk_meter_id = house_appliance_codes[hes_house_id].index( appliance_code) + 1 # meter metadata if appliance_code in MAINS_CODES: meter_metadata = { 'device_model': 'multivoies', 'site_meter': True } break elif appliance_code in CIRCUIT_CODES: meter_metadata = {'device_model': 'multivoies'} break elif appliance_code in TEMPERATURE_CODES: break else: # is 
appliance meter_metadata = {'device_model': 'wattmeter'} # only appliance meters at this point building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata # appliance metadata lookup_row = hes_to_nilmtk_appliance_lookup[ hes_to_nilmtk_appliance_lookup.Code == appliance_code].iloc[0] appliance_metadata = { 'original_name': lookup_row.Name, 'meters': [nilmtk_meter_id] } # appliance type appliance_metadata.update({'type': lookup_row.nilmtk_name}) # TODO appliance room # appliance instance number if instance_counter.get(lookup_row.nilmtk_name) == None: instance_counter[lookup_row.nilmtk_name] = 0 instance_counter[lookup_row.nilmtk_name] += 1 appliance_metadata['instance'] = instance_counter[ lookup_row.nilmtk_name] building_metadata['appliances'].append(appliance_metadata) building = 'building{:d}'.format(nilmtk_building_id) yaml_full_filename = join(metadata_dir, building + '.yaml') with open(yaml_full_filename, 'w') as outfile: #print(building_metadata) outfile.write(yaml.dump(building_metadata)) # write yaml metadata to hdf5 convert_yaml_to_hdf5(metadata_dir, output_filename) # remote the temporary dir when finished shutil.rmtree(tmp_dir)
def convert_deps(deps_path, input_filename, output_filename, format='HDF'): """ Parameters ---------- deps_path : str The root path of the DEPS dataset. e.g 'C:/data/deps' input_filename : str The rawdata filename (including path and suffix). e.g 'C:/data/rawdata.csv' output_filename : str The destination HDF5 filename (including path and suffix). e.g 'C:/data/deps/DEPS_data.h5' format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' Meters & Measurements : ---------- Measurement assignment (idMeasurement) in rawdata to REDD format Measurements id's Units Meters Name 14011 14012 --> W VAr --> Main_RST 14001 14007 14014 14017 --> V A W VAr --> Main_R 14002 14008 14015 14018 --> V A W VAr --> Main_S 14003 14009 14016 14019 --> V A W VAr --> Main_T 13001 --> W --> Lights_1 13002 --> W --> Lights_2 10003 10006 10014 10018 --> V A W VAr --> HVAC_1 10002 10005 10013 10017 --> V A W VAr --> HVAC_2 10001 10004 10012 10016 --> V A W VAr --> HVAC_4 21001 21002 21003 21005 --> V A W VAr --> Rack Example ---------- raw_data.csv (input_filename): -- idMeasurement, UNIX_timestamp(tStampUTC), dataValue 14011, 1583103600, 123 14012, 1583103600, -416 14011, 1583103601, 126 14012, 1583103601, -416 ... ... ... 14011, 1583535599, 121 14012, 1583535599, -411 Outputs REDD format: deps_path/classroom1/ : -- channel_1.dat: 1583103600 123 -416 1583103600 126 -416 ... ... ... 1583103600 121 -411 -- labels.dat: 1 Main_RST Output HDF5 file: output_filename.h5 """ #-------------------------------------------------------------------- # writed by Andrés Arias Silva # Raw data converter to REDD format extracted from DEPS SQL database _deps_to_redd_format(deps_path, input_filename) #-------------------------------------------------------------------- def _deps_measurement_mapping_func(classroom_id, chan_id): if chan_id == 1: meas = ([('power', 'active'), ('power', 'reactive')]) elif chan_id > 1 and chan_id <= 4: meas = ([('voltage', ''), ('current', ''), ('power', 'active'), ('power', 'reactive')]) elif chan_id > 4 and chan_id <= 6: meas = ([('power', 'active')]) elif chan_id > 6 and chan_id <= 10: meas = ([ ('voltage', ''), ('current', ''), ('power', 'active'), ('power', 'reactive'), ]) else: raise NameError('incorrect channel number') return meas # Open DataStore store = get_datastore(output_filename, format, mode='w') # Convert raw data to DataStore _convert(deps_path, store, _deps_measurement_mapping_func, 'Europe/Madrid') # s=join(get_module_directory(), # 'dataset_converters', # 'deps', # 'metadata') # Add metadata save_yaml_to_datastore( join(get_module_directory(), 'dataset_converters', 'deps', 'metadata'), store) store.close() print("Done converting DEPS data to HDF5!")
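# Example call using the paths shown in the docstring above:
convert_deps('C:/data/deps', 'C:/data/rawdata.csv', 'C:/data/deps/DEPS_data.h5')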
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None): metadata = { 'name': 'HES', 'geographic_coordinates': (51.464462,-0.076544), # London 'timezone': 'Europe/London' } # Open DataStore store = get_datastore(output_filename, format, mode='w') # load list of appliances hes_to_nilmtk_appliance_lookup = pd.read_csv(join(get_module_directory(), 'dataset_converters', 'hes', 'hes_to_nilmtk_appliance_lookup.csv')) # load list of houses hes_house_ids = load_list_of_house_ids(data_dir) nilmtk_house_ids = np.arange(1,len(hes_house_ids)+1) hes_to_nilmtk_house_ids = dict(zip(hes_house_ids, nilmtk_house_ids)) # array of hes_house_codes: nilmtk_building_code = house_codes.index(hes_house_code) house_codes = [] # map house_appliance_codes = dict() # Iterate over files for filename in FILENAMES: # Load appliance energy data chunk-by-chunk full_filename = join(data_dir, filename) print('loading', full_filename) try: reader = pd.read_csv(full_filename, names=COL_NAMES, index_col=False, chunksize=CHUNKSIZE) except IOError as e: print(e, file=stderr) continue # Iterate over chunks in file chunk_i = 0 for chunk in reader: if max_chunks is not None and chunk_i >= max_chunks: break print(' processing chunk', chunk_i, 'of', filename) # Convert date and time columns to np.datetime64 objects dt = chunk['date'] + ' ' + chunk['time'] del chunk['date'] del chunk['time'] chunk['datetime'] = dt.apply(datetime_converter) # Data is either tenths of a Wh or tenths of a degree chunk['data'] *= 10 chunk['data'] = chunk['data'].astype(np.float32) # Iterate over houses in chunk for hes_house_id, hes_house_id_df in chunk.groupby('house id'): if hes_house_id not in house_codes: house_codes.append(hes_house_id) if hes_house_id not in house_appliance_codes.keys(): house_appliance_codes[hes_house_id] = [] nilmtk_house_id = house_codes.index(hes_house_id)+1 # Iterate over appliances in house for appliance_code, appliance_df in chunk.groupby('appliance code'): if appliance_code not in house_appliance_codes[hes_house_id]: house_appliance_codes[hes_house_id].append(appliance_code) nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1 _process_meter_in_chunk(nilmtk_house_id, nilmtk_meter_id, hes_house_id_df, store, appliance_code) chunk_i += 1 print('houses with some data loaded:', house_appliance_codes.keys()) store.close() # generate building yaml metadata for hes_house_id in house_codes: nilmtk_building_id = house_codes.index(hes_house_id)+1 building_metadata = {} building_metadata['instance'] = nilmtk_building_id building_metadata['original_name'] = int(hes_house_id) # use python int building_metadata['elec_meters'] = {} building_metadata['appliances'] = [] # initialise dict of instances of each appliance type instance_counter = {} for appliance_code in house_appliance_codes[hes_house_id]: nilmtk_meter_id = house_appliance_codes[hes_house_id].index(appliance_code)+1 # meter metadata if appliance_code in MAINS_CODES: meter_metadata = {'device_model': 'multivoies', 'site_meter': True} break elif appliance_code in CIRCUIT_CODES: meter_metadata = {'device_model': 'multivoies'} break elif appliance_code in TEMPERATURE_CODES: break else: # is appliance meter_metadata = {'device_model': 'wattmeter'} # only appliance meters at this point building_metadata['elec_meters'][nilmtk_meter_id] = meter_metadata # appliance metadata lookup_row = hes_to_nilmtk_appliance_lookup[hes_to_nilmtk_appliance_lookup.Code==appliance_code].iloc[0] appliance_metadata = {'original_name': lookup_row.Name, 'meters': 
[nilmtk_meter_id] } # appliance type appliance_metadata.update({'type': lookup_row.nilmtk_name}) # TODO appliance room # appliance instance number if instance_counter.get(lookup_row.nilmtk_name) == None: instance_counter[lookup_row.nilmtk_name] = 0 instance_counter[lookup_row.nilmtk_name] += 1 appliance_metadata['instance'] = instance_counter[lookup_row.nilmtk_name] building_metadata['appliances'].append(appliance_metadata) building = 'building{:d}'.format(nilmtk_building_id) yaml_full_filename = join(_get_module_directory(), 'metadata', building + '.yaml') with open(yaml_full_filename, 'w') as outfile: #print(building_metadata) outfile.write(yaml.dump(building_metadata)) # write yaml metadata to hdf5 convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename)
f = open(wattsFile, 'rb') watts = pickle.load(f) f.close() f = open(timeStampsFile, 'rb') timeStamps = pickle.load(f) f.close() f = open(appliancesFile, 'rb') appliances = pickle.load(f) f.close() watts = np.array(watts) appliances = np.array(appliances) timeStamps = np.array(timeStamps) store = get_datastore(outputFilename, 'HDF', mode='w') # breakdown the data by appliance and set every time point where # the appliance wasnt used to 0 for instance, app in enumerate(np.unique(appliances)): # get the time points where a given appliance is on and # also where it is off appIndices = np.where(appliances == app)[0] nonAppIndices = np.where(appliances != app)[0] # keep only the data for when the appliance is on wattsFiltered = np.delete(np.copy(watts), nonAppIndices) timeFiltered = np.delete(np.copy(timeStamps), nonAppIndices) # create zeroed data when the appliance is off
def convert_gjw(gjw_path, output_filename): """ Parameters ---------- gjw_path : str The root path of the gjw dataset. output_filename : str The destination filename (including path and suffix), will default if not specified directory and file structure nilm_gjw_data building<1> elec 4-POWER_REAL_FINE <date> Dump.csv 5-POWER_REACTIVE_STANDARD <date> Dump.csv ... ... building<n> HDF5 nilm_gjw_data.hdf5 metadata building1.yaml dataset.yaml meter_devices.yaml other files """ if gjw_path is None: gjw_path = home_dir check_directory_exists(gjw_path) os.chdir(gjw_path) gjw_path = os.getcwd( ) # sort out potential issue with slashes or backslashes if output_filename is None: output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5') # Open data store print('opening datastore', output_filename) store = get_datastore(output_filename, format, mode='w') # walk the directory tree from the dataset home directory #clear dataframe & add column headers df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME]) found = False for current_dir, _, files in os.walk(gjw_path): #unused second parameter of for dirs_in_current_dir if current_dir.find('.git') != -1 or current_dir.find('.ipynb') != -1: #print( 'Skipping ', current_dir) continue print('checking', current_dir) m = bld_re.search(current_dir) if m: #The csv files may be further down the tree so this section may be repeated building_name = m.group() building_nbr = int(bld_nbr_re.search(building_name).group()) meter_nbr = 1 key = Key(building=building_nbr, meter=meter_nbr) for items in fnmatch.filter(files, "4*.csv"): # process any .CSV files found found = True ds = iso_date_re.search(items).group() # print( 'found files for date:', ds,end=" ") # found files to process df1 = _read_file_pair(current_dir, ds) # read two csv files into a dataframe df = pd.concat( [df, df1]) # concatenate the results into one long dataframe if found: found = False df = _prepare_data_for_toolkit(df) _summarise_dataframe(df, 'Prepared for tool kit') store.put(str(key), df) #clear dataframe & add column headers #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME]) break # only 1 folder with .csv files at present store.close() convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename) print("Done converting gjw to HDF5!")
def convert_sortd(input_path, output_filename, format='HDF'): """Converts the dataset to NILMTK HDF5 format. For more information about the SOR test dataset, contact Samuel Marisa. Parameters ---------- input_path : str The root path of the dataset. It is assumed that the YAML metadata is in 'input_path/metadata'. output_filename : str The destination filename (including path and suffix). format : str format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF' Example usage: -------------- convert('/sortd', 'store.h5') """ print( 'Attempting to convert the SORTD dataset at %s into %s in NILMTK %s format...' % (input_path, output_filename, format)) # Ensure that the input directory exists check_directory_exists(input_path) # Load the dataset metadata with open(join(input_path, 'metadata/dataset.yaml'), 'r') as stream: dataset_metadata = yaml.safe_load(stream) # Open the datastore store = get_datastore(output_filename, format, mode='w') # Iterate through all building metadata files found in the dataset for metadata_file in glob.glob( join(input_path, 'metadata/building[0-9]*.yaml')): # Load the building metadata with open(metadata_file, 'r') as stream: metadata = yaml.safe_load(stream) building_id = int(metadata['instance']) print('==> Loading building %d defined at %s. Please wait...' % (building_id, metadata_file)) for meter_id, meter_data in metadata['elec_meters'].items(): meter_id = int(meter_id) key = Key(building=building_id, meter=meter_id) # Load the raw data from the data location print(' - Loading meter %s from %s...' % (meter_id, meter_data['data_location'])) columns = [('power', 'active')] df = pd.read_csv(join(input_path, meter_data['data_location']), sep=',', names=columns, dtype={m: np.float32 for m in columns}) # Convert the timestamp index column to timezone-aware datetime df.index = pd.to_datetime(df.index.values, unit='s', utc=True) df = df.tz_convert(dataset_metadata['timezone']) #df = pd.read_csv(join(input_path, db_file), sep=';', names=('Datetime', 'P1', 'P2', 'P3'), dtype={'P1': np.float64, 'P2': np.float64, 'P3': np.float64}, parse_dates=[1]) print(df.info()) print(df.head()) #print(df.tail()) print(" - Storing data under key %s in the datastore..." % (str(key))) store.put(str(key), df) print(" - Building %s loaded!" % (building_id)) print("Adding the metadata into the store...") save_yaml_to_datastore(join(input_path, 'metadata'), store) print("Closing the store...") store.close() print("Done converting SORTD dataset to HDF5!")
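# Example call taken from the docstring above:
convert_sortd('/sortd', 'store.h5')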