def refresh_gjw_metadata(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix),
        will default if not specified.

    Directory and file structure::

        nilm_gjw_data
            building<1>
                elec
                    4-POWER_REAL_FINE <date> Dump.csv
                    5-POWER_REACTIVE_STANDARD <date> Dump.csv
                    ...
            ...
            building<n>
            HDF5
                nilm_gjw_data.hdf5
            metadata
                building1.yaml
                dataset.yaml
                meter_devices.yaml
            other files
    """
    if gjw_path is None:
        gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done refreshing metadata")
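# Usage sketch for the converter above (paths are illustrative only; the real
# defaults come from the module-level home_dir):
#
#   refresh_gjw_metadata('/data/nilm_gjw_data',
#                        '/data/nilm_gjw_data/HDF5/nilm_gjw_data.hdf5')
#   refresh_gjw_metadata(None, None)   # fall back to the home_dir defaults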
def convert_combed(combed_path, hdf_filename):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    check_directory_exists(combed_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    chan = 1
    for building, meter_array in SUBMETER_PATHS.items():
        for meter in meter_array:
            key = Key(building=1, meter=chan)
            dfs = []
            for attribute in column_mapping.keys():
                filename_attribute = join(combed_path, building, str(meter),
                                          "%s.csv" % attribute)
                print(filename_attribute)
                # header=0: skip the CSV's own header row and use `names` instead
                dfs.append(pd.read_csv(filename_attribute, parse_dates=True,
                                       index_col=0, header=0, names=[attribute]))
            total = pd.concat(dfs, axis=1)
            total.rename(columns=lambda x: column_mapping[x], inplace=True)
            total.columns.set_names(LEVEL_NAMES, inplace=True)
            store.put(str(key), total, format='table')
            store.flush()
            chan = chan + 1

    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename)
    print("Done converting COMBED to HDF5!")
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') for building_name, building_mapping in overall_dataset_mapping.iteritems(): for load_name, load_mapping in building_mapping.iteritems(): for load_mapping_path, meter_number in load_mapping.iteritems(): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) print(filename_attribute) dfs.append(pd.read_csv(filename_attribute, parse_dates=True, index_col=0, header=True, names=[attribute])) total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.rename(columns=lambda x: column_mapping[x], inplace=True) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def convert_ampds(input_path, output_filename, format='HDF'): """ Convert AMPds R2013 as seen on Dataverse. Download the files as CSVs and put them in the `input_path` folder for conversion. Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to HDF5 Example usage: -------------- convert('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [ f for f in listdir(input_path) if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f ] # Sorting Lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode='w') for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...') df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(_get_module_directory(), 'metadata') print('Processing metadata...') convert_yaml_to_hdf5(metadata_path, output_filename)
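# Self-contained sketch of the timestamp and column handling used above, with
# pandas only. The two-level column names ('physical_quantity', 'type') follow
# the usual nilmtk LEVEL_NAMES convention (treat that naming as an assumption):
import pandas as pd

raw = pd.DataFrame({'TIMESTAMP': [1333263600, 1333263660],
                    'P': [120.0, 125.5]})
raw.index = pd.to_datetime(raw['TIMESTAMP'], unit='s', utc=True)
raw = raw.drop(columns=['TIMESTAMP']).tz_convert('America/Vancouver')
raw.columns = pd.MultiIndex.from_tuples([('power', 'active')],
                                        names=('physical_quantity', 'type'))
print(raw)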
def _convert(input_path, data_store, measurement_mapping_func,
             sort_index=True, drop_duplicates=False):
    meter_to_machine = {
        1: "MainTerminal",
        2: "ChipPress",
        3: "ChipSaw",
        4: "HighTemperatureOven",
        5: "PickAndPlaceUnit",
        6: "ScreenPrinter",
        7: "SolderingOven",
        8: "VacuumOven",
        9: "VacuumPump1",
        10: "VacuumPump2",
        11: "WashingMachine",
    }

    check_directory_exists(input_path)

    print("Loading factory 1...", end="... ")
    chans = _find_all_channels(input_path, meter_to_machine)
    for chan_id, filename in chans.items():
        print(chan_id, end=" ")
        stdout.flush()
        key = Key(building=1, meter=chan_id)
        measurements = measurement_mapping_func(chan_id)
        df = _load_csv(filename, measurements,
                       sort_index=sort_index, drop_duplicates=drop_duplicates)
        data_store.put(str(key), df)
    print()
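# _find_all_channels() and _load_csv() are module helpers that are not shown
# here. A rough, hypothetical stand-in for _load_csv() with this signature,
# modelled on the other converters in this file (not the real implementation):
def _load_csv_sketch(filename, columns, sort_index=True, drop_duplicates=False):
    import pandas as pd
    df = pd.read_csv(filename, index_col=0, parse_dates=True)
    df.columns = pd.MultiIndex.from_tuples(columns)  # e.g. [('power', 'active')]
    if sort_index:
        df = df.sort_index()
    if drop_duplicates:
        df = df[~df.index.duplicated(keep='first')]
    return df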
def test_all_datasets(directory):
    # time.ctime() gives a readable local timestamp (the time module has no now())
    print("Testing all data sets started at: {}".format(time.ctime()))
    print("-" * 60)
    check_directory_exists(directory)
    datasets = [f for f in listdir(directory)
                if isfile(join(directory, f)) and '.h5' in f and '.swp' not in f]
    for dataset in datasets:
        test_single_dataset(join(directory, dataset))
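# test_single_dataset() is not defined in this file. A minimal placeholder
# (an assumption) would simply open the file with nilmtk and dump its metadata:
def test_single_dataset_sketch(h5_path):
    from nilmtk import DataSet
    from nilmtk.utils import print_dict  # print_dict location assumed
    dataset = DataSet(h5_path)
    print_dict(dataset.metadata)
    for building_id, building in dataset.buildings.items():
        print(building_id, building.elec)
    dataset.store.close()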
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') any_file_converted = False for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) if not os.path.isfile(filename_attribute): # File not found directly in the combed_path provided # Try adding 'iiitd' to it filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns]) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) any_file_converted = True if not any_file_converted: raise RuntimeError('No files converted, did you specify the correct path?') convert_yaml_to_hdf5( join(get_module_directory(), 'dataset_converters', 'combed', 'metadata'), output_filename ) print("Done converting COMBED to HDF5!")
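# Sketch of the column_mapping -> MultiIndex step used above, with a toy
# mapping (the real, module-level column_mapping is not shown here; the
# ('physical_quantity', 'type') level names follow nilmtk's usual convention):
import pandas as pd
toy_column_mapping = {'Power': ('power', 'active'),
                      'Energy': ('energy', 'active')}
toy = pd.DataFrame({'Power': [10.0], 'Energy': [0.5]})
toy.columns = pd.MultiIndex.from_tuples([toy_column_mapping[c] for c in toy.columns])
toy.columns.set_names(['physical_quantity', 'type'], inplace=True)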
def _convert(input_path, store, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REFIT dataset.
    store : DataStore
        The NILMTK DataStore object.
    tz : str
        Timezone e.g. 'US/Eastern'
    sort_index : bool
    """
    check_directory_exists(input_path)

    # Iterate though all houses and channels
    # house 14 is missing!
    houses = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
              11, 12, 13, 15, 16, 17, 18, 19, 20, 21]
    nilmtk_house_id = 0
    for house_id in houses:
        nilmtk_house_id += 1
        print("Loading house", house_id, end="... ")
        stdout.flush()
        csv_filename = join(input_path, 'House_' + str(house_id) + '.csv')

        # The clean version already includes a header, so we just skip
        # the text version of the timestamp
        usecols = ['Unix', 'Aggregate',
                   'Appliance1', 'Appliance2', 'Appliance3', 'Appliance4',
                   'Appliance5', 'Appliance6', 'Appliance7', 'Appliance8',
                   'Appliance9']

        df = _load_csv(csv_filename, usecols, tz)
        if sort_index:
            df = df.sort_index()  # might not be sorted...
        chan_id = 0
        for col in df.columns:
            chan_id += 1
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=nilmtk_house_id, meter=chan_id)

            chan_df = pd.DataFrame(df[col])
            chan_df.columns = pd.MultiIndex.from_tuples([('power', 'active')])

            # Modify the column labels to reflect the power measurements recorded.
            chan_df.columns.set_names(LEVEL_NAMES, inplace=True)

            store.put(str(key), chan_df)
        print('')
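# _load_csv() for REFIT is not shown. Given the 'Unix' epoch-seconds column
# selected above, a plausible sketch (an assumption, not the actual helper):
def _load_csv_refit_sketch(filename, usecols, tz):
    import pandas as pd
    df = pd.read_csv(filename, usecols=usecols)
    df.index = pd.to_datetime(df['Unix'], unit='s', utc=True)
    df = df.drop(columns=['Unix']).tz_convert(tz)
    return df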
def convert_ampds(input_path, output_filename, format='HDF'): """ Convert AMPds R2013 as seen on Dataverse. Download the files as CSVs and put them in the `input_path` folder for conversion. Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to HDF5 Example usage: -------------- convert('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f] # Sorting Lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode='w') for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...') df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(get_module_directory(), 'dataset_converters', 'ampds', 'metadata') print('Processing metadata...') convert_yaml_to_hdf5(metadata_path, output_filename)
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') any_file_converted = False for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute) if not os.path.isfile(filename_attribute): # File not found directly in the combed_path provided # Try adding 'iiitd' to it filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns]) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) any_file_converted = True if not any_file_converted: raise RuntimeError('No files converted, did you specify the correct path?') convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def _convert(input_path, store, measurement_mapping_func, tz,
             sort_index=True, drop_duplicates=False):
    """
    Parameters
    ----------
    input_path : str
        The root path of the DEPS dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - classroom_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str
        Timezone e.g. 'US/Eastern'
    sort_index : bool
        Defaults to True
    drop_duplicates : bool
        Remove entries with duplicated timestamp (keeps the first value)
        Defaults to False for backwards compatibility.
    """
    check_directory_exists(input_path)

    # Iterate though all classrooms and channels
    classrooms = _find_all_classrooms(input_path)
    for classroom_id in classrooms:
        print("Loading data from 'Aula 2.2 Bis' to classroom N°", classroom_id,
              end=" ... Loading channels ")
        stdout.flush()
        chans = _find_all_chans(input_path, classroom_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=classroom_id, meter=chan_id)
            measurements = measurement_mapping_func(classroom_id, chan_id)
            csv_filename = _get_csv_filename(input_path, key)
            df = _load_csv(csv_filename, measurements, tz,
                           sort_index=sort_index,
                           drop_duplicates=drop_duplicates)
            store.put(str(key), df)
        print()
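# The drop_duplicates flag above keeps the first sample of each repeated
# timestamp. One common pandas idiom for that (self-contained demo; the real
# _load_csv may do it differently):
import pandas as pd
idx = pd.to_datetime(['2020-01-01 00:00:00',
                      '2020-01-01 00:00:00',
                      '2020-01-01 00:00:01'])
s = pd.Series([1.0, 2.0, 3.0], index=idx)
deduped = s[~s.index.duplicated(keep='first')]  # keeps the 1.0 and 3.0 samples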
def convert_ampds(inputPath, hdfFilename): ''' Parameters: ----------- inputPath: str The path of the directory where all the csv files are supposed to be stored hdfFilename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. Example usage: -------------- convert('/AMPds/electricity', 'store.h5') ''' check_directory_exists(inputPath) files = [f for f in listdir(inputPath) if isfile(join(inputPath, f)) and '.csv' in f and '.swp' not in f] # Sorting Lexicographically files.sort() print(files) # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(inputPath) store = HDFStore(hdfFilename) for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...') df = pd.read_csv(join(inputPath, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df["TIMESTAMP"], unit='s', utc = True) df = df.drop('TIMESTAMP', 1) df = df.tz_localize('GMT').tz_convert('America/Vancouver') df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.convert_objects(convert_numeric=True) df = df.dropna() df = df.astype(np.float32) store.put(str(key), df, format='Table') store.flush() print("Done with file #", (i + 1)) store.close() metadataPath = join(_get_module_directory(), 'metadata') print('Processing metadata...') convert_yaml_to_hdf5(metadataPath, hdfFilename)
def convert_ampds(input_path, output_filename, format="HDF"): """ Parameters: ----------- input_path: str The path of the directory where all the csv files are supposed to be stored output_filename: str The path of the h5 file where all the standardized data is supposed to go. The path should refer to a particular file and not just a random directory in order for this to work. format: str Defaults to HDF5 Example usage: -------------- convert('/AMPds/electricity', 'store.h5') """ check_directory_exists(input_path) files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and ".csv" in f and ".swp" not in f] # Sorting Lexicographically files.sort() # Remove Whole Home and put it at top files.remove("WHE.csv") files.insert(0, "WHE.csv") assert isdir(input_path) store = get_datastore(output_filename, format, mode="w") for i, csv_file in enumerate(files): key = Key(building=1, meter=(i + 1)) print("Loading file #", (i + 1), " : ", csv_file, ". Please wait...") df = pd.read_csv(join(input_path, csv_file)) # Due to fixed width, column names have spaces :( df.columns = [x.replace(" ", "") for x in df.columns] df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit="s", utc=True) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df = df.tz_localize("GMT").tz_convert(TIMEZONE) df.rename(columns=lambda x: columnNameMapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.convert_objects(convert_numeric=True) df = df.dropna() df = df.astype(np.float32) store.put(str(key), df) print("Done with file #", (i + 1)) store.close() metadata_path = join(_get_module_directory(), "metadata") print("Processing metadata...") convert_yaml_to_hdf5(metadata_path, output_filename)
def convert_combed(combed_path, output_filename, format='HDF'): """ Parameters ---------- combed_path : str The root path of the combed dataset. output_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(combed_path) # Open store store = get_datastore(output_filename, format, mode='w') for building_name, building_mapping in iteritems(overall_dataset_mapping): for load_name, load_mapping in iteritems(building_mapping): for load_mapping_path, meter_number in iteritems(load_mapping): building_number = building_number_mapping[building_name] key = Key(building=building_number, meter=meter_number) dfs = [] for attribute in column_mapping.keys(): filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" % attribute) if os.path.isfile(filename_attribute): exists = True print(filename_attribute) df = pd.read_csv(filename_attribute, header=True, names=["timestamp", attribute]) df.index = pd.to_datetime(df["timestamp"], unit='ms') df = df.drop("timestamp", 1) dfs.append(df) else: exists = False if exists: total = pd.concat(dfs, axis=1) total = total.tz_localize('UTC').tz_convert('Asia/Kolkata') total.rename(columns=lambda x: column_mapping[x], inplace=True) total.columns.set_names(LEVEL_NAMES, inplace=True) assert total.index.is_unique store.put(str(key), total) convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting COMBED to HDF5!")
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ) idx = idx.tz_localize('GMT').tz_convert(TIMEZONE) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 12): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename, dtype=np.float64, na_values='\\N') df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.columns = pd.MultiIndex.from_tuples( [column_mapping[x] for x in df.columns], names=LEVEL_NAMES) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() df = reindex_fill_na(df, idx) assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata') convert_yaml_to_hdf5(metadata_dir, output_filename) print("Done converting iAWE to HDF5!")
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ) idx = idx.tz_localize('GMT').tz_convert(TIMEZONE) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 12): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename, dtype=np.float64, na_values='\\N') df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.apply(pd.to_numeric, errors='ignore') df = df.dropna() df = df.astype(np.float32) df = df.sort_index() df = df.resample("1T").mean() df = reindex_fill_na(df, idx) assert df.isnull().sum().sum() == 0 store.put(str(key), df) store.close() metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata') convert_yaml_to_hdf5(metadata_dir, output_filename) print("Done converting iAWE to HDF5!")
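# reindex_fill_na() is a module helper that is not shown. Given the assert that
# no NaNs remain afterwards, a plausible sketch is a reindex onto the regular
# 1-minute index followed by a fill (the fill strategy here is a guess):
def reindex_fill_na_sketch(df, idx, fill_value=0.0):
    return df.reindex(idx).fillna(fill_value)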
def _convert_to_datastore(input_path, store, tz):
    check_directory_exists(input_path)
    homes = _get_all_homes(input_path)
    for home in homes:
        home_id = int(re.search(r"home_([\d]*).csv", home).group(1))
        csv_filename = join(input_path, home)
        dtype_dict = {m: np.float32 for m in MAJOR_LOAD}
        # Parse the time column as dates rather than forcing a dtype on it
        # (pd.datetime is deprecated and read_csv dtypes cannot parse dates).
        whole_df = pd.read_csv(csv_filename, index_col=TIME_INDEX,
                               dtype=dtype_dict, parse_dates=[TIME_INDEX])
        whole_df.index.name = None
        print("processing ", home_id, end="... ")
        for meter in MAJOR_LOAD:
            meter_id = int(MAJOR_LOAD.index(meter)) + 1
            table_key = Key(building=home_id, meter=meter_id)
            table_df = _load_csv(whole_df, meter, tz)
            table_df = table_df.sort_index()  # sort_index() is not in-place
            store.put(str(table_key), table_df)
            print(meter, end=" ")
        print("finished", end="!")
        print()
def _convert(input_path, hdf_filename, measurement_mapping_func, tz): """ Parameters ---------- input_path : str The root path of the REDD low_freq dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). measurement_mapping_func : function Must take these parameters: - house_id - chan_id Function should return a list of tuples e.g. [('power', 'active')] tz : str Timezone e.g. 'US/Eastern' """ check_directory_exists(input_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') # Iterate though all houses and channels houses = _find_all_houses(input_path) for house_id in houses: print("Loading house", house_id, end="... ") stdout.flush() chans = _find_all_chans(input_path, house_id) for chan_id in chans: print(chan_id, end=" ") stdout.flush() key = Key(building=house_id, meter=chan_id) measurements = measurement_mapping_func(house_id, chan_id) csv_filename = _get_csv_filename(input_path, key) df = _load_csv(csv_filename, measurements, tz) df = df.sort_index() # raw REDD data isn't always sorted store.put(str(key), df, format='table') store.flush() print() store.close()
def convert_iawe(iawe_path, hdf_filename): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. hdf_filename : str The destination HDF5 filename (including path and suffix). """ check_directory_exists(iawe_path) # Open HDF5 file store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 13): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename) df.index = pd.to_datetime( (df.timestamp.values * 1E9).astype(int), utc=True) df = df.tz_convert('Asia/Kolkata') df = df.drop('timestamp', 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.convert_objects(convert_numeric=True) df = df.dropna() df = df.astype(np.float32) df = df.sort_index() store.put(str(key), df, format='table') store.flush() store.close() convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), hdf_filename) print("Done converting iAWE to HDF5!")
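# The (timestamp * 1E9).astype(int) step above turns epoch seconds into the
# nanosecond integers that pd.to_datetime() assumes by default; the newer
# converters do the same thing more directly with unit='s'. Quick check:
import pandas as pd
ts = 1333263600
assert pd.to_datetime(int(ts * 1e9), utc=True) == pd.to_datetime(ts, unit='s', utc=True)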
def convert_iawe(iawe_path, output_filename, format="HDF"): """ Parameters ---------- iawe_path : str The root path of the iawe dataset. output_filename : str The destination filename (including path and suffix). """ check_directory_exists(iawe_path) # Open data store store = get_datastore(output_filename, format, mode='w') electricity_path = join(iawe_path, "electricity") # Mains data for chan in range(1, 13): key = Key(building=1, meter=chan) filename = join(electricity_path, "%d.csv" % chan) print('Loading ', chan) df = pd.read_csv(filename) df.drop_duplicates(subset=["timestamp"], inplace=True) df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True) df = df.tz_convert(TIMEZONE) df = df.drop(TIMESTAMP_COLUMN_NAME, 1) df.rename(columns=lambda x: column_mapping[x], inplace=True) df.columns.set_names(LEVEL_NAMES, inplace=True) df = df.convert_objects(convert_numeric=True) df = df.dropna() df = df.astype(np.float32) df = df.sort_index() store.put(str(key), df) store.close() convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'), output_filename) print("Done converting iAWE to HDF5!")
def _convert(input_path, store, measurement_mapping_func, tz, sort_index=True): """ Parameters ---------- input_path : str The root path of the LAB dataset. store : DataStore The NILMTK DataStore object. measurement_mapping_func : function Must take these parameters: - house_id - chan_id Function should return a list of tuples e.g. [('power', 'active')] tz : str Timezone e.g. 'America/Fortaleza' sort_index : bool """ check_directory_exists(input_path) # Iterate though all houses and channels houses = _find_all_houses(input_path) for house_id in houses: print("Loading house", house_id, end="... ") stdout.flush() chans = _find_all_chans(input_path, house_id) for chan_id in chans: print(chan_id, end=" ") stdout.flush() key = Key(building=house_id, meter=chan_id) measurements = measurement_mapping_func(house_id, chan_id) csv_filename = _get_csv_filename(input_path, key) df = _load_csv(csv_filename, measurements, tz) if sort_index: df = df.sort_index() # raw LAB data isn't always sorted store.put(str(key), df) print()
def _convert(input_path, store, measurement_mapping_func, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REDD low_freq dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str
        Timezone e.g. 'US/Eastern'
    sort_index : bool
    """
    check_directory_exists(input_path)

    # Iterate though all houses and channels
    houses = _find_all_houses(input_path)
    for house_id in houses:
        print("Loading house", house_id, end="... ")
        stdout.flush()
        chans = _find_all_chans(input_path, house_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=house_id, meter=chan_id)
            measurements = measurement_mapping_func(house_id, chan_id)
            csv_filename = _get_csv_filename(input_path, key)
            df = _load_csv(csv_filename, measurements, tz)
            if sort_index:
                df = df.sort_index()  # raw REDD data isn't always sorted
            store.put(str(key), df)
        print()
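# str(key) is what the datastore uses as the group path. With nilmtk's Key it
# renders roughly like this (exact form depends on the nilmtk version):
#
#   >>> str(Key(building=1, meter=2))
#   '/building1/elec/meter2'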
def convert_gjw(gjw_path, output_filename): """ Parameters ---------- gjw_path : str The root path of the gjw dataset. output_filename : str The destination filename (including path and suffix), will default if not specified directory and file structure nilm_gjw_data building<1> elec 4-POWER_REAL_FINE <date> Dump.csv 5-POWER_REACTIVE_STANDARD <date> Dump.csv ... ... building<n> HDF5 nilm_gjw_data.hdf5 metadata building1.yaml dataset.yaml meter_devices.yaml other files """ if gjw_path is None: gjw_path = home_dir check_directory_exists(gjw_path) os.chdir(gjw_path) gjw_path = os.getcwd() # sort out potential issue with slashes or backslashes if output_filename is None: output_filename =join(home_dir,'HDF5','nilm_gjw_data.hdf5') # Open data store print( 'opening datastore', output_filename) store = get_datastore(output_filename, format, mode='w') # walk the directory tree from the dataset home directory #clear dataframe & add column headers df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME]) found = False for current_dir, _, files in os.walk(gjw_path): #unused second parameter of for dirs_in_current_dir if current_dir.find('.git')!=-1 or current_dir.find('.ipynb') != -1: #print( 'Skipping ', current_dir) continue print( 'checking', current_dir) m = bld_re.search(current_dir) if m: #The csv files may be further down the tree so this section may be repeated building_name = m.group() building_nbr = int(bld_nbr_re.search(building_name).group()) meter_nbr = 1 key = Key(building=building_nbr, meter=meter_nbr) for items in fnmatch.filter(files, "4*.csv"): # process any .CSV files found found = True ds = iso_date_re.search(items).group() # print( 'found files for date:', ds,end=" ") # found files to process df1 = _read_file_pair(current_dir,ds) # read two csv files into a dataframe df = pd.concat([df,df1]) # concatenate the results into one long dataframe if found: found = False df = _prepare_data_for_toolkit(df) _summarise_dataframe(df,'Prepared for tool kit') store.put(str(key), df) #clear dataframe & add column headers #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME]) break # only 1 folder with .csv files at present store.close() convert_yaml_to_hdf5(join(gjw_path, 'metadata'),output_filename) print("Done converting gjw to HDF5!")
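# bld_re, bld_nbr_re and iso_date_re are module-level regular expressions that
# are not shown. Plausible definitions matching the directory layout described
# in the docstring (assumptions, for illustration only):
import re
bld_re = re.compile(r'building\d+')              # e.g. 'building1'
bld_nbr_re = re.compile(r'\d+')                  # number inside the folder name
iso_date_re = re.compile(r'\d{4}-\d{2}-\d{2}')   # date embedded in '4-... <date> Dump.csv'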
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print(directory_list) found_any_sm = False found_any_plug = False # Traversing every folder for folder in directory_list: if folder[0] == '.' or folder[-3:] == '.h5': print('Skipping ', folder) continue #Building number and meter_flag building_no = int(folder[:2]) meter_flag = None if 'sm_csv' in folder: meter_flag = 'sm' elif 'plugs' in folder: meter_flag = 'plugs' else: print('Skipping folder', folder) continue print('Computing for folder', folder) dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] dir_list.sort() if meter_flag == 'plugs' and len(dir_list) < 3: # Try harder to find the subfolders folder = join(folder, folder[:2]) dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] print('Current dir list:', dir_list) for fl in dir_list: print('Computing for folder ', fl) fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i] fl_dir_list.sort() if meter_flag == 'sm': for fi in fl_dir_list: found_any_sm = True df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32) for phase in range(1,4): key = str(Key(building=building_no, meter=phase)) df_phase = df.loc[:,[1+phase, 5+phase, 8+phase, 13+phase]] # get reactive power power = df_phase.loc[:, (1+phase, 13+phase)].values reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180) df_phase['Q'] = reactive df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df_phase = df_phase.tz_convert(timezone) sm_column_name = { 1+phase:('power', 'active'), 5+phase:('current', ''), 8+phase:('voltage', ''), 13+phase:('phase_angle', ''), 'Q': ('power', 'reactive'), } df_phase.columns = pd.MultiIndex.from_tuples([ sm_column_name[col] for col in df_phase.columns ]) power_active = df_phase['power', 'active'] tmp_before = np.size(power_active) df_phase = df_phase[power_active != -1] power_active = df_phase['power', 'active'] tmp_after = np.size(power_active) if tmp_before != tmp_after: print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) df_phase.columns.set_names(LEVEL_NAMES, inplace=True) if not key in store: store.put(key, df_phase, format='Table') else: store.append(key, df_phase, format='Table') store.flush() print('Building', building_no, ', Meter no.', phase, '=> Done for ', fi[:-4]) else: #Meter number to be used in key meter_num = int(fl) + 3 key = str(Key(building=building_no, meter=meter_num)) current_folder = join(dataset_loc,folder,fl) if not fl_dir_list: raise RuntimeError("No CSV file found in " + current_folder) #Getting dataframe for each csv file seperately for fi in fl_dir_list: found_any_plug = True df = pd.read_csv(join(current_folder, fi), names=[1], dtype=np.float64) df.index = pd.DatetimeIndex(start=fi[:-4].replace('.', ':'), freq='s', periods=86400, tz = 'GMT') df.columns = pd.MultiIndex.from_tuples(plugs_column_name.values()) df = 
df.tz_convert(timezone) df.columns.set_names(LEVEL_NAMES, inplace=True) tmp_before = np.size(df.power.active) df = df[df.power.active != -1] tmp_after = np.size(df.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(key, df, format='Table') print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4]) else: store.append(key, df, format='Table') store.flush() print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4]) if not found_any_plug or not found_any_sm: raise RuntimeError('The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)') print("Data storage completed.") store.close() # Adding the metadata to the HDF5file print("Proceeding to Metadata conversion...") meta_path = join( get_module_directory(), 'dataset_converters', 'eco', 'metadata' ) convert_yaml_to_hdf5(meta_path, hdf_filename) print("Completed Metadata conversion.")
def test_all(path_to_directory): ''' path_to_directory: Contains the h5 files on which the tests are supposed to be run ''' check_directory_exists(path_to_directory) #files=[f for f in listdir(path_to_directory) and '.h5' in f and '.swp' not in f] files = [f for f in listdir(path_to_directory) if isfile(join(path_to_directory, f)) and '.h5' in f and '.swp' not in f] files.sort() print ("Datasets collected and sorted. Processing...") try: for i, file in enumerate(files): current_file=DataSet(join(path_to_directory, file)) print ("Printing metadata for current file...done.") print_dict(current_file.metadata) print (" Loading file # ", i, " : ", file, ". Please wait.") for building_number in range(1, len(current_file.buildings)+1): #Examine metadata for a single house elec=current_file.buildings[building_number].elec print ("The dataset being processed is : ", elec.dataset()) print ("Metadata for current file: ") print_dict(current_file.buildings[building_number].metadata) print ("Appliance label information: ", elec.appliance_label()) #print (elec.appliances) print ("Appliances:- ") for i in elec.appliances: print (i) print ("Examining sub-metered appliances...") print ("Collecting stats on meters...Done.") print (elec._collect_stats_on_all_meters) print ("Timeframe: ", elec.get_timeframe()) print ("Available power AC types: ", elec.available_power_ac_types()) print ("Clearing cache...done.") elec.clear_cache() print ("Testing if there are meters from multiple buildings. Result returned by method: ", elec.contains_meters_from_multiple_buildings()) # TODO: Find a better way to test the correlation function # print ("Testing the correlation function. ", elec.correlation(elec)) print ("List of disabled meters: ", elec.disabled_meters) print ("Trying to determine the dominant appliance: ") try: elec.dominant_appliance() except RuntimeError: print ('''More than one dominant appliance in MeterGroup! (The dominant appliance per meter should be manually specified in the metadata. If it isn't and if there are multiple appliances for a meter then NILMTK assumes all appliances on that meter are dominant. NILMTK can't automatically distinguish between multiple appliances on the same meter (at least, not without using NILM!))''') pass print ("Dropout rate: ", elec.dropout_rate()) try: print ("Calculating energy per meter:") print (elec.energy_per_meter()) print ("Calculating total entropy") print (elec.entropy()) print ("Calculating entropy per meter: ") print (elec.entropy_per_meter()) except ValueError: print ("ValueError: Total size of array must remain unchanged.") pass print ("Calculating fraction per meter.") print (elec.fraction_per_meter()) #print ("Average energy per period: ", elec.average_energy_per_period()) print ("Executing functions...") lis=[] func="" '''for function in dir(elec): try: start=time.time() if ("__" not in function or "dataframe_of_meters" not in function): func=getattr(elec, function) print ("Currently executing ", function, ". Please wait...") print (func()) # print ("cProfile stats - printed") # cProfile.run("func") end=time.time() print ("Time taken for the entire process : ", (end - start)) except AttributeError: print ("Attribute error occured. ") except TypeError: lis.append(function) print ("Warning: TypeError") pass''' print ("Plotting wiring hierarchy of meters....") elec.draw_wiring_graph() ## DISAGGREGATION STARTS HERE appliance_type="unknown" #TODO : appliance_type should cycle through all appliances and check for each of them. For this, use a list. 
selected_appliance=nilmtk.global_meter_group.select_using_appliances(type=appliance_type) appliance_restricted = MeterGroup(selected_appliance.meters) if ((appliance_restricted.proportion_of_upstream_total_per_meter()) is not None): proportion_per_appliance = appliance_restricted.proportion_of_upstream_total_per_meter() proportion_per_appliance.plot(kind='bar'); plt.title('Appliance energy as proportion of total building energy'); plt.ylabel('Proportion'); plt.xlabel('Appliance (<appliance instance>, <building instance>, <dataset name>)'); selected_appliance.select(building=building_number).total_energy() selected_appliance.select(building=1).plot(); appliance_restricted = MeterGroup(selected_appliance.meters) daily_energy = pd.DataFrame([meter.average_energy_per_period(offset_alias='D') for meter in appliance_restricted.meters]) daily_energy.plot(kind='hist'); plt.title('Histogram of daily energy'); plt.xlabel('energy (kWh)'); plt.ylabel('Occurences'); plt.legend().set_visible(False) current_file.store.window=TimeFrame(start='2012-04-01 00:00:00-05:00', end='2012-04-02 00:00:00-05:00') #elec.plot(); fraction = elec.submeters().fraction_per_meter().dropna() labels = elec.get_appliance_labels(fraction.index) plt.figure(figsize=(8,8)) fraction.plot(kind='pie', labels=labels); elec.select_using_appliances(category='heating') elec.select_using_appliances(category='single-phase induction motor') co = CombinatorialOptimisation() co.train(elec) for model in co.model: print_dict(model) disag_filename = join(data_dir, 'ampds-disag.h5') output = HDFDataStore(disag_filename, 'w') co.disaggregate(elec.mains(), output) output.close() disag = DataSet(disag_filename) disag_elec = disag.buildings[building_number].elec f1 = f1_score(disag_elec, elec) f1.index = disag_elec.get_appliance_labels(f1.index) f1.plot(kind='bar') plt.xlabel('appliance'); plt.ylabel('f-score'); disag_elec.plot() disag.store.close() except AttributeError: print ("AttributeError occured while executing. This means that the value returned by proportion_per_appliance = appliance_restricted.proportion_of_upstream_total_per_meter() is None") pass
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print directory_list # Traversing every folder for folder in directory_list: if folder[0] == '.' or folder[-3:] == '.h5': print 'Skipping ', folder continue print 'Computing for folder',folder #Building number and meter_flag building_no = int(folder[:2]) meter_flag = 'sm' if 'sm_csv' in folder else 'plugs' dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] dir_list.sort() print 'Current dir list:',dir_list for fl in dir_list: print 'Computing for folder ',fl fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i] fl_dir_list.sort() if meter_flag == 'sm': for fi in fl_dir_list: df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32) for phase in range(1,4): key = str(Key(building=building_no, meter=phase)) df_phase = df.ix[:,[1+phase, 5+phase, 8+phase, 13+phase]] # get reactive power power = df_phase.as_matrix([1+phase, 13+phase]) reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180) df_phase['Q'] = reactive df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df_phase = df_phase.tz_convert(timezone) sm_column_name = {1+phase:('power', 'active'), 5+phase:('current', ''), 8+phase:('voltage', ''), 13+phase:('phase_angle', ''), 'Q': ('power', 'reactive'), }; df_phase.rename(columns=sm_column_name, inplace=True) tmp_before = np.size(df_phase.power.active) df_phase = df_phase[df_phase.power.active != -1] tmp_after = np.size(df_phase.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) df_phase.columns.set_names(LEVEL_NAMES, inplace=True) if not key in store: store.put(key, df_phase, format='Table') else: store.append(key, df_phase, format='Table') store.flush() print 'Building',building_no,', Meter no.',phase,'=> Done for ',fi[:-4] else: #Meter number to be used in key meter_num = int(fl) + 3 key = str(Key(building=building_no, meter=meter_num)) #Getting dataframe for each csv file seperately for fi in fl_dir_list: df = pd.read_csv(join(dataset_loc,folder,fl ,fi), names=[1], dtype=np.float64) df.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz = 'GMT') df.rename(columns=plugs_column_name, inplace=True) df = df.tz_convert(timezone) df.columns.set_names(LEVEL_NAMES, inplace=True) tmp_before = np.size(df.power.active) df = df[df.power.active != -1] tmp_after = np.size(df.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(key, df, format='Table') print 'Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4] else: store.append(key, df, format='Table') store.flush() print 'Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4] print "Data storage 
completed." store.close() # Adding the metadata to the HDF5file print "Proceeding to Metadata conversion..." meta_path = join(_get_module_directory(), 'metadata') convert_yaml_to_hdf5(meta_path, hdf_filename) print "Completed Metadata conversion."
def _convert(input_path, store, measurement_mapping_func, tz, sort_index=True): """ Parameters ---------- input_path : str The root path of the REDD low_freq dataset. store : DataStore The NILMTK DataStore object. measurement_mapping_func : function Must take these parameters: - house_id - chan_id Function should return a list of tuples e.g. [('power', 'active')] tz : str Timezone e.g. 'US/Eastern' sort_index : bool """ check_directory_exists(input_path) houses = _find_all_houses(input_path) years = [] # Iterating though all Homes b_cnt = 0 for house_id in houses: b_cnt = b_cnt + 1 print('Loading Home:', house_id, end='... ') stdout.flush() years = _find_year(input_path, house_id) meters_paths_csv = [] df_all_years = pd.DataFrame() for y in years: mains_df = pd.DataFrame() meters_paths_csv = _find_all_csv_paths(input_path, house_id, y) data_frames = [] if not meters_paths_csv: continue else: k = 1 for path in meters_paths_csv: # 1.Concat csv files of all meters in each year, to get all # appliances in 1 dataframe per year temp_df = pd.read_csv(path) if k == 1: k = 0 if 'use [kW]' in temp_df.columns: mains_df = temp_df['use [kW]'] elif 'Usage [kW]' in temp_df.columns: mains_df = temp_df['Usage [kW]'] if 'Date & Time' in temp_df.columns: date_time_df = temp_df['Date & Time'] # Preprocess/clean dataframe by removing unusabe columns temp_df = _preprocess_csv(temp_df) data_frames.append(temp_df) df_year = reduce( lambda left, right: left.join( right, lsuffix='_1', rsuffix='_2'), data_frames) # Add columns 'Date & Time' and 'use [kW]' df_year.insert(0, 'Date & Time', date_time_df) df_year.insert(1, 'use', mains_df) # Append all years data to 1 dataframe df_all_years = df_all_years.append(df_year, ignore_index=True, sort=False) # Change index to datetime format df_all_years['Date & Time'] = pd.to_datetime( df_all_years['Date & Time'], utc=True) df_all_years.set_index('Date & Time', inplace=True) df_all_years = df_all_years.tz_convert('US/Eastern') # Append key value pairs to DataStore chan_id = 0 for col in df_all_years.columns: chan_id += 1 print(chan_id, end=' ') stdout.flush() key = Key(building=b_cnt, meter=chan_id) chan_df = pd.DataFrame(df_all_years[col]) chan_df.columns = pd.MultiIndex.from_tuples([('power', 'active')]) chan_df.columns.set_names(LEVEL_NAMES, inplace=True) store.put(str(key), chan_df) print()
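# The reduce(...) call above is just a left-to-right join of the per-meter
# frames for one year. Equivalent toy example (pandas only):
from functools import reduce
import pandas as pd
frames = [pd.DataFrame({'a': [1]}), pd.DataFrame({'b': [2]}), pd.DataFrame({'c': [3]})]
joined = reduce(lambda left, right: left.join(right, lsuffix='_1', rsuffix='_2'), frames)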
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print directory_list # Traversing every folder for folder in directory_list: print 'Computing for folder',folder #Building number and meter_flag building_no = int(folder[:2]) meter_flag = 'sm' if 'sm_csv' in folder else 'plugs' dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))] dir_list.sort() print 'Current dir list:',dir_list for fl in dir_list: #Meter number to be used in key meter_num = 1 if meter_flag == 'sm' else int(fl) + 1 print 'Computing for Meter no.',meter_num fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i] fl_dir_list.sort() key = Key(building=building_no, meter=meter_num) for fi in fl_dir_list: #Getting dataframe for each csv file seperately df_fl = _get_df(join(dataset_loc,folder,fl),fi,meter_flag) df_fl.sort_index(ascending=True,inplace=True) df_fl = df_fl.tz_convert(timezone) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(str(key), df_fl, format='Table') else: store.append(str(key), df_fl, format='Table') store.flush() print 'Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4] print "Data storage completed." store.close() # Adding the metadata to the HDF5file print "Proceeding to Metadata conversion..." meta_path = join(_get_module_directory(), 'metadata') convert_yaml_to_hdf5(meta_path, hdf_filename) print "Completed Metadata conversion."
def convert_eco(dataset_loc, hdf_filename, timezone): """ Parameters: ----------- dataset_loc: str The root directory where the dataset is located. hdf_filename: str The location where the hdf_filename is present. The directory location has to contain the hdf5file name for the converter to work. timezone: str specifies the timezone of the dataset. """ # Creating a new HDF File store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc') check_directory_exists(dataset_loc) directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i] directory_list.sort() print(directory_list) # Traversing every folder for folder in directory_list: if folder[0] == '.' or folder[-3:] == '.h5': print('Skipping ', folder) continue print('Computing for folder', folder) #Building number and meter_flag building_no = int(folder[:2]) meter_flag = 'sm' if 'sm_csv' in folder else 'plugs' dir_list = [ i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc, folder, i)) ] dir_list.sort() print('Current dir list:', dir_list) for fl in dir_list: print('Computing for folder ', fl) fl_dir_list = [ i for i in listdir(join(dataset_loc, folder, fl)) if '.csv' in i ] fl_dir_list.sort() if meter_flag == 'sm': for fi in fl_dir_list: df = pd.read_csv(join(dataset_loc, folder, fl, fi), names=[i for i in range(1, 17)], dtype=np.float32) for phase in range(1, 4): key = str(Key(building=building_no, meter=phase)) df_phase = df.ix[:, [ 1 + phase, 5 + phase, 8 + phase, 13 + phase ]] # get reactive power power = df_phase.as_matrix([1 + phase, 13 + phase]) reactive = power[:, 0] * np.tan( power[:, 1] * np.pi / 180) df_phase['Q'] = reactive df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df_phase = df_phase.tz_convert(timezone) sm_column_name = { 1 + phase: ('power', 'active'), 5 + phase: ('current', ''), 8 + phase: ('voltage', ''), 13 + phase: ('phase_angle', ''), 'Q': ('power', 'reactive'), } df_phase.rename(columns=sm_column_name, inplace=True) tmp_before = np.size(df_phase.power.active) df_phase = df_phase[df_phase.power.active != -1] tmp_after = np.size(df_phase.power.active) if (tmp_before != tmp_after): print( 'Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) df_phase.columns.set_names(LEVEL_NAMES, inplace=True) if not key in store: store.put(key, df_phase, format='Table') else: store.append(key, df_phase, format='Table') store.flush() print('Building', building_no, ', Meter no.', phase, '=> Done for ', fi[:-4]) else: #Meter number to be used in key meter_num = int(fl) + 3 key = str(Key(building=building_no, meter=meter_num)) #Getting dataframe for each csv file seperately for fi in fl_dir_list: df = pd.read_csv(join(dataset_loc, folder, fl, fi), names=[1], dtype=np.float64) df.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT') df.rename(columns=plugs_column_name, inplace=True) df = df.tz_convert(timezone) df.columns.set_names(LEVEL_NAMES, inplace=True) tmp_before = np.size(df.power.active) df = df[df.power.active != -1] tmp_after = np.size(df.power.active) if (tmp_before != tmp_after): print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after)) # If table not present in hdf5, create or else append to existing data if not key in store: store.put(key, df, format='Table') print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4]) else: store.append(key, df, format='Table') store.flush() print('Building', building_no, ', 
Meter no.', meter_num, '=> Done for ', fi[:-4]) print("Data storage completed.") store.close() # Adding the metadata to the HDF5file print("Proceeding to Metadata conversion...") meta_path = join(_get_module_directory(), 'metadata') convert_yaml_to_hdf5(meta_path, hdf_filename) print("Completed Metadata conversion.")
def _convert(input_path, store, measurement_mapping_func, tz, valid_home_id, sort_index=True, drop_duplicates=False): """ Parameters ---------- input_path : str The root path of the ideal low_freq dataset. store : DataStore The NILMTK DataStore object. measurement_mapping_func : function Must take these parameters: - house_id - chan_id Function should return a list of tuples e.g. [('power', 'active')] tz : str Timezone e.g. 'US/Eastern' sort_index : bool Defaults to True drop_duplicates : bool Remove entries with duplicated timestamp (keeps the first value) Defaults to False for backwards compatibility. """ check_directory_exists(input_path) #each file containg mains/appliance is linked with a house and sensor id filename, houses, sensor, category = _find_all_houses_sensor( input_path, valid_home_id) assert (len(houses) == len(sensor)) for id in range(len(houses)): if (category[id] == 'electric-appliance'): stdout.flush() key = Key(building=houses[id], meter=int(sensor[id])) csv_filename = join(input_path, filename[id]) measurements = measurement_mapping_func(houses[id], sensor[id], category[id]) df = _load_csv(csv_filename, measurements, tz, sort_index=sort_index, drop_duplicates=drop_duplicates) store.put(str(key), df) elif (category[id] == 'electric-mains'): combined_meters = sensor[id].split('c') stdout.flush() key = Key(building=houses[id], meter=int(combined_meters[0])) csv_filename = join(input_path, filename[id]) measurements = measurement_mapping_func(houses[id], sensor[id], category[id]) df = _load_csv(csv_filename, measurements, tz, sort_index=sort_index, drop_duplicates=drop_duplicates) store.put(str(key), df) print("Instance number:" + str(id)) print("Loading for home id:" + str(houses[id]) + "and sensor id:" + sensor[id] + "........")
def convert_sortd(input_path, output_filename, format='HDF'):
    """Converts the dataset to NILMTK HDF5 format.

    For more information about the SOR test dataset, contact Samuel Marisa.

    Parameters
    ----------
    input_path : str
        The root path of the dataset. It is assumed that the YAML
        metadata is in 'input_path/metadata'.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        Format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'.

    Example usage:
    --------------
    convert_sortd('/sortd', 'store.h5')
    """
    print('Attempting to convert the SORTD dataset at %s into %s in NILMTK %s format...'
          % (input_path, output_filename, format))
    # Ensure that the input directory exists
    check_directory_exists(input_path)
    # Load the dataset metadata
    with open(join(input_path, 'metadata/dataset.yaml'), 'r') as stream:
        dataset_metadata = yaml.safe_load(stream)
    # Open the datastore
    store = get_datastore(output_filename, format, mode='w')
    # Iterate through all building metadata files found in the dataset
    for metadata_file in glob.glob(join(input_path, 'metadata/building[0-9]*.yaml')):
        # Load the building metadata
        with open(metadata_file, 'r') as stream:
            metadata = yaml.safe_load(stream)
        building_id = int(metadata['instance'])
        print('==> Loading building %d defined at %s. Please wait...'
              % (building_id, metadata_file))
        for meter_id, meter_data in metadata['elec_meters'].items():
            meter_id = int(meter_id)
            key = Key(building=building_id, meter=meter_id)
            # Load the raw data from the data location
            print(' - Loading meter %s from %s...' % (meter_id, meter_data['data_location']))
            columns = [('power', 'active')]
            df = pd.read_csv(join(input_path, meter_data['data_location']),
                             sep=',', names=columns,
                             dtype={m: np.float32 for m in columns})
            # Convert the timestamp index column to timezone-aware datetime
            df.index = pd.to_datetime(df.index.values, unit='s', utc=True)
            df = df.tz_convert(dataset_metadata['timezone'])
            #df = pd.read_csv(join(input_path, db_file), sep=';', names=('Datetime', 'P1', 'P2', 'P3'), dtype={'P1': np.float64, 'P2': np.float64, 'P3': np.float64}, parse_dates=[1])
            print(df.info())
            print(df.head())
            #print(df.tail())
            print(" - Storing data under key %s in the datastore..." % (str(key)))
            store.put(str(key), df)
        print(" - Building %s loaded!" % (building_id))
    print("Adding the metadata into the store...")
    save_yaml_to_datastore(join(input_path, 'metadata'), store)
    print("Closing the store...")
    store.close()
    print("Done converting SORTD dataset to HDF5!")
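# Example follow-up sketch (assumes nilmtk is installed and that the converter above was
# run with the docstring example arguments; the path is otherwise an assumption): the
# resulting store can be opened with nilmtk's DataSet class to check that buildings and
# meters were registered.
def _inspect_sortd_store(store_path='store.h5'):
    from nilmtk import DataSet
    sortd = DataSet(store_path)
    print(sortd.buildings)
    sortd.store.close()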
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The path of the output HDF5 file. It must include the HDF5 file name
        for the converter to work.
    timezone: str
        Specifies the timezone of the dataset.
    """
    # Creating a new HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')

    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    found_any_sm = False
    found_any_plug = False

    # Traversing every folder
    for folder in directory_list:
        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue

        # Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = None
        if 'sm_csv' in folder:
            meter_flag = 'sm'
        elif 'plugs' in folder:
            meter_flag = 'plugs'
        else:
            print('Skipping folder', folder)
            continue

        print('Computing for folder', folder)

        dir_list = [i for i in listdir(join(dataset_loc, folder))
                    if isdir(join(dataset_loc, folder, i))]
        dir_list.sort()

        if meter_flag == 'plugs' and len(dir_list) < 3:
            # Try harder to find the subfolders
            folder = join(folder, folder[:2])
            dir_list = [i for i in listdir(join(dataset_loc, folder))
                        if isdir(join(dataset_loc, folder, i))]

        print('Current dir list:', dir_list)

        for fl in dir_list:
            print('Computing for folder ', fl)
            fl_dir_list = [i for i in listdir(join(dataset_loc, folder, fl)) if '.csv' in i]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    found_any_sm = True
                    df = pd.read_csv(join(dataset_loc, folder, fl, fi),
                                     names=[i for i in range(1, 17)],
                                     dtype=np.float32)

                    # Smart meter: one key per phase
                    for phase in range(1, 4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.loc[:, [1 + phase, 5 + phase, 8 + phase, 13 + phase]]

                        # get reactive power
                        power = df_phase.loc[:, [1 + phase, 13 + phase]].values
                        reactive = power[:, 0] * np.tan(power[:, 1] * np.pi / 180)
                        df_phase['Q'] = reactive

                        df_phase.index = pd.date_range(start=fi[:-4], freq='s',
                                                       periods=86400, tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)

                        sm_column_name = {
                            1 + phase: ('power', 'active'),
                            5 + phase: ('current', ''),
                            8 + phase: ('voltage', ''),
                            13 + phase: ('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.columns = pd.MultiIndex.from_tuples(
                            [sm_column_name[col] for col in df_phase.columns])

                        power_active = df_phase['power', 'active']
                        tmp_before = np.size(power_active)
                        df_phase = df_phase[power_active != -1]
                        power_active = df_phase['power', 'active']
                        tmp_after = np.size(power_active)
                        if tmp_before != tmp_after:
                            print('Removed missing measurements - Size before: '
                                  + str(tmp_before) + ', size after: ' + str(tmp_after))

                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if key not in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])
            # Plugs are also mapped to meters, but are then directly associated with appliances
            else:
                # Meter number to be used in key
                meter_num = int(fl) + 3
                key = str(Key(building=building_no, meter=meter_num))
                current_folder = join(dataset_loc, folder, fl)
                if not fl_dir_list:
                    raise RuntimeError("No CSV file found in " + current_folder)

                # Getting dataframe for each csv file separately
                for fi in fl_dir_list:
                    found_any_plug = True
                    df = pd.read_csv(join(current_folder, fi), names=[1], dtype=np.float64)
                    df.index = pd.date_range(start=fi[:-4].replace('.', ':'), freq='s',
                                             periods=86400, tz='GMT')
                    df.columns = pd.MultiIndex.from_tuples(list(plugs_column_name.values()))
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    # Check whether measurements were removed
                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if tmp_before != tmp_after:
                        print('Removed missing measurements - Size before: '
                              + str(tmp_before) + ', size after: ' + str(tmp_after))

                    # If table not present in hdf5, create or else append to existing data
                    if key not in store:
                        store.put(key, df, format='Table')
                        print('Building', building_no, ', Meter no.', meter_num,
                              '=> Done for ', fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building', building_no, ', Meter no.', meter_num,
                              '=> Done for ', fi[:-4])

    if not found_any_plug or not found_any_sm:
        raise RuntimeError(
            'The files were not found! Please check the folder structure. '
            'Extract each ZIP file into a folder with its base name '
            '(e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)')

    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5 file
    print("Proceeding to Metadata conversion...")
    meta_path = join(get_module_directory(), 'dataset_converters', 'eco', 'metadata')
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")
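# Example usage sketch: the ECO ZIP archives are assumed to be extracted as described in
# the RuntimeError message above (e.g. "01_plugs_csv/", "01_sm_csv/", ...). The paths and
# the timezone string are illustrative assumptions, not values taken from the original
# source.
def _example_convert_eco():
    convert_eco('/data/eco', '/data/eco.h5', 'CET')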
def convert_gjw(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix), will default if not specified

    directory and file structure:
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files
    """
    if gjw_path is None:
        gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')

    # Open data store
    print('opening datastore', output_filename)
    store = get_datastore(output_filename, 'HDF', mode='w')

    # Walk the directory tree from the dataset home directory.
    # Start with an empty dataframe that only carries the column headers.
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, _, files in os.walk(gjw_path):  # the list of subdirectories is unused
        if current_dir.find('.git') != -1 or current_dir.find('.ipynb') != -1:
            # print('Skipping ', current_dir)
            continue
        print('checking', current_dir)
        m = bld_re.search(current_dir)
        if m:
            # The csv files may be further down the tree so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
            key = Key(building=building_nbr, meter=meter_nbr)
            for items in fnmatch.filter(files, "4*.csv"):
                # process any .CSV files found
                found = True
                ds = iso_date_re.search(items).group()
                # print('found files for date:', ds, end=" ")
                # found files to process
                df1 = _read_file_pair(current_dir, ds)  # read two csv files into a dataframe
                df = pd.concat([df, df1])  # concatenate the results into one long dataframe
            if found:
                found = False
                df = _prepare_data_for_toolkit(df)
                _summarise_dataframe(df, 'Prepared for tool kit')
                store.put(str(key), df)
                # clear dataframe & add column headers
                # df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
                break  # only 1 folder with .csv files at present
    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done converting gjw to HDF5!")
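# Example usage sketch: convert a GJW dump laid out as in the docstring above. Passing
# None for both arguments would fall back to the module-level home_dir defaults; the
# explicit paths shown here are illustrative assumptions.
def _example_convert_gjw():
    convert_gjw('/data/nilm_gjw_data',
                '/data/nilm_gjw_data/HDF5/nilm_gjw_data.hdf5')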