Example #1
def _convert_one_sec_data(ukdale_path, store, ac_type_map):
    ids_of_one_sec_data = [
        identifier for identifier, ac_types in ac_type_map.items()
        if ac_types == ['active', 'apparent']
    ]

    if not ids_of_one_sec_data:
        return

    for identifier in ids_of_one_sec_data:
        key = Key(building=identifier[0], meter=identifier[1])
        print("Loading 1-second data for", key, "...")
        house_path = 'house_{:d}'.format(key.building)
        filename = join(ukdale_path, house_path, 'mains.dat')
        df = _load_csv(filename, ONE_SEC_COLUMNS, TZ)
        store.put(str(key), df)

        # Set 'disabled' metadata attributes
        # TODO: needs to use `nilmtk.DataStore` API rather than grabbing
        # the `pd.HDFStore` directly.
        group = store.store._handle.get_node('/building{:d}'.format(
            key.building))
        metadata = group._f_getattr('metadata')
        metadata['elec_meters'][key.meter]['disabled'] = True
        group._f_setattr('metadata', metadata)
        store.store.flush()

    store.close()
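A quick note on the Key objects used throughout these examples: str(key) is what decides where each frame lands in the datastore. A minimal sketch, assuming nilmtk is installed and that Key renders to the usual '/buildingN/elec/meterM' path:

from nilmtk.datastore import Key

key = Key(building=1, meter=2)
print(str(key))  # expected to print something like '/building1/elec/meter2'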
Example #2
def _convert(input_path,
             data_store,
             measurement_mapping_func,
             sort_index=True,
             drop_duplicates=False):
    meter_to_machine = {
        1: "MainTerminal",
        2: "ChipPress",
        3: "ChipSaw",
        4: "HighTemperatureOven",
        5: "PickAndPlaceUnit",
        6: "ScreenPrinter",
        7: "SolderingOven",
        8: "VacuumOven",
        9: "VacuumPump1",
        10: "VacuumPump2",
        11: "WashingMachine",
    }

    check_directory_exists(input_path)

    print("Loading factory 1...", end="... ")
    chans = _find_all_channels(input_path, meter_to_machine)
    for chan_id, filename in chans.items():
        print(chan_id, end=" ")
        stdout.flush()
        key = Key(building=1, meter=chan_id)
        measurements = measurement_mapping_func(chan_id)
        df = _load_csv(filename,
                       measurements,
                       sort_index=sort_index,
                       drop_duplicates=drop_duplicates)

        data_store.put(str(key), df)
    print()
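measurement_mapping_func is passed in but never defined in these excerpts; a hypothetical stand-in matching the shape described in the later docstrings might look like this:

def example_measurement_mapping_func(chan_id):
    # Hypothetical mapping: treat channel 1 as the site meter measuring
    # apparent power and every other channel as a sub-meter measuring
    # active power. Real converters encode dataset-specific knowledge here.
    return [('power', 'apparent')] if chan_id == 1 else [('power', 'active')]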
Example #3
def convert_ampds(input_path, output_filename, format='HDF'):
    """
    Convert AMPds R2013 as seen on Dataverse. Download the files
    as CSVs and put them in the `input_path` folder for conversion.
    
    Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO
    
    Parameters: 
    -----------
    input_path: str
            The path of the directory where all the csv 
            files are supposed to be stored
    output_filename: str
            The path of the h5 file where all the 
            standardized data is supposed to go. The path 
            should refer to a particular file and not just a
             random directory in order for this to work.
    format: str
        Defaults to HDF5
    Example usage:
    --------------
    convert('/AMPds/electricity', 'store.h5')    

    """
    check_directory_exists(input_path)
    files = [
        f for f in listdir(input_path)
        if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f
    ]
    # Sorting Lexicographically
    files.sort()

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format, mode='w')
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...')
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME],
                                  unit='s',
                                  utc=True)
        df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
        df = df.tz_convert(TIMEZONE)
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))

    store.close()
    metadata_path = join(_get_module_directory(), 'metadata')
    print('Processing metadata...')
    convert_yaml_to_hdf5(metadata_path, output_filename)
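Every converter here ends up assigning a two-level column index named by LEVEL_NAMES. A standalone sketch of that convention, assuming LEVEL_NAMES is nilmtk's usual ['physical_quantity', 'type'] pair:

import pandas as pd

LEVEL_NAMES = ['physical_quantity', 'type']  # assumed value of nilmtk's LEVEL_NAMES

df = pd.DataFrame({'active': [100.0, 101.5], 'apparent': [110.0, 112.0]})
df.columns = pd.MultiIndex.from_tuples(
    [('power', 'active'), ('power', 'apparent')], names=LEVEL_NAMES)
print(df.columns.names)  # ['physical_quantity', 'type']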
Example #4
def _convert(input_path, store, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REFIT dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str 
        Timezone e.g. 'US/Eastern'
    sort_index : bool
    """

    check_directory_exists(input_path)

    # Iterate though all houses and channels
    # house 14 is missing!
    houses = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21
    ]
    nilmtk_house_id = 0
    for house_id in houses:
        nilmtk_house_id += 1
        print("Loading house", house_id, end="... ")
        stdout.flush()
        csv_filename = join(input_path, 'House_' + str(house_id) + '.csv')
        # The clean version already includes header, so we
        # just skip the text version of the timestamp
        usecols = [
            'Unix', 'Aggregate', 'Appliance1', 'Appliance2', 'Appliance3',
            'Appliance4', 'Appliance5', 'Appliance6', 'Appliance7',
            'Appliance8', 'Appliance9'
        ]

        df = _load_csv(csv_filename, usecols, tz)
        if sort_index:
            df = df.sort_index()  # might not be sorted...
        chan_id = 0
        for col in df.columns:
            chan_id += 1
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=nilmtk_house_id, meter=chan_id)

            chan_df = pd.DataFrame(df[col])
            chan_df.columns = pd.MultiIndex.from_tuples([('power', 'active')])

            # Modify the column labels to reflect the power measurements recorded.
            chan_df.columns.set_names(LEVEL_NAMES, inplace=True)

            store.put(str(key), chan_df)
        print('')
Example #5
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    any_file_converted = False
    
    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" %attribute)
                    if not os.path.isfile(filename_attribute):
                        # File not found directly in the combed_path provided
                        # Try adding 'iiitd' to it
                        filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" %attribute)
                    
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute, names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", axis=1)
                        dfs.append(df)
                    else:
                        exists = False
                        
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns])
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
                    any_file_converted = True
                    
    if not any_file_converted:
        raise RuntimeError('No files converted, did you specify the correct path?')
                    
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting COMBED to HDF5!")
Example #6
def _process_meter_in_chunk(nilmtk_house_id, meter_id, chunk, store,
                            appliance_code):
    data = chunk['data'].values
    index = chunk['datetime']
    df = pd.DataFrame(data=data, index=index)
    df.columns = pd.MultiIndex.from_tuples([('power', 'active')])

    # Modify the column labels to reflect the power measurements recorded.
    df.columns.set_names(LEVEL_NAMES, inplace=True)
    df = df.sort_index()

    key = Key(building=nilmtk_house_id, meter=meter_id)
    store.append(str(key), df)
Example #7
def _convert(csv_filename, store, tz, sort_index=True):
    """
    Parameters
    ----------
    csv_filename : str
        The csv_filename that will be loaded. Must end with .csv
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'apparent')]
    tz : str 
        Timezone e.g. 'Europe/Amsterdam'
    sort_index : bool
    """

    # Iterate though all houses and channels
    houses = [1]
    nilmtk_house_id = 0
    for house_id in houses:
        nilmtk_house_id += 1
        print("Loading house", house_id, end="... ")
        stdout.flush()

        usecols = [
            'Timestamp', 'mains', 'television', 'fan', 'fridge',
            'laptop computer', 'electric heating element', 'oven', 'unknown',
            'washing machine', 'microwave', 'toaster', 'sockets', 'cooker'
        ]
        df = _load_csv(csv_filename, usecols, 3, tz)

        if sort_index:
            df = df.sort_index()  # might not be sorted...
        chan_id = 0
        for col in df.columns:
            chan_id += 1
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=nilmtk_house_id, meter=chan_id)

            chan_df = pd.DataFrame(df[col])
            chan_df.columns = pd.MultiIndex.from_tuples([('power', 'apparent')
                                                         ])

            # Modify the column labels to reflect the power measurements recorded.
            chan_df.columns.set_names(LEVEL_NAMES, inplace=True)
            store.put(str(key), chan_df)
        print('')
Example #8
def _convert(input_path,
             store,
             measurement_mapping_func,
             tz,
             sort_index=True,
             drop_duplicates=False):
    """
    Parameters
    ----------
    input_path : str
        The root path of the DEPS dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - classroom_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str 
        Timezone e.g. 'US/Eastern'
    sort_index : bool
        Defaults to True
    drop_duplicates : bool
        Remove entries with duplicated timestamp (keeps the first value)
        Defaults to False for backwards compatibility.
    """

    check_directory_exists(input_path)

    # Iterate though all classrooms and channels
    classrooms = _find_all_classrooms(input_path)
    for classroom_id in classrooms:
        print("Loading data from 'Aula 2.2 Bis' to classroom N°",
              classroom_id,
              end=" ... Loading channels ")
        stdout.flush()
        chans = _find_all_chans(input_path, classroom_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=classroom_id, meter=chan_id)
            measurements = measurement_mapping_func(classroom_id, chan_id)
            csv_filename = _get_csv_filename(input_path, key)
            df = _load_csv(csv_filename,
                           measurements,
                           tz,
                           sort_index=sort_index,
                           drop_duplicates=drop_duplicates)
            store.put(str(key), df)
        print()
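_load_csv is another dataset-specific helper that is not shown in these excerpts. A hypothetical stand-in, assuming a headerless CSV whose first column is a UNIX timestamp:

import pandas as pd

def _load_csv_sketch(filename, columns, tz, sort_index=True, drop_duplicates=False):
    # Hypothetical sketch only: read the file, label the measurement columns
    # and localise the timestamp index; the real helpers differ per dataset.
    df = pd.read_csv(filename, header=None)
    df.index = pd.to_datetime(df.pop(0), unit='s', utc=True).dt.tz_convert(tz)
    df.columns = pd.MultiIndex.from_tuples(columns,
                                           names=['physical_quantity', 'type'])
    if drop_duplicates:
        df = df[~df.index.duplicated(keep='first')]
    if sort_index:
        df = df.sort_index()
    return df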
Example #9
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name,
                                              load_name, load_mapping_path,
                                              "%s.csv" % attribute)
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute,
                                         header=0,
                                         names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", axis=1)
                        dfs.append(df)
                    else:
                        exists = False
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.rename(columns=lambda x: column_mapping[x],
                                 inplace=True)
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting COMBED to HDF5!")
Example #10
def convert_greend(greend_path, hdf_filename):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """


    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    houses = sorted(__get_houses(greend_path))
    print(houses)
    h = 1
    for house in houses:
        print('loading '+house+"'s house...")
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        house_data = pd.DataFrame()
        for date in dates:
            print('-----------------------',date)
            tmp_pandas = pd.read_csv(join(abs_house, date), index_col=0)
            tmp_pandas = tmp_pandas[tmp_pandas.index != 'timestamp']
            tmp_pandas = tmp_pandas.sort_index()
            tmp_pandas.index = [__timestamp(t) for t in tmp_pandas.index]
            house_data = pd.concat([house_data, tmp_pandas])

            #for testing metadata files:
            #break
        m = 1 


        for meter in house_data:
            print("meter" + str(m)+': ')
            key = Key(building = h, meter=m)
            print("Putting into store...")
            store.put(str(key), house_data[meter], format = 'table')
            m += 1
            print('Flushing store...')
            store.flush()
        h += 1

    store.close()

    #needs to be edited
    convert_yaml_to_hdf5('/path/to/metadata', hdf_filename)
Example #11
def _convert_one_sec_data(ukdale_path, store, ac_type_map, drop_duplicates):
    ids_of_one_sec_data = [
        identifier for identifier, ac_types in iteritems(ac_type_map)
        if ac_types == ['active', 'apparent']]

    if not ids_of_one_sec_data:
        return

    for identifier in ids_of_one_sec_data:
        key = Key(building=identifier[0], meter=identifier[1])
        print("Loading 1-second data for", key, "...")
        house_path = 'house_{:d}'.format(key.building)
        filename = join(ukdale_path, house_path, 'mains.dat')
        df = _load_csv(filename, ONE_SEC_COLUMNS, TZ, drop_duplicates=drop_duplicates)
        store.put(str(key), df)

    store.close()
Example #12
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)
    idx = pd.date_range(start=START_DATETIME, end=END_DATETIME, freq=FREQ)
    idx = idx.tz_localize('GMT').tz_convert(TIMEZONE)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 12):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename, dtype=np.float64, na_values='\\N')
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
        df.columns = pd.MultiIndex.from_tuples(
            [column_mapping[x] for x in df.columns], names=LEVEL_NAMES)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        df = df.resample("1T").mean()
        df = reindex_fill_na(df, idx)
        assert df.isnull().sum().sum() == 0
        store.put(str(key), df)
    store.close()

    metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe',
                        'metadata')
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    print("Done converting iAWE to HDF5!")
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('inpath', help='input directory (ANTgen output)', nargs='?', default='../output')
    parser.add_argument('outfile', help='output file (HDF5 file)', nargs='?', default='../output/ANTgen.h5')
    args = parser.parse_args()

    if not os.path.exists('metadata') or not os.path.isfile('metadata/building1.yaml'):
        print("No metadata found. Please run 'generate_metadata.py' before using this tool...")
        exit(1)

    print("Converting ANTgen output from '{}' to file '{}'".format(args.inpath, args.outfile))

    with open('metadata/building1.yaml', 'r') as f:
        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)

    channel_list = ['total']  # pre-populate with aggregate data (total.csv)
    for app in yaml_dict['appliances']:
        channel_list.append(app['original_name'])

    store = get_datastore(args.outfile, 'HDF', mode='w')

    for i, app_name in enumerate(channel_list):
        print("Adding virtual meter ID {:02d}: {}".format(1+i, app_name))
        key = Key(building=1, meter=(i + 1))

        csvfile = os.path.join(args.inpath, str(app_name)+'.csv')
        try:
            df = pd.read_csv(csvfile, sep=';', encoding='utf-8', index_col=0)
            df.columns = pd.MultiIndex.from_tuples([('power', 'active') for x in df.columns], names=LEVEL_NAMES)
            df.index = pd.to_datetime(df.index)

            tz_naive = df.index
            tz_aware = tz_naive.tz_localize(tz='Europe/Vienna', ambiguous=True, nonexistent=pd.Timedelta('1H'))
            df.index = tz_aware

            df = df.tz_convert('Europe/Vienna')

            store.put(str(key), df)
        except FileNotFoundError:
            print("Input file '{}' not found - your HDF5 file will be incomplete!".format(csvfile))
            continue

    print('Adding metadata...')
    convert_yaml_to_hdf5('metadata/', args.outfile)
Example #14
def convert_caxe(file_path):
    '''
    Parameters
    ------------
    file_path : str
        Name of the input CSV file to be converted, given as a string.
        The CSV columns should contain the following values, in order:
        timestamp, reactive_power, apparent_power, current, frequency,
        voltage, active_power.

    Converts the data into HDF5 format and saves it as test.h5.
    '''
    df = pd.read_csv(f'{file_path}',
                     names=['timestamp', 'R', 'A', 'C', 'F', 'V', 'T'])
    column_mapping = {
        'F': ('frequency', ""),
        'V': ('voltage', ""),
        'T': ('power', 'active'),
        'C': ('current', ''),
        'R': ('power', 'reactive'),
        'A': ('power', 'apparent'),
    }

    output_filename = 'test.h5'

    # Open data store
    store = get_datastore(output_filename, format='HDF', mode='w')
    key = Key(building=1, meter=1)
    print('Loading ', 1)
    df.index = pd.to_datetime(df.timestamp.values)
    df = df.tz_convert(
        TIMEZONE)  #  if error occurs use tz_localize for tz naive timestamps
    df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
    df.index = pd.to_datetime(df.index.values)
    df.columns = pd.MultiIndex.from_tuples(
        [column_mapping[x] for x in df.columns], names=LEVEL_NAMES)
    df = df.apply(pd.to_numeric, errors='ignore')
    df = df.dropna()
    df = df.astype(np.float32)
    df = df.sort_index()
    df = df.resample("1T").mean()
    assert df.isnull().sum().sum() == 0
    store.put(str(key), df)
    store.close()
    convert_yaml_to_hdf5('./metadata', output_filename)

    print("Done converting test data to HDF5!")
Example #15
def _convert(input_path, store, tz):
    """
    Parameters
    ----------
    input_path : str
        The root path of the ENERTALK dataset.
    store : DataStore
        The NILMTK DataStore object.
    tz : str 
        Timezone e.g. 'Asia/Seoul'
    """
    house_list = [
        fname for fname in listdir(input_path) if not fname.startswith('.')
    ]

    date_count = 0

    for house in house_list:
        date_list = sorted(listdir(join(input_path, house)))
        date_count += len(date_list)

    with tqdm(total=date_count) as pbar:
        for house in house_list:
            date_list = sorted(listdir(join(input_path, house)))
            for date in date_list:
                fname_list = sorted(listdir(join(input_path, house, date)))

                for fname in fname_list:
                    file_path = join(input_path, house, date, fname)
                    chan_df = _load_parquet(file_path)
                    house_id = int(house) + 1
                    chan_id = int(fname.split('_')[0]) + 1
                    key = Key(building=house_id, meter=chan_id)
                    chan_df.columns = pd.MultiIndex.from_tuples([
                        ('power', 'active'), ('power', 'reactive')
                    ])
                    chan_df.columns.set_names(LEVEL_NAMES, inplace=True)

                    if str(key) in store._keys():
                        store.append(str(key), chan_df)
                    else:
                        store.put(str(key), chan_df)
                pbar.update(1)
Example #16
def convert(inputPath, hdfFilename, metadataPath='/'):
    '''
    Parameters:
    -----------
    inputPath: str
        The path of the directory where all the csv files are stored.
    hdfFilename: str
        The path of the h5 file where all the standardized data should go.
        The path must refer to a particular file, not just a directory.
    metadataPath: str
        The path of the directory where the metadata is stored.
        Defaults to the root directory.
    '''

    # This function contains the bulk of the code. The test() function can simply be ignored for now
    # To do: Complete the metadata set. Then the convert_yaml_to_hdf5() function will stop throwing random errors.
    files = [
        f for f in listdir(inputPath)
        if isfile(join(inputPath, f)) and '.csv' in f and '.swp' not in f
    ]
    print(files)
    assert isdir(inputPath)
    #	print(files)
    store = HDFStore(hdfFilename)
    #	fp=pd.read_csv(join(inputPath, sent))
    for i in range(len(files)):
        sent = files[i]
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), '. Please wait...')
        fp = pd.read_csv(join(inputPath, sent))
        fp.TS = fp.TS.astype('int')
        fp.index = pd.to_datetime((fp.TS.values * 1e9).astype(int))
        #		fp=fp.tz_convert('Asia/Kolkata')
        fp = fp.drop('TS', axis=1)
        fp.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        fp.columns.set_names(LEVEL_NAMES, inplace=True)
        fp = fp.apply(pd.to_numeric, errors='ignore')
        fp = fp.dropna()
        fp = fp.astype(np.float32)
        store.put(str(key), fp, format='Table')
        store.flush()
        print("Done with file #", (i + 1))
    store.close()
Example #17
def convert_combed(combed_path, hdf_filename):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    assert isdir(combed_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    chan = 1
    for building, meter_array in SUBMETER_PATHS.items():
        for meter in meter_array:
            key = Key(building=1, meter=chan)
            dfs = []
            total = pd.DataFrame()
            for attribute in column_mapping.keys():
                filename_attribute = join(combed_path, building, str(meter),
                                          "%s.csv" % attribute)
                print(filename_attribute)
                dfs.append(
                    pd.read_csv(filename_attribute,
                                parse_dates=True,
                                index_col=0,
                                header=0,
                                names=[attribute]))
            total = pd.concat(dfs, axis=1)

            total.rename(columns=lambda x: column_mapping[x], inplace=True)
            total.columns.set_names(LEVEL_NAMES, inplace=True)
            store.put(str(key), total, format='table')
            store.flush()
            chan = chan + 1
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting COMBED to HDF5!")
Example #18
def _convert(connection, dest_file, start_date, end_date, tz, sort_index=True):
    """
    Parameters
    ----------
    connection: Connection
        Connection to the DEDDIAG database
    dest_file : DataStore
        The NILMTK DataStore object
    tz : str
        Timezone e.g. 'Europe/Berlin'
    sort_index : bool
        Defaults to True
    """

    print(f"Loading house {house_nr}", end="... ")
    stdout.flush()

    # Find all houses and channels
    for channel in channels:
        print(f"{channel}", end=" ")
        stdout.flush()

        measurements = MeasurementsExpanded(channel, start_date,
                                            end_date).request(connection)
        measurements.drop(columns='item_id', inplace=True)
        measurements['time'] = pd.to_datetime(measurements['time'],
                                              utc=True,
                                              unit='s')
        measurements.set_index('time', inplace=True)
        # set index und columns as LEVEL_NAMES
        measurements = measurements.tz_convert(tz)
        measurements.columns = pd.MultiIndex.from_arrays(
            measurements_conf,
            names=LEVEL_NAMES)  # measurements_conf = [['power'], ['active']]

        if sort_index:
            measurements.sort_index(inplace=True)

        key = Key(building=house_nr, meter=channel)
        # write data
        dest_file.put(str(key), measurements)
Example #19
def convert_iawe(iawe_path, hdf_filename):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    assert isdir(iawe_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 13):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename)
        df.index = pd.to_datetime((df.timestamp.values * 1E9).astype(int),
                                  utc=True)
        df = df.tz_convert('Asia/Kolkata')
        df = df.drop('timestamp', axis=1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        store.put(str(key), df, format='table')
        store.flush()
    store.close()
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting iAWE to HDF5!")
Example #20
def _convert(input_path, hdf_filename, measurement_mapping_func, tz):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str 
        Timezone e.g. 'US/Eastern'
    """

    assert isdir(input_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    # Iterate though all houses and channels
    houses = _find_all_houses(input_path)
    for house_id in houses:
        print("Loading house", house_id, end="... ")
        stdout.flush()
        chans = _find_all_chans(input_path, house_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=house_id, meter=chan_id)
            measurements = measurement_mapping_func(house_id, chan_id)
            df = _load_chan(input_path, key, measurements, tz)
            store.put(str(key), df, format='table')
            store.flush()
        print()

    store.close()
Example #21
def convert_redd(redd_path, hdf_filename):
    """
    Parameters
    ----------
    redd_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    assert isdir(redd_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    # Iterate though all houses and channels
    houses = _find_all_houses(redd_path)
    for house_id in houses:
        print("Loading house", house_id, end="... ")
        stdout.flush()
        chans = _find_all_chans(redd_path, house_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=house_id, meter=chan_id)
            ac_type = 'apparent' if chan_id <= 2 else 'active'
            df = _load_chan(redd_path, key, [('power', ac_type)])
            store.put(str(key), df, format='table')
            store.flush()
        print()

    store.close()

    # Add metadata
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting REDD to HDF5!")
Example #22
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 13):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename)
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, axis=1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        store.put(str(key), df)
    store.close()
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting iAWE to HDF5!")
Example #23
def _convert(input_path, store, measurement_mapping_func, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REDD low_freq dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str 
        Timezone e.g. 'US/Eastern'
    sort_index : bool
    """

    check_directory_exists(input_path)

    # Iterate though all houses and channels
    houses = _find_all_houses(input_path)
    for house_id in houses:
        print("Loading house", house_id, end="... ")
        stdout.flush()
        chans = _find_all_chans(input_path, house_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=house_id, meter=chan_id)
            measurements = measurement_mapping_func(house_id, chan_id)
            csv_filename = _get_csv_filename(input_path, key)
            df = _load_csv(csv_filename, measurements, tz)

            if sort_index:
                df = df.sort_index()  # raw REDD data isn't always sorted
            store.put(str(key), df)
        print()
Example #24
    timeFiller = np.setdiff1d(np.copy(timeStamps), timeFiltered)
    wattsFiller = np.zeros(timeFiller.shape)

    # combine the on and off data
    timeAll = np.append(timeFiller, timeFiltered)
    wattsAll = np.append(wattsFiller, wattsFiltered)

    # format dataframe data structure and save in nilmtk format
    df = pd.DataFrame({('power', 'apparent'): wattsAll}, dtype=float)
    df.index = pd.to_datetime(timeAll,
                              format='%Y-%m-%d %H:%M:%S',
                              exact=False,
                              utc=True)
    df.columns.set_names(LEVEL_NAMES, inplace=True)
    df = df.tz_convert('US/Eastern')
    key = Key(building=1, meter=instance + 1)
    store.put(str(key), df)

## create the metadata files in accordance with nilmtk guidelines

# building metadata
if not os.path.exists(pJoin(modelDir, 'train')):
    os.makedirs(pJoin(modelDir, 'train'))
f = open(pJoin(modelDir, 'train', 'building1.yaml'), 'w')
f.write('instance: 1\n')
f.write('elec_meters:\n')
for instance, app in enumerate(np.unique(appliances)):
    if instance == 0:
        f.write('  ' + '1: &generic\n')
        f.write('    ' + 'submeter_of: 0\n')
        f.write('    ' + 'device_model: generic\n')
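The hand-written f.write() calls above could also be generated with PyYAML; a rough sketch using the same field names (the &generic YAML anchor would not be reproduced this way, and the output path is shortened for the sketch):

import yaml

building_metadata = {
    'instance': 1,
    'elec_meters': {1: {'submeter_of': 0, 'device_model': 'generic'}},
}
with open('building1.yaml', 'w') as f:
    yaml.safe_dump(building_metadata, f, default_flow_style=False)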
Example #25
def convert_gjw(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix), will default if not specified
    directory and file structure
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    # Open data store
    print('opening datastore', output_filename)
    store = get_datastore(output_filename, 'HDF', mode='w')
    # walk the directory tree from the dataset home directory
    #clear dataframe & add column headers
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, _, files in os.walk(gjw_path):
        # the unused second value from os.walk() is dirs_in_current_dir
        if current_dir.find('.git') != -1 or current_dir.find('.ipynb') != -1:
            #print( 'Skipping ', current_dir)
            continue
        print('checking', current_dir)
        m = bld_re.search(current_dir)
        if m:  #The csv files may be further down the tree so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
            key = Key(building=building_nbr, meter=meter_nbr)
        for items in fnmatch.filter(files, "4*.csv"):
            # process any .CSV files found
            found = True
            ds = iso_date_re.search(items).group()
            # print( 'found files for date:', ds,end=" ")
            # found files to process
            df1 = _read_file_pair(current_dir,
                                  ds)  # read two csv files into a dataframe
            df = pd.concat(
                [df, df1])  # concatenate the results into one long dataframe
        if found:
            found = False
            df = _prepare_data_for_toolkit(df)
            _summarise_dataframe(df, 'Prepared for tool kit')
            store.put(str(key), df)
            #clear dataframe & add column headers
            #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME])
            break  # only 1 folder with .csv files at present
    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done converting gjw to HDF5!")
Example #26
def convert_sortd(input_path, output_filename, format='HDF'):
    """Converts the dataset to NILMTK HDF5 format.

    For more information about the SOR test dataset, contact Samuel Marisa.

    Parameters
    ----------
    input_path : str
        The root path of the dataset.  It is assumed that the YAML
        metadata is in 'input_path/metadata'.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'

    Example usage:
    --------------
    convert('/sortd', 'store.h5')
    """
    print(
        'Attempting to convert the SORTD dataset at %s into %s in NILMTK %s format...'
        % (input_path, output_filename, format))
    # Ensure that the input directory exists
    check_directory_exists(input_path)
    # Load the dataset metadata
    with open(join(input_path, 'metadata/dataset.yaml'), 'r') as stream:
        dataset_metadata = yaml.safe_load(stream)
    # Open the datastore
    store = get_datastore(output_filename, format, mode='w')
    # Iterate through all building metadata files found in the dataset
    for metadata_file in glob.glob(
            join(input_path, 'metadata/building[0-9]*.yaml')):
        # Load the building metadata
        with open(metadata_file, 'r') as stream:
            metadata = yaml.safe_load(stream)
        building_id = int(metadata['instance'])
        print('==> Loading building %d defined at %s. Please wait...' %
              (building_id, metadata_file))
        for meter_id, meter_data in metadata['elec_meters'].items():
            meter_id = int(meter_id)
            key = Key(building=building_id, meter=meter_id)
            # Load the raw data from the data location
            print('  - Loading meter %s from %s...' %
                  (meter_id, meter_data['data_location']))
            columns = [('power', 'active')]
            df = pd.read_csv(join(input_path, meter_data['data_location']),
                             sep=',',
                             names=columns,
                             dtype={m: np.float32
                                    for m in columns})
            # Convert the timestamp index column to timezone-aware datetime
            df.index = pd.to_datetime(df.index.values, unit='s', utc=True)
            df = df.tz_convert(dataset_metadata['timezone'])
            #df = pd.read_csv(join(input_path, db_file), sep=';', names=('Datetime', 'P1', 'P2', 'P3'), dtype={'P1': np.float64, 'P2': np.float64, 'P3': np.float64}, parse_dates=[1])
            print(df.info())
            print(df.head())
            #print(df.tail())
            print("  - Storing data under key %s in the datastore..." %
                  (str(key)))
            store.put(str(key), df)
        print("  - Building %s loaded!" % (building_id))
    print("Adding the metadata into the store...")
    save_yaml_to_datastore(join(input_path, 'metadata'), store)
    print("Closing the store...")
    store.close()
    print("Done converting SORTD dataset to HDF5!")
Example #27
def _dataport_dataframe_to_hdf(dataport_dataframe,
                               store,
                               nilmtk_building_id,
                               dataport_building_id):
    local_dataframe = dataport_dataframe.copy()
    
    # remove timezone information to avoid append errors
    local_dataframe['localminute'] = pd.DatetimeIndex([i.replace(tzinfo=None) 
                                                       for i in local_dataframe['localminute']])
    
    # set timestamp as frame index
    local_dataframe = local_dataframe.set_index('localminute')
    
    # set timezone
    local_dataframe = local_dataframe.tz_localize('US/Central')
    
    # remove timestamp column from dataframe
    feeds_dataframe = local_dataframe.drop('dataid', axis=1)

    # Column names for dataframe
    column_names = [('power', 'active')]
    
    # convert from kW to W
    feeds_dataframe = feeds_dataframe.mul(1000)
    
    # building metadata
    building_metadata = {}
    building_metadata['instance'] = nilmtk_building_id
    building_metadata['original_name'] = int(dataport_building_id) # use python int
    building_metadata['elec_meters'] = {}
    building_metadata['appliances'] = []
    
    # initialise dict of instances of each appliance type
    instance_counter = {}
    
    meter_id = 1
    for column in feeds_dataframe.columns:
        if feeds_dataframe[column].notnull().sum() > 0 and not column in feed_ignore:

            # convert timeseries into dataframe
            feed_dataframe = pd.DataFrame(feeds_dataframe[column])
            
            # set column names
            feed_dataframe.columns = pd.MultiIndex.from_tuples(column_names)
            
            # Modify the column labels to reflect the power measurements recorded.
            feed_dataframe.columns.set_names(LEVEL_NAMES, inplace=True)
            
            key = Key(building=nilmtk_building_id, meter=meter_id)
            
            # store dataframe
            store.put(str(key), feed_dataframe, format='table', append=True)
            store.flush()
                        
            # elec_meter metadata
            if column == 'use':
                meter_metadata = {'device_model': 'eGauge',
                                  'site_meter': True}
            else:
                meter_metadata = {'device_model': 'eGauge',
                                   'submeter_of': 0}
            building_metadata['elec_meters'][meter_id] = meter_metadata
                
            # appliance metadata
            if column != 'use':
                # original name and meter id
                appliance_metadata = {'original_name': column, 
                                      'meters': [meter_id] }
                # appliance type and room if available
                appliance_metadata.update(feed_mapping[column])
                # appliance instance number
                if instance_counter.get(appliance_metadata['type']) == None:
                    instance_counter[appliance_metadata['type']] = 0
                instance_counter[appliance_metadata['type']] += 1 
                appliance_metadata['instance'] = instance_counter[appliance_metadata['type']]
                
                building_metadata['appliances'].append(appliance_metadata)

            meter_id += 1
            
    # write building yaml to file
    building = 'building{:d}'.format(nilmtk_building_id)
    yaml_full_filename = join(_get_module_directory(), 'metadata', building + '.yaml')
    with open(yaml_full_filename, 'w') as outfile:
        outfile.write(yaml.dump(building_metadata))
        
    return 0
Example #28
def _dataport_dataframe_to_hdf(dataport_dataframe, store, nilmtk_building_id,
                               dataport_building_id):
    local_dataframe = dataport_dataframe.copy()

    # WK: using the local time zone seems problematic (NonExistentTimeError and AmbiguousTimeError), so let's stick to UTC
    # WK: After thorough examination, I GUESS that dataport will return UTC time if possible. When there is ambiguous
    #     dst time, the returned data would be tz=None. In this case, we should manually convert the 'localminute'
    #     column back to UTC time by handling the ambiguity.
    if pd.DatetimeIndex(local_dataframe['localminute']).tzinfo != pytz.UTC:
        print('NOT UTC encountered, localminute dtype is: %s' %
              local_dataframe['localminute'].dtype)
        local_dataframe['localminute'] = pd.DatetimeIndex(
            local_dataframe['localminute']).tz_localize('UTC')

    # WK: the following line is commented by WK
    # remove timezone information to avoid append errors
    # local_dataframe['localminute'] = pd.DatetimeIndex([i.replace(tzinfo=None)
    #                                                    for i in local_dataframe['localminute']])

    # set timestamp as frame index
    local_dataframe = local_dataframe.set_index('localminute')

    # WK: the following line is commented by WK
    # set timezone
    # local_dataframe = local_dataframe.tz_localize('US/Central', ambiguous='infer')

    # remove timestamp column from dataframe
    feeds_dataframe = local_dataframe.drop('dataid', axis=1)

    # Column names for dataframe
    column_names = [('power', 'active')]

    # convert from kW to W
    feeds_dataframe = feeds_dataframe.mul(1000)

    # building metadata
    building_metadata = {}
    building_metadata['instance'] = nilmtk_building_id
    building_metadata['original_name'] = int(
        dataport_building_id)  # use python int
    building_metadata['elec_meters'] = {}
    building_metadata['appliances'] = []

    # initialise dict of instances of each appliance type
    instance_counter = {}

    meter_id = 1
    for column in feeds_dataframe.columns:
        if (feeds_dataframe[column].notnull().sum() > 0
                and column not in feed_ignore):

            # convert timeseries into dataframe
            feed_dataframe = pd.DataFrame(feeds_dataframe[column])

            # set column names
            feed_dataframe.columns = pd.MultiIndex.from_tuples(column_names)

            # Modify the column labels to reflect the power measurements recorded.
            feed_dataframe.columns.set_names(LEVEL_NAMES, inplace=True)

            key = Key(building=nilmtk_building_id, meter=meter_id)

            if feed_dataframe.index.tzinfo != pytz.UTC:
                print('NOT UTC timezone!!! double check!')
            # store dataframe
            store.put(str(key), feed_dataframe, format='table',
                      append=True)  # WK: before store, should check tzinfo!!
            store.flush()

            # elec_meter metadata
            if column == 'use':
                meter_metadata = {'device_model': 'eGauge', 'site_meter': True}
            else:
                meter_metadata = {'device_model': 'eGauge', 'submeter_of': 0}
            building_metadata['elec_meters'][meter_id] = meter_metadata

            # appliance metadata
            if column != 'use':
                # original name and meter id
                appliance_metadata = {
                    'original_name': column,
                    'meters': [meter_id]
                }
                # appliance type and room if available
                appliance_metadata.update(feed_mapping[column])
                # appliance instance number
                if instance_counter.get(appliance_metadata['type']) == None:
                    instance_counter[appliance_metadata['type']] = 0
                instance_counter[appliance_metadata['type']] += 1
                appliance_metadata['instance'] = instance_counter[
                    appliance_metadata['type']]

                building_metadata['appliances'].append(appliance_metadata)

            meter_id += 1

    # write building yaml to file
    building = 'building{:d}'.format(nilmtk_building_id)
    yaml_full_filename = join(_get_module_directory(), 'metadata',
                              building + '.yaml')
    with open(yaml_full_filename, 'w') as outfile:
        outfile.write(yaml.dump(building_metadata))

    return 0
Example #29
def convert_greend(greend_path, hdf_filename):
    """
    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    houses = sorted(__get_houses(greend_path))
    print(houses)
    h = 1  # nilmtk counts buildings from 1 not from 0 as we do, so everything is shifted by 1
    for house in houses:
        print('loading ' + house)
        abs_house = join(greend_path, house)
        dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
        house_data = []
        for date in dates:
            print('-----------------------', date)
            try:
                tmp_pandas = pd.read_csv(join(abs_house, date),
                                         na_values=['na'],
                                         error_bad_lines=False)
            except (pd.errors.ParserError, ValueError):
                # Malformed files (irregular column number) are skipped.
                # For building0 either remove the first days (with less nodes) or use __preprocess_file
                #import StringIO as sio
                #tmp_pandas = pd.DataFrame.from_csv(sio.StringIO(__preprocess_file(abs_house, date)))
                continue

            # if the timestamp is not correctly parsed then it's an object dtype (string), else a float64
            if tmp_pandas.timestamp.dtype != np.float64:
                tmp_pandas = tmp_pandas[tmp_pandas.timestamp !=
                                        'timestamp']  # remove all error rows
                # use the cleaned column as the index
            tmp_pandas.index = tmp_pandas["timestamp"].apply(
                pd.to_numeric, errors='ignore').values
            # remove timestamp from the columns (it's the index already)
            tmp_pandas = tmp_pandas.drop('timestamp', axis=1)
            # convert everything back to float32
            tmp_pandas = tmp_pandas.astype("float32")
            # convert the index to datetime
            tmp_pandas.index = pd.to_datetime(tmp_pandas.index, unit='s')
            tmp_pandas = tmp_pandas.tz_localize("UTC").tz_convert("CET")
            tmp_pandas = tmp_pandas.drop_duplicates()
            #tmp_pandas = tmp_pandas.sort_index()
            house_data.append(tmp_pandas)
        overall_df = pd.concat(house_data)
        overall_df = overall_df.drop_duplicates()
        overall_df = overall_df.sort_index()

        m = 1

        for column in overall_df.columns:
            print("meter" + str(m) + ': ' + column)
            key = Key(building=h, meter=m)
            print("Putting into store...")
            store.put(str(key), overall_df[column], format='table')
            m += 1
            print('Flushing store...')
            store.flush()
        h += 1

    store.close()

    # retrieve the dataset metadata in the metadata subfolder
    import inspect
    convert_yaml_to_hdf5(
        dirname(inspect.getfile(convert_greend)) + '/metadata/', hdf_filename)
Example #30
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination HDF5 filename (including path and suffix).
    timezone: str
        specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')

    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    found_any_sm = False
    found_any_plug = False

    # Traversing every folder
    for folder in directory_list:
        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = None
        if 'sm_csv' in folder:
            meter_flag = 'sm'
        elif 'plugs' in folder:
            meter_flag = 'plugs'
        else:
            print('Skipping folder', folder)
            continue

        print('Computing for folder', folder)

        dir_list = [
            i for i in listdir(join(dataset_loc, folder))
            if isdir(join(dataset_loc, folder, i))
        ]
        dir_list.sort()

        if meter_flag == 'plugs' and len(dir_list) < 3:
            # Try harder to find the subfolders
            folder = join(folder, folder[:2])
            dir_list = [
                i for i in listdir(join(dataset_loc, folder))
                if isdir(join(dataset_loc, folder, i))
            ]

        print('Current dir list:', dir_list)

        for fl in dir_list:
            print('Computing for folder ', fl)

            fl_dir_list = [
                i for i in listdir(join(dataset_loc, folder, fl))
                if '.csv' in i
            ]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    found_any_sm = True
                    df = pd.read_csv(join(dataset_loc, folder, fl, fi),
                                     names=[i for i in range(1, 17)],
                                     dtype=np.float32)
                    # SmartMeter
                    for phase in range(1, 4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.loc[:, [
                            1 + phase, 5 + phase, 8 + phase, 13 + phase
                        ]]

                        # get reactive power
                        power = df_phase.loc[:, (1 + phase, 13 + phase)].values
                        reactive = power[:, 0] * np.tan(
                            power[:, 1] * np.pi / 180)
                        df_phase['Q'] = reactive

                        df_phase.index = pd.date_range(start=fi[:-4],
                                                       freq='s',
                                                       periods=86400,
                                                       tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)

                        sm_column_name = {
                            1 + phase: ('power', 'active'),
                            5 + phase: ('current', ''),
                            8 + phase: ('voltage', ''),
                            13 + phase: ('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.columns = pd.MultiIndex.from_tuples(
                            sm_column_name[col] for col in df_phase.columns)

                        power_active = df_phase['power', 'active']
                        tmp_before = np.size(power_active)
                        df_phase = df_phase[power_active != -1]
                        power_active = df_phase['power', 'active']
                        tmp_after = np.size(power_active)

                        if tmp_before != tmp_after:
                            print(
                                'Removed missing measurements - Size before: '
                                + str(tmp_before) + ', size after: ' +
                                str(tmp_after))

                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if not key in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])
            # Plugs are also translated into meters, but then supplemented directly with appliances
            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3

                key = str(Key(building=building_no, meter=meter_num))

                current_folder = join(dataset_loc, folder, fl)
                if not fl_dir_list:
                    raise RuntimeError("No CSV file found in " +
                                       current_folder)

                #Getting dataframe for each csv file seperately
                for fi in fl_dir_list:
                    found_any_plug = True
                    df = pd.read_csv(join(current_folder, fi),
                                     names=[1],
                                     dtype=np.float64)
                    df.index = pd.date_range(start=fi[:-4].replace('.', ':'),
                                             freq='s',
                                             periods=86400,
                                             tz='GMT')
                    df.columns = pd.MultiIndex.from_tuples(
                        plugs_column_name.values())
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    # Check whether measurements removed
                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if (tmp_before != tmp_after):
                        print('Removed missing measurements - Size before: ' +
                              str(tmp_before) + ', size after: ' +
                              str(tmp_after))

                    # If table not present in hdf5, create or else append to existing data
                    if not key in store:
                        store.put(key, df, format='Table')
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])

    if not found_any_plug or not found_any_sm:
        raise RuntimeError(
            'The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)'
        )

    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5file
    print("Proceeding to Metadata conversion...")
    meta_path = join(get_module_directory(), 'dataset_converters', 'eco',
                     'metadata')
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")