Example #1
def refresh_gjw_metadata(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix); will default if not specified.

    Expected directory and file structure:
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done refreshing metadata")
def convert_combed(combed_path, hdf_filename):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    chan = 1
    for building, meter_array in SUBMETER_PATHS.items():
        for meter in meter_array:
            key = Key(building=1, meter=chan)
            dfs = []
            total = pd.DataFrame()
            for attribute in column_mapping.keys():
                filename_attribute = join(combed_path, building, str(meter), "%s.csv" % attribute)
                print(filename_attribute)
                dfs.append(pd.read_csv(filename_attribute, parse_dates=True, index_col=0, header=0, names=[attribute]))
            total = pd.concat(dfs, axis=1)

            total.rename(columns=lambda x: column_mapping[x], inplace=True)
            total.columns.set_names(LEVEL_NAMES, inplace=True)
            store.put(str(key), total, format='table')
            store.flush()
            chan += 1
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting COMBED to HDF5!")
def refresh_gjw_metadata(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix); will default if not specified.

    Expected directory and file structure:
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done refreshing metadata")
Example #4
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    for building_name, building_mapping in overall_dataset_mapping.items():
        for load_name, load_mapping in building_mapping.items():
            for load_mapping_path, meter_number in load_mapping.items():
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" % attribute)
                    print(filename_attribute)
                    dfs.append(pd.read_csv(filename_attribute, parse_dates=True, index_col=0, header=0, names=[attribute]))
                total = pd.concat(dfs, axis=1)
                total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                total.rename(columns=lambda x: column_mapping[x], inplace=True)
                total.columns.set_names(LEVEL_NAMES, inplace=True)
                assert total.index.is_unique
                store.put(str(key), total)
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting COMBED to HDF5!")
Example #5
def convert_ampds(input_path, output_filename, format='HDF'):
    """
    Convert AMPds R2013 as seen on Dataverse. Download the files
    as CSVs and put them in the `input_path` folder for conversion.
    
    Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO
    
    Parameters: 
    -----------
    input_path: str
            The path of the directory where all the csv 
            files are supposed to be stored
    output_filename: str
            The path of the h5 file where all the 
            standardized data is supposed to go. The path 
            should refer to a particular file and not just a
             random directory in order for this to work.
    format: str
        Defaults to HDF5
    Example usage:
    --------------
    convert('/AMPds/electricity', 'store.h5')    

    """
    check_directory_exists(input_path)
    files = [
        f for f in listdir(input_path)
        if isfile(join(input_path, f)) and '.csv' in f and '.swp' not in f
    ]
    # Sorting Lexicographically
    files.sort()

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format, mode='w')
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...')
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME],
                                  unit='s',
                                  utc=True)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df = df.tz_convert(TIMEZONE)
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))

    store.close()
    metadata_path = join(_get_module_directory(), 'metadata')
    print('Processing metadata...')
    convert_yaml_to_hdf5(metadata_path, output_filename)
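All of these converters end up with a two-level column index whose level names come from `LEVEL_NAMES` (in NILMTK these are 'physical_quantity' and 'type'). A small standalone sketch of that structure, independent of any dataset:

import pandas as pd

LEVEL_NAMES = ['physical_quantity', 'type']  # as defined in nilmtk.measurement
df = pd.DataFrame({'reading': [1.0, 2.0]})
df.columns = pd.MultiIndex.from_tuples([('power', 'active')])
df.columns.set_names(LEVEL_NAMES, inplace=True)
print(df.columns)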
Example #6
def _convert(input_path,
             data_store,
             measurement_mapping_func,
             sort_index=True,
             drop_duplicates=False):
    meter_to_machine = {
        1: "MainTerminal",
        2: "ChipPress",
        3: "ChipSaw",
        4: "HighTemperatureOven",
        5: "PickAndPlaceUnit",
        6: "ScreenPrinter",
        7: "SolderingOven",
        8: "VacuumOven",
        9: "VacuumPump1",
        10: "VacuumPump2",
        11: "WashingMachine",
    }

    check_directory_exists(input_path)

    print("Loading factory 1...", end="... ")
    chans = _find_all_channels(input_path, meter_to_machine)
    for chan_id, filename in chans.items():
        print(chan_id, end=" ")
        stdout.flush()
        key = Key(building=1, meter=chan_id)
        measurements = measurement_mapping_func(chan_id)
        df = _load_csv(filename,
                       measurements,
                       sort_index=sort_index,
                       drop_duplicates=drop_duplicates)

        data_store.put(str(key), df)
    print()
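`_find_all_channels` and `_load_csv` are private helpers of this converter module and are not shown here. The sketch below is a hypothetical reconstruction of `_load_csv` based only on how it is called above (filename, measurements, sort_index, drop_duplicates), not the actual NILMTK implementation:

import pandas as pd

def _load_csv(filename, measurements, sort_index=True, drop_duplicates=False):
    # Hypothetical reconstruction: read the raw CSV, attach the measurement
    # column MultiIndex, then optionally sort and de-duplicate the index.
    df = pd.read_csv(filename, index_col=0, parse_dates=True)
    df.columns = pd.MultiIndex.from_tuples(measurements)
    if sort_index:
        df = df.sort_index()
    if drop_duplicates:
        df = df[~df.index.duplicated(keep='first')]
    return df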
Example #7
def test_all_datasets(directory):
    print("Testing all data sets started at:  {}".format(time.now()))
    print("-"*60)
    check_directory_exists(directory)
    datasets = [f for f in listdir(directory) if isfile(join(directory, f)) and
         '.h5' in f and '.swp' not in f]
    for dataset in datasets:
        test_single_dataset(dataset)
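A hypothetical invocation, assuming a directory of already-converted .h5 files and a `test_single_dataset` helper defined elsewhere in the same module:

test_all_datasets('/data/converted_datasets')  # hypothetical path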
Example #8
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    any_file_converted = False
    
    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" % attribute)
                    if not os.path.isfile(filename_attribute):
                        # File not found directly in the combed_path provided
                        # Try adding 'iiitd' to it
                        filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" % attribute)
                    
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute, names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", 1)
                        dfs.append(df)
                    else:
                        exists = False
                        
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns])
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
                    any_file_converted = True
                    
    if not any_file_converted:
        raise RuntimeError('No files converted, did you specify the correct path?')
                    
    convert_yaml_to_hdf5(
        join(get_module_directory(), 'dataset_converters', 'combed', 'metadata'),
        output_filename
    )

    print("Done converting COMBED to HDF5!")
Example #9
def _convert(input_path, store, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REFIT dataset.
    store : DataStore
        The NILMTK DataStore object.
    tz : str 
        Timezone e.g. 'US/Eastern'
    sort_index : bool
    """

    check_directory_exists(input_path)

    # Iterate though all houses and channels
    # house 14 is missing!
    houses = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21
    ]
    nilmtk_house_id = 0
    for house_id in houses:
        nilmtk_house_id += 1
        print("Loading house", house_id, end="... ")
        stdout.flush()
        csv_filename = join(input_path, 'House_' + str(house_id) + '.csv')
        # The clean version already includes header, so we
        # just skip the text version of the timestamp
        usecols = [
            'Unix', 'Aggregate', 'Appliance1', 'Appliance2', 'Appliance3',
            'Appliance4', 'Appliance5', 'Appliance6', 'Appliance7',
            'Appliance8', 'Appliance9'
        ]

        df = _load_csv(csv_filename, usecols, tz)
        if sort_index:
            df = df.sort_index()  # might not be sorted...
        chan_id = 0
        for col in df.columns:
            chan_id += 1
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=nilmtk_house_id, meter=chan_id)

            chan_df = pd.DataFrame(df[col])
            chan_df.columns = pd.MultiIndex.from_tuples([('power', 'active')])

            # Modify the column labels to reflect the power measurements recorded.
            chan_df.columns.set_names(LEVEL_NAMES, inplace=True)

            store.put(str(key), chan_df)
        print('')
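This helper expects an already-open DataStore. A hedged sketch of how a public entry point might drive it; the function name, the default format, and the 'Europe/London' timezone (REFIT is a UK dataset) are assumptions, not the actual NILMTK converter:

from nilmtk.utils import get_datastore

def convert_refit(input_path, output_filename, format='HDF'):
    # Hypothetical wrapper: open the store, convert every house, close the store.
    store = get_datastore(output_filename, format, mode='w')
    _convert(input_path, store, tz='Europe/London')
    store.close()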
Example #10
def convert_ampds(input_path, output_filename, format='HDF'):
    """
    Convert AMPds R2013 as seen on Dataverse. Download the files
    as CSVs and put them in the `input_path` folder for conversion.
    
    Download URL: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MXB7VO
    
    Parameters: 
    -----------
    input_path: str
            The path of the directory where all the csv 
            files are supposed to be stored
    output_filename: str
            The path of the h5 file where all the 
            standardized data is supposed to go. The path 
            should refer to a particular file and not just a
             random directory in order for this to work.
    format: str
        Defaults to HDF5
    Example usage:
    --------------
    convert('/AMPds/electricity', 'store.h5')    

    """
    check_directory_exists(input_path)
    files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and
             '.csv' in f and '.swp' not in f]
    # Sorting Lexicographically
    files.sort()

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format, mode='w')
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...')
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit='s', utc=True)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df = df.tz_convert(TIMEZONE)
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))
        
    store.close()
    metadata_path = join(get_module_directory(), 'dataset_converters', 'ampds', 'metadata')
    print('Processing metadata...')
    convert_yaml_to_hdf5(metadata_path, output_filename)
Example #11
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    any_file_converted = False
    
    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name, load_name, load_mapping_path, "%s.csv" % attribute)
                    if not os.path.isfile(filename_attribute):
                        # File not found directly in the combed_path provided
                        # Try adding 'iiitd' to it
                        filename_attribute = join(combed_path, 'iiitd', building_name, load_name, load_mapping_path, "%s.csv" % attribute)
                    
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute, names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", 1)
                        dfs.append(df)
                    else:
                        exists = False
                        
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.columns = pd.MultiIndex.from_tuples([column_mapping[x] for x in total.columns])
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
                    any_file_converted = True
                    
    if not any_file_converted:
        raise RuntimeError('No files converted, did you specify the correct path?')
                    
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting COMBED to HDF5!")
Example #12
def _convert(input_path, store, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REFIT dataset.
    store : DataStore
        The NILMTK DataStore object.
    tz : str 
        Timezone e.g. 'US/Eastern'
    sort_index : bool
    """

    check_directory_exists(input_path)

    # Iterate though all houses and channels
    # house 14 is missing!
    houses = [1,2,3,4,5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,21]
    nilmtk_house_id = 0
    for house_id in houses:
        nilmtk_house_id += 1
        print("Loading house", house_id, end="... ")
        stdout.flush()
        csv_filename = join(input_path, 'House_' + str(house_id) + '.csv')
        # The clean version already includes header, so we
        # just skip the text version of the timestamp
        usecols = ['Unix','Aggregate','Appliance1','Appliance2','Appliance3','Appliance4','Appliance5','Appliance6','Appliance7','Appliance8','Appliance9']
        
        df = _load_csv(csv_filename, usecols, tz)
        if sort_index:
            df = df.sort_index() # might not be sorted...
        chan_id = 0
        for col in df.columns:
            chan_id += 1
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=nilmtk_house_id, meter=chan_id)
            
            chan_df = pd.DataFrame(df[col])
            chan_df.columns = pd.MultiIndex.from_tuples([('power', 'active')])
            
            # Modify the column labels to reflect the power measurements recorded.
            chan_df.columns.set_names(LEVEL_NAMES, inplace=True)
            
            store.put(str(key), chan_df)
        print('')
def _convert(input_path,
             store,
             measurement_mapping_func,
             tz,
             sort_index=True,
             drop_duplicates=False):
    """
    Parameters
    ----------
    input_path : str
        The root path of the DEPS dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - classroom_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str 
        Timezone e.g. 'US/Eastern'
    sort_index : bool
        Defaults to True
    drop_duplicates : bool
        Remove entries with duplicated timestamp (keeps the first value)
        Defaults to False for backwards compatibility.
    """

    check_directory_exists(input_path)

    # Iterate though all classrooms and channels
    classrooms = _find_all_classrooms(input_path)
    for classroom_id in classrooms:
        print("Loading data from 'Aula 2.2 Bis' to classroom N°",
              classroom_id,
              end=" ... Loading channels ")
        stdout.flush()
        chans = _find_all_chans(input_path, classroom_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=classroom_id, meter=chan_id)
            measurements = measurement_mapping_func(classroom_id, chan_id)
            csv_filename = _get_csv_filename(input_path, key)
            df = _load_csv(csv_filename,
                           measurements,
                           tz,
                           sort_index=sort_index,
                           drop_duplicates=drop_duplicates)
            store.put(str(key), df)
        print()
Example #14
def convert_ampds(inputPath, hdfFilename):
    '''
    Parameters: 
    -----------
    inputPath: str
            The path of the directory where all the csv 
            files are supposed to be stored
    hdfFilename: str
            The path of the h5 file where all the 
            standardized data is supposed to go. The path 
            should refer to a particular file and not just a
             random directory in order for this to work.
    Example usage:
    --------------
    convert('/AMPds/electricity', 'store.h5')    

    '''
    check_directory_exists(inputPath)
    files = [f for f in listdir(inputPath) if isfile(join(inputPath, f)) and 
             '.csv' in f and '.swp' not in f]
    # Sorting Lexicographically
    files.sort()
    print(files)

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(inputPath)
    store = HDFStore(hdfFilename)
    for i, csv_file in enumerate(files):  
        key = Key(building=1, meter=(i + 1))
        print('Loading file #', (i + 1), ' : ', csv_file, '. Please wait...')
        df = pd.read_csv(join(inputPath, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df["TIMESTAMP"], unit='s', utc = True)
        df = df.drop('TIMESTAMP', 1)
        df = df.tz_localize('GMT').tz_convert('America/Vancouver')
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.convert_objects(convert_numeric=True)
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df, format='Table')
        store.flush()
        print("Done with file #", (i + 1))
    store.close()
    metadataPath = join(_get_module_directory(), 'metadata')
    print('Processing metadata...')
    convert_yaml_to_hdf5(metadataPath, hdfFilename)
Example #15
def convert_ampds(input_path, output_filename, format="HDF"):
    """
    Parameters: 
    -----------
    input_path: str
            The path of the directory where all the csv 
            files are supposed to be stored
    output_filename: str
            The path of the h5 file where all the 
            standardized data is supposed to go. The path 
            should refer to a particular file and not just a
             random directory in order for this to work.
    format: str
        Defaults to HDF5
    Example usage:
    --------------
    convert('/AMPds/electricity', 'store.h5')    

    """
    check_directory_exists(input_path)
    files = [f for f in listdir(input_path) if isfile(join(input_path, f)) and ".csv" in f and ".swp" not in f]
    # Sorting Lexicographically
    files.sort()

    # Remove Whole Home and put it at top
    files.remove("WHE.csv")
    files.insert(0, "WHE.csv")
    assert isdir(input_path)
    store = get_datastore(output_filename, format, mode="w")
    for i, csv_file in enumerate(files):
        key = Key(building=1, meter=(i + 1))
        print("Loading file #", (i + 1), " : ", csv_file, ". Please wait...")
        df = pd.read_csv(join(input_path, csv_file))
        # Due to fixed width, column names have spaces :(
        df.columns = [x.replace(" ", "") for x in df.columns]
        df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME], unit="s", utc=True)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df = df.tz_localize("GMT").tz_convert(TIMEZONE)
        df.rename(columns=lambda x: columnNameMapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.convert_objects(convert_numeric=True)
        df = df.dropna()
        df = df.astype(np.float32)
        store.put(str(key), df)
        print("Done with file #", (i + 1))
    store.close()
    metadata_path = join(_get_module_directory(), "metadata")
    print("Processing metadata...")
    convert_yaml_to_hdf5(metadata_path, output_filename)
Example #16
def convert_combed(combed_path, output_filename, format='HDF'):
    """
    Parameters
    ----------
    combed_path : str
        The root path of the combed dataset.
    output_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(combed_path)

    # Open store
    store = get_datastore(output_filename, format, mode='w')

    for building_name, building_mapping in iteritems(overall_dataset_mapping):
        for load_name, load_mapping in iteritems(building_mapping):
            for load_mapping_path, meter_number in iteritems(load_mapping):
                building_number = building_number_mapping[building_name]
                key = Key(building=building_number, meter=meter_number)
                dfs = []
                for attribute in column_mapping.keys():
                    filename_attribute = join(combed_path, building_name,
                                              load_name, load_mapping_path,
                                              "%s.csv" % attribute)
                    if os.path.isfile(filename_attribute):
                        exists = True
                        print(filename_attribute)
                        df = pd.read_csv(filename_attribute,
                                         header=0,
                                         names=["timestamp", attribute])
                        df.index = pd.to_datetime(df["timestamp"], unit='ms')
                        df = df.drop("timestamp", 1)
                        dfs.append(df)
                    else:
                        exists = False
                if exists:
                    total = pd.concat(dfs, axis=1)
                    total = total.tz_localize('UTC').tz_convert('Asia/Kolkata')
                    total.rename(columns=lambda x: column_mapping[x],
                                 inplace=True)
                    total.columns.set_names(LEVEL_NAMES, inplace=True)
                    assert total.index.is_unique
                    store.put(str(key), total)
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting COMBED to HDF5!")
Example #17
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)
    idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ)
    idx = idx.tz_localize('GMT').tz_convert(TIMEZONE)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 12):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename, dtype=np.float64, na_values='\\N')
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df.columns = pd.MultiIndex.from_tuples(
            [column_mapping[x] for x in df.columns], names=LEVEL_NAMES)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        df = df.resample("1T").mean()
        df = reindex_fill_na(df, idx)
        assert df.isnull().sum().sum() == 0
        store.put(str(key), df)
    store.close()

    metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe',
                        'metadata')
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    print("Done converting iAWE to HDF5!")
Example #18
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)
    idx = pd.DatetimeIndex(start=START_DATETIME, end=END_DATETIME, freq=FREQ)
    idx = idx.tz_localize('GMT').tz_convert(TIMEZONE)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 12):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename, dtype=np.float64, na_values='\\N')
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        df = df.resample("1T").mean()
        df = reindex_fill_na(df, idx)
        assert df.isnull().sum().sum() == 0
        store.put(str(key), df)
    store.close()
    
    metadata_dir = join(get_module_directory(), 'dataset_converters', 'iawe', 'metadata')
    convert_yaml_to_hdf5(metadata_dir, output_filename)

    print("Done converting iAWE to HDF5!")
Example #19
def _convert_to_datastore(input_path, store, tz):
    check_directory_exists(input_path)
    homes = _get_all_homes(input_path)
    for home in homes:
        home_id = int(re.search(r"home_([\d]*).csv", home).group(1))
        csv_filename = join(input_path, home)
        dtype_dict = {m: np.float32 for m in MAJOR_LOAD}
        dtype_dict[TIME_INDEX] = pd.datetime
        whole_df = pd.read_csv(csv_filename, index_col=TIME_INDEX, dtype=dtype_dict)
        del whole_df.index.name
        print ("processing ", home_id, end="... ")
        for meter in MAJOR_LOAD:
            meter_id = int(MAJOR_LOAD.index(meter))+1
            table_key = Key(building=home_id, meter=meter_id)
            table_df = _load_csv(whole_df, meter, tz)
            table_df = table_df.sort_index()
            store.put(str(table_key), table_df)
            print (meter, end=" ")
        print ("finished", end="!")
        print ()
Example #20
def _convert(input_path, hdf_filename, measurement_mapping_func, tz):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REDD low_freq dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str 
        Timezone e.g. 'US/Eastern'
    """

    check_directory_exists(input_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    # Iterate though all houses and channels
    houses = _find_all_houses(input_path)
    for house_id in houses:
        print("Loading house", house_id, end="... ")
        stdout.flush()
        chans = _find_all_chans(input_path, house_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=house_id, meter=chan_id)
            measurements = measurement_mapping_func(house_id, chan_id)
            csv_filename = _get_csv_filename(input_path, key)
            df = _load_csv(csv_filename, measurements, tz)
            df = df.sort_index() # raw REDD data isn't always sorted
            store.put(str(key), df, format='table')
            store.flush()
        print()

    store.close()
Example #21
def convert_iawe(iawe_path, hdf_filename):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    """

    check_directory_exists(iawe_path)

    # Open HDF5 file
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')

    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 13):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename)
        df.index = pd.to_datetime(
            (df.timestamp.values * 1E9).astype(int), utc=True)
        df = df.tz_convert('Asia/Kolkata')
        df = df.drop('timestamp', 1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        store.put(str(key), df, format='table')
        store.flush()
    store.close()
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         hdf_filename)

    print("Done converting iAWE to HDF5!")
Example #22
def convert_iawe(iawe_path, output_filename, format="HDF"):
    """
    Parameters
    ----------
    iawe_path : str
        The root path of the iawe dataset.
    output_filename : str
        The destination filename (including path and suffix).
    """

    check_directory_exists(iawe_path)

    # Open data store
    store = get_datastore(output_filename, format, mode='w')
    electricity_path = join(iawe_path, "electricity")

    # Mains data
    for chan in range(1, 13):
        key = Key(building=1, meter=chan)
        filename = join(electricity_path, "%d.csv" % chan)
        print('Loading ', chan)
        df = pd.read_csv(filename)
        df.drop_duplicates(subset=["timestamp"], inplace=True)
        df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)
        df = df.tz_convert(TIMEZONE)
        df = df.drop(TIMESTAMP_COLUMN_NAME, 1)
        df.rename(columns=lambda x: column_mapping[x], inplace=True)
        df.columns.set_names(LEVEL_NAMES, inplace=True)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.dropna()
        df = df.astype(np.float32)
        df = df.sort_index()
        store.put(str(key), df)
    store.close()
    convert_yaml_to_hdf5(join(_get_module_directory(), 'metadata'),
                         output_filename)

    print("Done converting iAWE to HDF5!")
def _convert(input_path, store, measurement_mapping_func, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the LAB dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str 
        Timezone e.g. 'America/Fortaleza'
    sort_index : bool
    """

    check_directory_exists(input_path)

    # Iterate though all houses and channels
    houses = _find_all_houses(input_path)
    for house_id in houses:
        print("Loading house", house_id, end="... ")
        stdout.flush()
        chans = _find_all_chans(input_path, house_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=house_id, meter=chan_id)
            measurements = measurement_mapping_func(house_id, chan_id)
            csv_filename = _get_csv_filename(input_path, key)
            df = _load_csv(csv_filename, measurements, tz)

            if sort_index:
                df = df.sort_index() # raw LAB data isn't always sorted
            store.put(str(key), df)
        print()
Example #24
def _convert(input_path, store, measurement_mapping_func, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REDD low_freq dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str 
        Timezone e.g. 'US/Eastern'
    sort_index : bool
    """

    check_directory_exists(input_path)

    # Iterate though all houses and channels
    houses = _find_all_houses(input_path)
    for house_id in houses:
        print("Loading house", house_id, end="... ")
        stdout.flush()
        chans = _find_all_chans(input_path, house_id)
        for chan_id in chans:
            print(chan_id, end=" ")
            stdout.flush()
            key = Key(building=house_id, meter=chan_id)
            measurements = measurement_mapping_func(house_id, chan_id)
            csv_filename = _get_csv_filename(input_path, key)
            df = _load_csv(csv_filename, measurements, tz)

            if sort_index:
                df = df.sort_index()  # raw REDD data isn't always sorted
            store.put(str(key), df)
        print()
Example #25
def convert_gjw(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix); will default if not specified.

    Expected directory and file structure:
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    # Open data store
    print('opening datastore', output_filename)
    store = get_datastore(output_filename, 'HDF', mode='w')  # 'HDF' assumed; the bare name `format` would otherwise refer to the builtin
    # walk the directory tree from the dataset home directory
    #clear dataframe & add column headers
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, _, files in os.walk(gjw_path):
        #unused second parameter of for dirs_in_current_dir
        if current_dir.find('.git')!=-1 or current_dir.find('.ipynb') != -1:
            #print( 'Skipping ', current_dir)
            continue
        print( 'checking', current_dir)
        m = bld_re.search(current_dir)
        if m: #The csv files may be further down the tree so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
            key = Key(building=building_nbr, meter=meter_nbr)
        for items in fnmatch.filter(files, "4*.csv"):
            # process any .CSV files found
            found = True
            ds = iso_date_re.search(items).group()
            # print( 'found files for date:', ds,end=" ")
            # found files to process
            df1 = _read_file_pair(current_dir,ds) # read two csv files into a dataframe    
            df = pd.concat([df,df1]) # concatenate the results into one long dataframe
        if found:
            found = False
            df = _prepare_data_for_toolkit(df)
            _summarise_dataframe(df,'Prepared for tool kit')
            store.put(str(key), df)
            #clear dataframe & add column headers
            #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME])
            break # only 1 folder with .csv files at present
    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done converting gjw to HDF5!")
Example #26
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination HDF5 filename (including path and suffix).
    timezone: str
        Specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')    
    
    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    found_any_sm = False
    found_any_plug = False
    
    # Traversing every folder
    for folder in directory_list:
        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = None 
        if 'sm_csv' in folder:
            meter_flag = 'sm'
        elif 'plugs' in folder:
            meter_flag = 'plugs'
        else:
            print('Skipping folder', folder)
            continue
            
        print('Computing for folder', folder)

        dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
        dir_list.sort()
        
        if meter_flag == 'plugs' and len(dir_list) < 3:
            # Try harder to find the subfolders
            folder = join(folder, folder[:2])
            dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
        
        print('Current dir list:', dir_list)

        for fl in dir_list:
            print('Computing for folder ', fl)
            
            fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    found_any_sm = True
                    df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32)
                    
                    for phase in range(1,4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.loc[:,[1+phase, 5+phase, 8+phase, 13+phase]]

                        # get reactive power
                        power = df_phase.loc[:, (1+phase, 13+phase)].values
                        reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180)
                        df_phase['Q'] = reactive
                        
                        df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)
                        
                        sm_column_name = {
                            1+phase:('power', 'active'),
                            5+phase:('current', ''),
                            8+phase:('voltage', ''),
                            13+phase:('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.columns = pd.MultiIndex.from_tuples([
                            sm_column_name[col] for col in df_phase.columns
                        ])
                        
                        power_active = df_phase['power', 'active']
                        tmp_before = np.size(power_active)
                        df_phase = df_phase[power_active != -1]
                        power_active = df_phase['power', 'active']
                        tmp_after = np.size(power_active)
                        
                        if tmp_before != tmp_after:
                            print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                        
                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if not key in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])
                
            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3
                
                key = str(Key(building=building_no, meter=meter_num))

                current_folder = join(dataset_loc,folder,fl)
                if not fl_dir_list:
                    raise RuntimeError("No CSV file found in " + current_folder)
                    
                #Getting dataframe for each csv file separately
                for fi in fl_dir_list:
                    found_any_plug = True
                    df = pd.read_csv(join(current_folder, fi), names=[1], dtype=np.float64)
                    df.index = pd.DatetimeIndex(start=fi[:-4].replace('.', ':'), freq='s', periods=86400, tz = 'GMT')
                    df.columns = pd.MultiIndex.from_tuples(plugs_column_name.values())
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if (tmp_before != tmp_after):
                        print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                    
                    # If table not present in hdf5, create or else append to existing data
                    if not key in store:
                        store.put(key, df, format='Table')
                        print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4])
            
            
    if not found_any_plug or not found_any_sm:
        raise RuntimeError('The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)')
        
    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5file
    print("Proceeding to Metadata conversion...")
    meta_path = join(
        get_module_directory(), 
        'dataset_converters',
        'eco',
        'metadata'
    )
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")
def test_all(path_to_directory):
    '''
    path_to_directory: Contains the h5 files on which the tests are supposed to be run
    '''

    check_directory_exists(path_to_directory)

#files=[f for f in listdir(path_to_directory) and '.h5' in f and '.swp' not in f]
    files = [f for f in listdir(path_to_directory) if isfile(join(path_to_directory, f)) and
         '.h5' in f and '.swp' not in f]
    files.sort()

    print ("Datasets collected and sorted. Processing...")


    try:
        for i, file in enumerate(files):
            current_file=DataSet(join(path_to_directory, file))
            
            print ("Printing metadata for current file...done.")
            print_dict(current_file.metadata)
            print (" Loading file # ", i, " : ", file, ". Please wait.")
            for building_number in range(1, len(current_file.buildings)+1):
    #Examine metadata for a single house
                elec=current_file.buildings[building_number].elec
                print ("The dataset being processed is : ", elec.dataset())
                print ("Metadata for current file: ")
                print_dict(current_file.buildings[building_number].metadata)
                print ("Appliance label information: ", elec.appliance_label())
                #print (elec.appliances)
                print ("Appliances:- ")
                for i in elec.appliances:
                    print (i)

                print ("Examining sub-metered appliances...")
                
                
                print ("Collecting stats on meters...Done.")
                print (elec._collect_stats_on_all_meters)
                
                print ("Timeframe: ", elec.get_timeframe())
                
                
                
                
                print ("Available power AC types: ", elec.available_power_ac_types())
                
                print ("Clearing cache...done.")
                elec.clear_cache()
                
                print ("Testing if there are meters from multiple buildings. Result returned by method: ", elec.contains_meters_from_multiple_buildings())
                
                # TODO: Find a better way to test the correlation function
                # print ("Testing the correlation function. ", elec.correlation(elec))
                
                
                print ("List of disabled meters: ", elec.disabled_meters)
                print ("Trying to determine the dominant appliance: ")
                try:
                    elec.dominant_appliance()
                except RuntimeError:
                    print ('''More than one dominant appliance in MeterGroup! (The dominant appliance per meter should be manually specified in the metadata. If it isn't and if there are multiple appliances for a meter then NILMTK assumes all appliances on that meter are dominant. NILMTK can't automatically distinguish between multiple appliances on the same meter (at least, not without using NILM!))''')
                    pass
                print ("Dropout rate: ", elec.dropout_rate())
                try:
                    print ("Calculating energy per meter:")
                    print (elec.energy_per_meter())
                
                    print ("Calculating total entropy")
                    print (elec.entropy())
                
                    print ("Calculating entropy per meter: ")
                    print (elec.entropy_per_meter())
                except ValueError:
                    print ("ValueError: Total size of array must remain unchanged.")
                    pass
                
                print ("Calculating fraction per meter.")
                print (elec.fraction_per_meter())

                
                

#print ("Average energy per period: ", elec.average_energy_per_period())
                
                
                print ("Executing functions...")
                lis=[]
                func=""
                '''for function in dir(elec):
                    try:
                        start=time.time()
                        if ("__" not in function or "dataframe_of_meters" not in function):
                            func=getattr(elec, function)
                        print ("Currently executing ", function, ". Please wait...")
                        print (func())
                        # print ("cProfile stats - printed")
                        # cProfile.run("func")
                        end=time.time()
                        print ("Time taken for the entire process : ", (end - start))
                    except AttributeError:
                        print ("Attribute error occured. ")
                    except TypeError:
                        lis.append(function)
                        print ("Warning: TypeError")
                        pass'''
                
                print ("Plotting wiring hierarchy of meters....")
                elec.draw_wiring_graph()
                ## DISAGGREGATION STARTS HERE
                appliance_type="unknown"
    #TODO : appliance_type should cycle through all appliances and check for each of them. For this, use a list.
                selected_appliance=nilmtk.global_meter_group.select_using_appliances(type=appliance_type)
                appliance_restricted = MeterGroup(selected_appliance.meters)
                if ((appliance_restricted.proportion_of_upstream_total_per_meter()) is not None):
                    proportion_per_appliance = appliance_restricted.proportion_of_upstream_total_per_meter()


                    proportion_per_appliance.plot(kind='bar');
                    plt.title('Appliance energy as proportion of total building energy');
                    plt.ylabel('Proportion');
                    plt.xlabel('Appliance (<appliance instance>, <building instance>, <dataset name>)');
                    selected_appliance.select(building=building_number).total_energy()
                    selected_appliance.select(building=1).plot();


                    appliance_restricted = MeterGroup(selected_appliance.meters)
                    daily_energy = pd.DataFrame([meter.average_energy_per_period(offset_alias='D')
                                     for meter in appliance_restricted.meters])

                    daily_energy.plot(kind='hist');
                    plt.title('Histogram of daily energy');
                    plt.xlabel('energy (kWh)');
                    plt.ylabel('Occurences');
                    plt.legend().set_visible(False)
                    
                    current_file.store.window=TimeFrame(start='2012-04-01 00:00:00-05:00', end='2012-04-02 00:00:00-05:00')
                    #elec.plot();

                    fraction = elec.submeters().fraction_per_meter().dropna()

                    labels = elec.get_appliance_labels(fraction.index)
                    plt.figure(figsize=(8,8))
                    fraction.plot(kind='pie', labels=labels);

                    elec.select_using_appliances(category='heating')
                    elec.select_using_appliances(category='single-phase induction motor')


                    co = CombinatorialOptimisation()
                    co.train(elec)

                    for model in co.model:
                        print_dict(model)


                    disag_filename = join(data_dir, 'ampds-disag.h5')
                    output = HDFDataStore(disag_filename, 'w')
                    co.disaggregate(elec.mains(), output)
                    output.close()



                    disag = DataSet(disag_filename)








                    disag_elec = disag.buildings[building_number].elec

                    f1 = f1_score(disag_elec, elec)
                    f1.index = disag_elec.get_appliance_labels(f1.index)
                    f1.plot(kind='bar')
                    plt.xlabel('appliance');
                    plt.ylabel('f-score');
                    disag_elec.plot()

                    disag.store.close()
    except AttributeError:
        print ("AttributeError occured while executing. This means that the value returned by  proportion_per_appliance = appliance_restricted.proportion_of_upstream_total_per_meter() is None")
        pass
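
The fragment above interleaves exploratory plotting with the actual disaggregation run. For reference, a minimal sketch of the same train/disaggregate/score loop, stripped of the plotting, might look like the following; the data path, file names, building number and the import location of CombinatorialOptimisation are assumptions (newer NILMTK releases move it to nilmtk.legacy.disaggregate):

# Minimal sketch, assuming an AMPds-style HDF5 file and NILMTK's CO disaggregator.
from os.path import join

from nilmtk import DataSet, HDFDataStore
from nilmtk.disaggregate import CombinatorialOptimisation  # nilmtk.legacy.disaggregate in newer releases
from nilmtk.metrics import f1_score

data_dir = '/data'                              # assumed location of the converted dataset
dataset = DataSet(join(data_dir, 'ampds.h5'))   # hypothetical file name
elec = dataset.buildings[1].elec

co = CombinatorialOptimisation()
co.train(elec)                                  # fit one model per submeter

disag_filename = join(data_dir, 'ampds-disag.h5')
output = HDFDataStore(disag_filename, 'w')
co.disaggregate(elec.mains(), output)           # write appliance estimates to HDF5
output.close()

disag = DataSet(disag_filename)
disag_elec = disag.buildings[1].elec
f1 = f1_score(disag_elec, elec)                 # per-appliance F1 against the submetered ground truth
f1.index = disag_elec.get_appliance_labels(f1.index)
print(f1)
disag.store.close()
dataset.store.close()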
Example #28
0
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination HDF5 filename (including path and suffix).
    timezone: str
        specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')    
    
    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    # Traversing every folder
    for folder in directory_list:

        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue
        print('Computing for folder', folder)

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = 'sm' if 'sm_csv' in folder else 'plugs'

        dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
        dir_list.sort()
        print('Current dir list:', dir_list)

        for fl in dir_list:
            
            print('Computing for folder ', fl)
            
            fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    df = pd.read_csv(join(dataset_loc,folder,fl,fi), names=[i for i in range(1,17)], dtype=np.float32)
                    
                    for phase in range(1,4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.ix[:,[1+phase, 5+phase, 8+phase, 13+phase]]

                        # get reactive power
                        power = df_phase.as_matrix([1+phase, 13+phase])
                        reactive = power[:,0] * np.tan(power[:,1] * np.pi / 180)
                        df_phase['Q'] = reactive
                        
                        df_phase.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)
                        
                        sm_column_name = {1+phase:('power', 'active'),
                                            5+phase:('current', ''),
                                            8+phase:('voltage', ''),
                                            13+phase:('phase_angle', ''),
                                            'Q': ('power', 'reactive'),
                                            };
                        df_phase.rename(columns=sm_column_name, inplace=True)
                        
                        tmp_before = np.size(df_phase.power.active)
                        df_phase = df_phase[df_phase.power.active != -1]
                        tmp_after = np.size(df_phase.power.active)
                        if (tmp_before != tmp_after):
                            print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                        
                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if key not in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase, '=> Done for ', fi[:-4])
                
            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3
                
                key = str(Key(building=building_no, meter=meter_num))
                
                # Getting dataframe for each csv file separately
                for fi in fl_dir_list:
                    df = pd.read_csv(join(dataset_loc,folder,fl ,fi), names=[1], dtype=np.float64)
                    df.index = pd.DatetimeIndex(start=fi[:-4], freq='s', periods=86400, tz = 'GMT')
                    df.rename(columns=plugs_column_name, inplace=True)
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if (tmp_before != tmp_after):
                        print('Removed missing measurements - Size before: ' + str(tmp_before) + ', size after: ' + str(tmp_after))
                    
                    # If the table is not yet in the HDF5 store, create it; otherwise append to the existing data
                    if key not in store:
                        store.put(key, df, format='Table')
                        print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building', building_no, ', Meter no.', meter_num, '=> Done for ', fi[:-4])
            
    print "Data storage completed."
    store.close()

    # Adding the metadata to the HDF5file
    print "Proceeding to Metadata conversion..."
    meta_path = join(_get_module_directory(), 'metadata')
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print "Completed Metadata conversion."
Example #29
0
def _convert(input_path, store, measurement_mapping_func, tz, sort_index=True):
    """
    Parameters
    ----------
    input_path : str
        The root path of the REDD low_freq dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str
        Timezone e.g. 'US/Eastern'
    sort_index : bool
    """

    check_directory_exists(input_path)

    houses = _find_all_houses(input_path)
    years = []

    # Iterating through all homes
    b_cnt = 0
    for house_id in houses:
        b_cnt = b_cnt + 1
        print('Loading Home:', house_id, end='... ')
        stdout.flush()
        years = _find_year(input_path, house_id)
        meters_paths_csv = []
        df_all_years = pd.DataFrame()

        for y in years:
            mains_df = pd.DataFrame()
            meters_paths_csv = _find_all_csv_paths(input_path, house_id, y)
            data_frames = []
            if not meters_paths_csv:
                continue
            else:
                k = 1

                for path in meters_paths_csv:
                    # 1.Concat csv files of all meters in each year, to get all
                    # appliances in 1 dataframe per year
                    temp_df = pd.read_csv(path)
                    if k == 1:
                        k = 0
                        if 'use [kW]' in temp_df.columns:
                            mains_df = temp_df['use [kW]']
                        elif 'Usage [kW]' in temp_df.columns:
                            mains_df = temp_df['Usage [kW]']
                        if 'Date & Time' in temp_df.columns:
                            date_time_df = temp_df['Date & Time']

                    # Preprocess/clean dataframe by removing unusable columns
                    temp_df = _preprocess_csv(temp_df)
                    data_frames.append(temp_df)

                df_year = reduce(
                    lambda left, right: left.join(
                        right, lsuffix='_1', rsuffix='_2'), data_frames)
                # Add columns 'Date & Time' and 'use [kW]'
                df_year.insert(0, 'Date & Time', date_time_df)
                df_year.insert(1, 'use', mains_df)
                # Append all years data to 1 dataframe
                df_all_years = df_all_years.append(df_year,
                                                   ignore_index=True,
                                                   sort=False)
        # Change index to datetime format
        df_all_years['Date & Time'] = pd.to_datetime(
            df_all_years['Date & Time'], utc=True)
        df_all_years.set_index('Date & Time', inplace=True)
        df_all_years = df_all_years.tz_convert('US/Eastern')

        # Append key value pairs to DataStore
        chan_id = 0
        for col in df_all_years.columns:
            chan_id += 1
            print(chan_id, end=' ')
            stdout.flush()
            key = Key(building=b_cnt, meter=chan_id)
            chan_df = pd.DataFrame(df_all_years[col])
            chan_df.columns = pd.MultiIndex.from_tuples([('power', 'active')])
            chan_df.columns.set_names(LEVEL_NAMES, inplace=True)

            store.put(str(key), chan_df)
        print()
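
The measurement_mapping_func argument is only described in the docstring above. A minimal example of such a callable, which maps every channel to active power (a plausible default rather than the dataset's real mapping), could be:

def simple_measurement_mapping_func(house_id, chan_id):
    # Hypothetical mapping: every channel of every house records active power only.
    return [('power', 'active')]

# _convert('/data/dataset_root', store, simple_measurement_mapping_func, 'US/Eastern')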
Example #30
0
def convert_eco(dataset_loc, hdf_filename, timezone):
	"""
	Parameters:
	-----------
	dataset_loc: str
		The root directory where the dataset is located.
	hdf_filename: str
		The location where the hdf_filename is present. 
                The directory location has to contain the 
		hdf5file name for the converter to work.
	timezone: str
		specifies the timezone of the dataset.
	"""

	# Creating a new HDF File
	store = pd.HDFStore(hdf_filename, 'w')

        check_directory_exists(dataset_loc)
	directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
	directory_list.sort()
	print directory_list

	# Traversing every folder
	for folder in directory_list:
		print 'Computing for folder',folder

		#Building number and meter_flag
		building_no = int(folder[:2])
		meter_flag = 'sm' if 'sm_csv' in folder else 'plugs'

		dir_list = [i for i in listdir(join(dataset_loc, folder)) if isdir(join(dataset_loc,folder,i))]
		dir_list.sort()
		print 'Current dir list:',dir_list

		for fl in dir_list:
			#Meter number to be used in key
			meter_num = 1 if meter_flag == 'sm' else int(fl) + 1

			print 'Computing for Meter no.',meter_num

			fl_dir_list = [i for i in listdir(join(dataset_loc,folder,fl)) if '.csv' in i]
			fl_dir_list.sort()

			key = Key(building=building_no, meter=meter_num)

			for fi in fl_dir_list:

				#Getting dataframe for each csv file seperately
				df_fl = _get_df(join(dataset_loc,folder,fl),fi,meter_flag)
				df_fl.sort_index(ascending=True,inplace=True)
				df_fl = df_fl.tz_convert(timezone)

				# If table not present in hdf5, create or else append to existing data
				if not key in store:
					store.put(str(key), df_fl, format='Table')
				else:
					store.append(str(key), df_fl, format='Table')
				store.flush()
				print 'Building',building_no,', Meter no.',meter_num,'=> Done for ',fi[:-4]

	print "Data storage completed."
	store.close()

	# Adding the metadata to the HDF5file
	print "Proceeding to Metadata conversion..."
	meta_path = join(_get_module_directory(), 'metadata')
	convert_yaml_to_hdf5(meta_path, hdf_filename)
	print "Completed Metadata conversion."
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination HDF5 filename (including path and suffix).
    timezone: str
        specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')

    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    # Traversing every folder
    for folder in directory_list:

        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue
        print('Computing for folder', folder)

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = 'sm' if 'sm_csv' in folder else 'plugs'

        dir_list = [
            i for i in listdir(join(dataset_loc, folder))
            if isdir(join(dataset_loc, folder, i))
        ]
        dir_list.sort()
        print('Current dir list:', dir_list)

        for fl in dir_list:

            print('Computing for folder ', fl)

            fl_dir_list = [
                i for i in listdir(join(dataset_loc, folder, fl))
                if '.csv' in i
            ]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    df = pd.read_csv(join(dataset_loc, folder, fl, fi),
                                     names=[i for i in range(1, 17)],
                                     dtype=np.float32)

                    for phase in range(1, 4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.ix[:, [
                            1 + phase, 5 + phase, 8 + phase, 13 + phase
                        ]]

                        # get reactive power
                        power = df_phase.as_matrix([1 + phase, 13 + phase])
                        reactive = power[:, 0] * np.tan(
                            power[:, 1] * np.pi / 180)
                        df_phase['Q'] = reactive

                        df_phase.index = pd.DatetimeIndex(start=fi[:-4],
                                                          freq='s',
                                                          periods=86400,
                                                          tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)

                        sm_column_name = {
                            1 + phase: ('power', 'active'),
                            5 + phase: ('current', ''),
                            8 + phase: ('voltage', ''),
                            13 + phase: ('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.rename(columns=sm_column_name, inplace=True)

                        tmp_before = np.size(df_phase.power.active)
                        df_phase = df_phase[df_phase.power.active != -1]
                        tmp_after = np.size(df_phase.power.active)
                        if (tmp_before != tmp_after):
                            print(
                                'Removed missing measurements - Size before: '
                                + str(tmp_before) + ', size after: ' +
                                str(tmp_after))

                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if key not in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])

            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3

                key = str(Key(building=building_no, meter=meter_num))

                # Getting dataframe for each csv file separately
                for fi in fl_dir_list:
                    df = pd.read_csv(join(dataset_loc, folder, fl, fi),
                                     names=[1],
                                     dtype=np.float64)
                    df.index = pd.DatetimeIndex(start=fi[:-4],
                                                freq='s',
                                                periods=86400,
                                                tz='GMT')
                    df.rename(columns=plugs_column_name, inplace=True)
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if (tmp_before != tmp_after):
                        print('Removed missing measurements - Size before: ' +
                              str(tmp_before) + ', size after: ' +
                              str(tmp_after))

                    # If the table is not yet in the HDF5 store, create it; otherwise append to the existing data
                    if key not in store:
                        store.put(key, df, format='Table')
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])

    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5 file
    print("Proceeding to Metadata conversion...")
    meta_path = join(_get_module_directory(), 'metadata')
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")
Example #32
0
def _convert(input_path,
             store,
             measurement_mapping_func,
             tz,
             valid_home_id,
             sort_index=True,
             drop_duplicates=False):
    """
    Parameters
    ----------
    input_path : str
        The root path of the ideal low_freq dataset.
    store : DataStore
        The NILMTK DataStore object.
    measurement_mapping_func : function
        Must take these parameters:
            - house_id
            - chan_id
        Function should return a list of tuples e.g. [('power', 'active')]
    tz : str 
        Timezone e.g. 'US/Eastern'
    sort_index : bool
        Defaults to True
    drop_duplicates : bool
        Remove entries with duplicated timestamp (keeps the first value)
        Defaults to False for backwards compatibility.
    """

    check_directory_exists(input_path)
    # each file containing mains/appliance data is linked with a house and sensor id
    filename, houses, sensor, category = _find_all_houses_sensor(
        input_path, valid_home_id)
    assert (len(houses) == len(sensor))
    for id in range(len(houses)):
        if (category[id] == 'electric-appliance'):
            stdout.flush()
            key = Key(building=houses[id], meter=int(sensor[id]))
            csv_filename = join(input_path, filename[id])
            measurements = measurement_mapping_func(houses[id], sensor[id],
                                                    category[id])
            df = _load_csv(csv_filename,
                           measurements,
                           tz,
                           sort_index=sort_index,
                           drop_duplicates=drop_duplicates)
            store.put(str(key), df)

        elif (category[id] == 'electric-mains'):
            combined_meters = sensor[id].split('c')
            stdout.flush()
            key = Key(building=houses[id], meter=int(combined_meters[0]))
            csv_filename = join(input_path, filename[id])
            measurements = measurement_mapping_func(houses[id], sensor[id],
                                                    category[id])
            df = _load_csv(csv_filename,
                           measurements,
                           tz,
                           sort_index=sort_index,
                           drop_duplicates=drop_duplicates)
            store.put(str(key), df)
        print("Instance number:" + str(id))
        print("Loading for home id:" + str(houses[id]) + "and sensor id:" +
              sensor[id] + "........")
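
The _load_csv helper used by this converter is likewise not included here. Given the parameters shown, it plausibly reads a timestamp/value CSV, applies the measurement tuples as a column MultiIndex, converts to the requested timezone, and optionally sorts and de-duplicates the index. A rough sketch, with the CSV layout assumed rather than known:

import pandas as pd
from nilmtk.measurement import LEVEL_NAMES

def _load_csv(filename, columns, tz, sort_index=True, drop_duplicates=False):
    # Assumed layout: first column is a Unix timestamp, remaining columns are readings.
    df = pd.read_csv(filename, header=None)
    df.index = pd.to_datetime(df.iloc[:, 0], unit='s', utc=True)
    df = df.iloc[:, 1:].tz_convert(tz)
    df.columns = pd.MultiIndex.from_tuples(columns, names=LEVEL_NAMES)
    if sort_index:
        df = df.sort_index()
    if drop_duplicates:
        df = df[~df.index.duplicated(keep='first')]
    return df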
Example #33
0
def convert_sortd(input_path, output_filename, format='HDF'):
    """Converts the dataset to NILMTK HDF5 format.

    For more information about the SOR test dataset, contact Samuel Marisa.

    Parameters
    ----------
    input_path : str
        The root path of the dataset.  It is assumed that the YAML
        metadata is in 'input_path/metadata'.
    output_filename : str
        The destination filename (including path and suffix).
    format : str
        format of output. Either 'HDF' or 'CSV'. Defaults to 'HDF'

    Example usage:
    --------------
    convert_sortd('/sortd', 'store.h5')
    """
    print(
        'Attempting to convert the SORTD dataset at %s into %s in NILMTK %s format...'
        % (input_path, output_filename, format))
    # Ensure that the input directory exists
    check_directory_exists(input_path)
    # Load the dataset metadata
    with open(join(input_path, 'metadata/dataset.yaml'), 'r') as stream:
        dataset_metadata = yaml.safe_load(stream)
    # Open the datastore
    store = get_datastore(output_filename, format, mode='w')
    # Iterate through all building metadata files found in the dataset
    for metadata_file in glob.glob(
            join(input_path, 'metadata/building[0-9]*.yaml')):
        # Load the building metadata
        with open(metadata_file, 'r') as stream:
            metadata = yaml.safe_load(stream)
        building_id = int(metadata['instance'])
        print('==> Loading building %d defined at %s. Please wait...' %
              (building_id, metadata_file))
        for meter_id, meter_data in metadata['elec_meters'].items():
            meter_id = int(meter_id)
            key = Key(building=building_id, meter=meter_id)
            # Load the raw data from the data location
            print('  - Loading meter %s from %s...' %
                  (meter_id, meter_data['data_location']))
            columns = [('power', 'active')]
            df = pd.read_csv(join(input_path, meter_data['data_location']),
                             sep=',',
                             names=columns,
                             dtype={m: np.float32
                                    for m in columns})
            # Convert the timestamp index column to timezone-aware datetime
            df.index = pd.to_datetime(df.index.values, unit='s', utc=True)
            df = df.tz_convert(dataset_metadata['timezone'])
            #df = pd.read_csv(join(input_path, db_file), sep=';', names=('Datetime', 'P1', 'P2', 'P3'), dtype={'P1': np.float64, 'P2': np.float64, 'P3': np.float64}, parse_dates=[1])
            print(df.info())
            print(df.head())
            #print(df.tail())
            print("  - Storing data under key %s in the datastore..." %
                  (str(key)))
            store.put(str(key), df)
        print("  - Building %s loaded!" % (building_id))
    print("Adding the metadata into the store...")
    save_yaml_to_datastore(join(input_path, 'metadata'), store)
    print("Closing the store...")
    store.close()
    print("Done converting SORTD dataset to HDF5!")
Example #34
0
def convert_eco(dataset_loc, hdf_filename, timezone):
    """
    Parameters:
    -----------
    dataset_loc: str
        The root directory where the dataset is located.
    hdf_filename: str
        The destination HDF5 filename (including path and suffix).
    timezone: str
        specifies the timezone of the dataset.
    """

    # Creating a new HDF File
    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='blosc')

    check_directory_exists(dataset_loc)
    directory_list = [i for i in listdir(dataset_loc) if '.txt' not in i]
    directory_list.sort()
    print(directory_list)

    found_any_sm = False
    found_any_plug = False

    # Traversing every folder
    for folder in directory_list:
        if folder[0] == '.' or folder[-3:] == '.h5':
            print('Skipping ', folder)
            continue

        #Building number and meter_flag
        building_no = int(folder[:2])
        meter_flag = None
        if 'sm_csv' in folder:
            meter_flag = 'sm'
        elif 'plugs' in folder:
            meter_flag = 'plugs'
        else:
            print('Skipping folder', folder)
            continue

        print('Computing for folder', folder)

        dir_list = [
            i for i in listdir(join(dataset_loc, folder))
            if isdir(join(dataset_loc, folder, i))
        ]
        dir_list.sort()

        if meter_flag == 'plugs' and len(dir_list) < 3:
            # Try harder to find the subfolders
            folder = join(folder, folder[:2])
            dir_list = [
                i for i in listdir(join(dataset_loc, folder))
                if isdir(join(dataset_loc, folder, i))
            ]

        print('Current dir list:', dir_list)

        for fl in dir_list:
            print('Computing for folder ', fl)

            fl_dir_list = [
                i for i in listdir(join(dataset_loc, folder, fl))
                if '.csv' in i
            ]
            fl_dir_list.sort()

            if meter_flag == 'sm':
                for fi in fl_dir_list:
                    found_any_sm = True
                    df = pd.read_csv(join(dataset_loc, folder, fl, fi),
                                     names=[i for i in range(1, 17)],
                                     dtype=np.float32)
                    # SmartMeter
                    for phase in range(1, 4):
                        key = str(Key(building=building_no, meter=phase))
                        df_phase = df.loc[:, [
                            1 + phase, 5 + phase, 8 + phase, 13 + phase
                        ]]

                        # get reactive power
                        power = df_phase.loc[:, (1 + phase, 13 + phase)].values
                        reactive = power[:, 0] * np.tan(
                            power[:, 1] * np.pi / 180)
                        df_phase['Q'] = reactive

                        df_phase.index = pd.DatetimeIndex(start=fi[:-4],
                                                          freq='s',
                                                          periods=86400,
                                                          tz='GMT')
                        df_phase = df_phase.tz_convert(timezone)

                        sm_column_name = {
                            1 + phase: ('power', 'active'),
                            5 + phase: ('current', ''),
                            8 + phase: ('voltage', ''),
                            13 + phase: ('phase_angle', ''),
                            'Q': ('power', 'reactive'),
                        }
                        df_phase.columns = pd.MultiIndex.from_tuples(
                            sm_column_name[col] for col in df_phase.columns)

                        power_active = df_phase['power', 'active']
                        tmp_before = np.size(power_active)
                        df_phase = df_phase[power_active != -1]
                        power_active = df_phase['power', 'active']
                        tmp_after = np.size(power_active)

                        if tmp_before != tmp_after:
                            print(
                                'Removed missing measurements - Size before: '
                                + str(tmp_before) + ', size after: ' +
                                str(tmp_after))

                        df_phase.columns.set_names(LEVEL_NAMES, inplace=True)
                        if key not in store:
                            store.put(key, df_phase, format='Table')
                        else:
                            store.append(key, df_phase, format='Table')
                            store.flush()
                        print('Building', building_no, ', Meter no.', phase,
                              '=> Done for ', fi[:-4])
            # Plugs are also translated into meters, but are then directly augmented with appliances
            else:
                #Meter number to be used in key
                meter_num = int(fl) + 3

                key = str(Key(building=building_no, meter=meter_num))

                current_folder = join(dataset_loc, folder, fl)
                if not fl_dir_list:
                    raise RuntimeError("No CSV file found in " +
                                       current_folder)

                # Getting dataframe for each csv file separately
                for fi in fl_dir_list:
                    found_any_plug = True
                    df = pd.read_csv(join(current_folder, fi),
                                     names=[1],
                                     dtype=np.float64)
                    df.index = pd.DatetimeIndex(start=fi[:-4].replace(
                        '.', ':'),
                                                freq='s',
                                                periods=86400,
                                                tz='GMT')
                    df.columns = pd.MultiIndex.from_tuples(
                        plugs_column_name.values())
                    df = df.tz_convert(timezone)
                    df.columns.set_names(LEVEL_NAMES, inplace=True)

                    # Check whether measurements removed
                    tmp_before = np.size(df.power.active)
                    df = df[df.power.active != -1]
                    tmp_after = np.size(df.power.active)
                    if (tmp_before != tmp_after):
                        print('Removed missing measurements - Size before: ' +
                              str(tmp_before) + ', size after: ' +
                              str(tmp_after))

                    # If the table is not yet in the HDF5 store, create it; otherwise append to the existing data
                    if key not in store:
                        store.put(key, df, format='Table')
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])
                    else:
                        store.append(key, df, format='Table')
                        store.flush()
                        print('Building', building_no, ', Meter no.',
                              meter_num, '=> Done for ', fi[:-4])

    if not found_any_plug or not found_any_sm:
        raise RuntimeError(
            'The files were not found! Please check the folder structure. Extract each ZIP file into a folder with its base name (e.g. extract "01_plugs_csv.zip" into a folder named "01_plugs_csv", etc.)'
        )

    print("Data storage completed.")
    store.close()

    # Adding the metadata to the HDF5 file
    print("Proceeding to Metadata conversion...")
    meta_path = join(get_module_directory(), 'dataset_converters', 'eco',
                     'metadata')
    convert_yaml_to_hdf5(meta_path, hdf_filename)
    print("Completed Metadata conversion.")
Example #35
0
def convert_gjw(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset.
    output_filename : str
        The destination filename (including path and suffix), will default if not specified
    directory and file structure
    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files    
    """
    if gjw_path is None: gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issue with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    # Open data store
    print('opening datastore', output_filename)
    store = get_datastore(output_filename, 'HDF', mode='w')
    # walk the directory tree from the dataset home directory
    #clear dataframe & add column headers
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, _, files in os.walk(gjw_path):
        # the second value of the os.walk tuple (dirs_in_current_dir) is unused
        if current_dir.find('.git') != -1 or current_dir.find('.ipynb') != -1:
            #print( 'Skipping ', current_dir)
            continue
        print('checking', current_dir)
        m = bld_re.search(current_dir)
        if m:  #The csv files may be further down the tree so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
            key = Key(building=building_nbr, meter=meter_nbr)
        for items in fnmatch.filter(files, "4*.csv"):
            # process any .CSV files found
            found = True
            ds = iso_date_re.search(items).group()
            # print( 'found files for date:', ds,end=" ")
            # found files to process
            df1 = _read_file_pair(current_dir, ds)  # read two csv files into a dataframe
            df = pd.concat([df, df1])  # concatenate the results into one long dataframe
        if found:
            found = False
            df = _prepare_data_for_toolkit(df)
            _summarise_dataframe(df, 'Prepared for tool kit')
            store.put(str(key), df)
            #clear dataframe & add column headers
            #df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME,REACTIVE_COLUMN_NAME])
            break  # only 1 folder with .csv files at present
    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done converting gjw to HDF5!")