Example #1
def test_filter_one_col_two_values_positive_and_negative_ints(self):
    filter_cols = ('INTCOL',)
    filter_values = ([10, -5],)
    result = filters.filter_on_column_value(
        self.int_filter_data, filter_cols, filter_values).reset_index(drop=True)
    aim = pd.DataFrame({'INTCOL': [10, -5, 10, 10]})
    assert_frame_equal(aim, result)
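These tests reference fixtures (self.int_filter_data and self.int_and_string_filter_data) created in a setUp method that is not shown. A minimal sketch of fixture data consistent with the expected outputs in Examples #1 to #4 (an assumption, not the original fixtures):

import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal

def setUp(self):
    # Assumed fixture data; any frames with these column values in this
    # relative order would satisfy the assertions in Examples #1 to #4.
    self.int_filter_data = pd.DataFrame({'INTCOL': [10, -5, 2, 10, 10]})
    self.int_and_string_filter_data = pd.DataFrame(
        {'INTCOL': [10, 10, 10, 2], 'STRINGCOL': ['10', '10', '10', '2']})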
Example #2
def test_filter_one_empty_values_returns_empty_data_frame(self):
    filter_cols = ('INTCOL',)
    filter_values = ([],)
    result = filters.filter_on_column_value(
        self.int_and_string_filter_data, filter_cols, filter_values).reset_index(drop=True)
    aim = pd.DataFrame({'INTCOL': [], 'STRINGCOL': []})
    aim = aim.astype(dtype={'INTCOL': np.int64, 'STRINGCOL': str})
    assert_frame_equal(aim, result)
Example #3
def test_filter_two_cols_one_value_each_not_matching(self):
    filter_cols = ('INTCOL', 'STRINGCOL')
    filter_values = ([10], ['100'])
    result = filters.filter_on_column_value(
        self.int_and_string_filter_data, filter_cols, filter_values).reset_index(drop=True)
    aim = pd.DataFrame({'INTCOL': [], 'STRINGCOL': []})
    aim = aim.astype(dtype={'INTCOL': np.int64, 'STRINGCOL': str})
    assert_frame_equal(aim, result)
Example #4
def test_filter_just_one_of_two_cols(self):
    filter_cols = ('INTCOL',)
    filter_values = ([10],)
    result = filters.filter_on_column_value(
        self.int_and_string_filter_data, filter_cols, filter_values).reset_index(drop=True)
    aim = pd.DataFrame({
        'INTCOL': [10, 10, 10],
        'STRINGCOL': ['10', '10', '10']
    })
    assert_frame_equal(aim, result)
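All four tests exercise filters.filter_on_column_value, whose implementation is not shown here. A minimal sketch consistent with the behaviour asserted above (an assumption about the implementation, not the package's actual code):

def filter_on_column_value(data, filter_cols, filter_values):
    # For each (column, allowed values) pair, keep only rows whose value in
    # that column is in the allowed list; filters across columns are ANDed.
    # An empty allowed list therefore yields an empty frame, with the
    # original column dtypes preserved, matching Example #2.
    for column, values in zip(filter_cols, filter_values):
        data = data[data[column].isin(values)]
    return data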
Example #5
def static_table_xl(start_time, end_time, table_name, raw_data_location, select_columns=None, filter_cols=None,
                    filter_values=None):
    path_and_name = raw_data_location + '/' + defaults.names[table_name] + '.xls'
    print('Retrieving static table {}.'.format(table_name))
    if not os.path.isfile(path_and_name):
        print('Downloading data for table {}.'.format(table_name))
        downloader.download_xl(defaults.static_table_url[table_name], raw_data_location, path_and_name)
    xls = pd.ExcelFile(path_and_name)
    table = pd.read_excel(xls, 'Generators and Scheduled Loads', dtype=str)
    if select_columns is not None:
        table = table.loc[:, select_columns]
    if filter_cols is not None:
        table = filters.filter_on_column_value(table, filter_cols, filter_values)
    table = table.drop_duplicates(['DUID'])

    return table
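A hypothetical call, mirroring the pattern of the other fetch functions (the table name, directory, and column names are assumptions; start_time and end_time are accepted for interface consistency but unused in the body):

duid_info = static_table_xl(
    None, None,  # unused by this function
    'Generators and Scheduled Loads', 'C:/nem_data',
    select_columns=['DUID', 'Region'],      # assumed column names
    filter_cols=('Region',), filter_values=(['NSW1'],))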
Example #6
def static_table(start_time, end_time, table_name, raw_data_location, select_columns=None, filter_cols=None,
                 filter_values=None):
    print('Retrieving static table {}.'.format(table_name))
    path_and_name = raw_data_location + '/' + defaults.names[table_name]
    if not os.path.isfile(path_and_name):
        print('Downloading data for table {}.'.format(table_name))
        downloader.download_csv(defaults.static_table_url[table_name], raw_data_location, path_and_name)

    table = pd.read_csv(path_and_name, dtype=str,
                        names=defaults.table_columns[table_name])
    if select_columns is not None:
        table = table.loc[:, select_columns]
    # Strip whitespace from string columns (the str accessor skips NaN values).
    for column in table.select_dtypes(['object']).columns:
        table[column] = table[column].str.strip()

    if filter_cols is not None:
        table = filters.filter_on_column_value(table, filter_cols, filter_values)

    return table
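Example #8 below retrieves the 'VARIABLES_FCAS_4_SECOND' table through this function; a standalone call might look like this (the cache directory is an assumption; start and end times are unused here):

variable_types = static_table(
    None, None, 'VARIABLES_FCAS_4_SECOND', 'C:/nem_data',
    select_columns=['VARIABLENUMBER', 'VARIABLETYPE'])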
Example #7
def dynamic_data_compiler(start_time, end_time, table_name, raw_data_location, select_columns=None, filter_cols=None,
                          filter_values=None):
    print('Compiling data for table {}.'.format(table_name))
    # Generic setup common to all tables.
    if select_columns is None:
        select_columns = defaults.table_columns[table_name]

    # Pre-loop setup, done on a per-table-type basis.
    date_filter = processing_info_maps.filter[table_name]
    setup_function = processing_info_maps.setup[table_name]

    if setup_function is not None:
        start_time, end_time = setup_function(start_time, end_time)

    search_type = processing_info_maps.search_type[table_name]

    if search_type == 'all':
        start_search = defaults.nem_data_model_start_time
    elif search_type == 'start_to_end':
        start_search = start_time
    else:
        raise ValueError('Unrecognised search_type: {}'.format(search_type))

    start_time = datetime.strptime(start_time, '%Y/%m/%d %H:%M:%S')
    end_time = datetime.strptime(end_time, '%Y/%m/%d %H:%M:%S')
    start_search = datetime.strptime(start_search, '%Y/%m/%d %H:%M:%S')

    data_tables = dynamic_data_fetch_loop(start_search, start_time, end_time, table_name, raw_data_location,
                                          select_columns, date_filter, search_type)

    all_data = pd.concat(data_tables, sort=False)

    finalise_data = processing_info_maps.finalise[table_name]
    if finalise_data is not None:
        for function in finalise_data:
            all_data = function(all_data, start_time, table_name)

    if filter_cols is not None:
        all_data = filters.filter_on_column_value(all_data, filter_cols, filter_values)

    return all_data
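Example #8 below uses this function to pull the 4 second FCAS, dispatch unit scada, and interconnector data. A standalone call with filtering might look like this (the date range, cache directory, and DUID are assumptions; times must use the '%Y/%m/%d %H:%M:%S' format parsed above):

scada = dynamic_data_compiler(
    '2018/01/01 00:00:00', '2018/01/02 00:00:00',
    'DISPATCH_UNIT_SCADA', 'C:/nem_data',
    filter_cols=('DUID',), filter_values=(['AGLHAL'],))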
Example #8
def fcas4s_scada_match(start_time,
                       end_time,
                       table_name,
                       raw_data_location,
                       select_columns=None,
                       filter_cols=None,
                       filter_values=None):

    # Pull in the 4 second fcas data.
    table_name_fcas4s = 'FCAS_4_SECOND'
    fcas4s = data_fetch_methods.dynamic_data_compiler(start_time, end_time,
                                                      table_name_fcas4s,
                                                      raw_data_location)
    # Pull in the 4 second fcas variable types.
    table_name_variable_types = 'VARIABLES_FCAS_4_SECOND'
    fcas4s_variable_types = data_fetch_methods.static_table(
        start_time, end_time, table_name_variable_types, raw_data_location)

    # Select the variable types that measure MW on an interconnector and Gen_MW from a dispatch unit.
    fcas4s_variable_types = fcas4s_variable_types[
        fcas4s_variable_types['VARIABLETYPE'].isin(['MW', 'Gen_MW'])]
    fcas4s = fcas4s[fcas4s['VARIABLENUMBER'].isin(
        fcas4s_variable_types['VARIABLENUMBER'])]

    # Select just the fcas 4 second data variable columns that we need.
    fcas4s = fcas4s.loc[:, ('TIMESTAMP', 'ELEMENTNUMBER', 'VALUE')]

    # Convert the fcas MW measured values to numeric type.
    fcas4s['VALUE'] = pd.to_numeric(fcas4s['VALUE'])

    # Keep only the measurements from the first 20 seconds of each 5 minute
    # interval, then truncate their timestamps to the start of the interval,
    # i.e. round down to the nearest 5 minutes.
    fcas4s = fcas4s[(fcas4s['TIMESTAMP'].dt.minute.isin(list(range(0, 60, 5))))
                    & (fcas4s['TIMESTAMP'].dt.second < 20)]
    fcas4s['TIMESTAMP'] = fcas4s['TIMESTAMP'].apply(
        lambda dt: datetime(dt.year, dt.month, dt.day, dt.hour, dt.minute))

    # Pull in the dispatch unit scada data.
    table_name_scada = 'DISPATCH_UNIT_SCADA'
    scada = data_fetch_methods.dynamic_data_compiler(start_time, end_time,
                                                     table_name_scada,
                                                     raw_data_location)
    scada['SETTLEMENTDATE'] = scada['SETTLEMENTDATE'] - timedelta(minutes=5)
    scada = scada.loc[:, ('SETTLEMENTDATE', 'DUID', 'SCADAVALUE')]
    scada.columns = ['SETTLEMENTDATE', 'MARKETNAME', 'SCADAVALUE']
    scada['SCADAVALUE'] = pd.to_numeric(scada['SCADAVALUE'])

    # Pull in the interconnector scada data and use the intervention records where they exist.
    table_name_inter_flow = 'DISPATCHINTERCONNECTORRES'
    inter_flows = data_fetch_methods.dynamic_data_compiler(
        start_time, end_time, table_name_inter_flow, raw_data_location)
    inter_flows['METEREDMWFLOW'] = pd.to_numeric(inter_flows['METEREDMWFLOW'])
    inter_flows = inter_flows.sort_values('INTERVENTION')
    inter_flows = inter_flows.groupby(['SETTLEMENTDATE', 'INTERCONNECTORID'],
                                      as_index=False).last()
    inter_flows = inter_flows.loc[:, ('SETTLEMENTDATE', 'INTERCONNECTORID',
                                      'METEREDMWFLOW')]
    inter_flows['SETTLEMENTDATE'] = inter_flows['SETTLEMENTDATE'] - timedelta(
        minutes=5)
    inter_flows.columns = ['SETTLEMENTDATE', 'MARKETNAME', 'SCADAVALUE']

    # Combine scada data from interconnectors and dispatch units.
    scada_elements = pd.concat([scada, inter_flows], sort=False)

    # Merge the fcas and scada data based on timestamp; this joins every scada
    # element to every fcas element in the same interval so that they can be
    # compared.
    profile_comp = pd.merge(fcas4s,
                            scada_elements,
                            'inner',
                            left_on='TIMESTAMP',
                            right_on='SETTLEMENTDATE')

    # Calculate the error between each measurement.
    profile_comp['ERROR'] = profile_comp['VALUE'] - profile_comp['SCADAVALUE']
    profile_comp['ERROR'] = profile_comp['ERROR'].abs()

    # Choose the fcas value that best matches the scada value during each 5 min interval.
    profile_comp = profile_comp.sort_values('ERROR')
    error_comp = profile_comp.groupby(
        ['MARKETNAME', 'ELEMENTNUMBER', 'TIMESTAMP'], as_index=False).first()

    # Aggregate the error to compare each potential scada and fcas element match.
    error_comp = error_comp.groupby(['MARKETNAME', 'ELEMENTNUMBER'],
                                    as_index=False).sum()

    # Sort the comparisons based on aggregate error.
    error_comp = error_comp.sort_values('ERROR')

    # Exclude matches where the scada value is zero, as a match there may only
    # occur because both the fcas and scada elements showed no dispatch.
    best_matches_scada = error_comp[error_comp['SCADAVALUE'].abs() > 0]
    # Drop duplicates of element numbers and scada element names, keeping the
    # record for each with the least error.
    best_matches_scada = best_matches_scada.drop_duplicates('ELEMENTNUMBER',
                                                            keep='first')
    best_matches_scada = best_matches_scada.drop_duplicates('MARKETNAME',
                                                            keep='first')

    # Sort the matches by element number (numeric sort, then convert back to string).
    best_matches_scada['ELEMENTNUMBER'] = pd.to_numeric(
        best_matches_scada['ELEMENTNUMBER'])
    best_matches_scada = best_matches_scada.sort_values('ELEMENTNUMBER')
    best_matches_scada['ELEMENTNUMBER'] = best_matches_scada[
        'ELEMENTNUMBER'].astype(str)

    # Express the error as a fraction of the scada value.
    best_matches_scada['ERROR'] = best_matches_scada[
        'ERROR'] / best_matches_scada['SCADAVALUE']

    # Drop matches with an absolute error fraction greater than 1, i.e. more than 100%.
    best_matches_scada = best_matches_scada[(best_matches_scada['ERROR'] < 1) &
                                            (best_matches_scada['ERROR'] > -1)]

    best_matches_scada = best_matches_scada.loc[:, ('ELEMENTNUMBER',
                                                    'MARKETNAME', 'ERROR')]

    if select_columns is not None:
        best_matches_scada = best_matches_scada.loc[:, select_columns]

    if filter_cols is not None:
        best_matches_scada = filters.filter_on_column_value(
            best_matches_scada, filter_cols, filter_values)

    return best_matches_scada
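A hypothetical call (the date range and cache directory are assumptions; table_name is accepted for interface consistency but not used in the body):

matches = fcas4s_scada_match(
    '2018/01/01 00:00:00', '2018/01/01 04:00:00',
    'FCAS_4_SECOND', 'C:/nem_data',
    select_columns=['ELEMENTNUMBER', 'MARKETNAME'])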