Example #1
0
def stay_imputation_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Runs the stay imputation steps of the ips process
    Params       : run_id - the id for the current run.
    Returns      : NA
    """

    # Load configuration variables
    config = ServicesConfiguration().get_stay_imputation()

    # Populate Survey Data For Stay Imputation
    idm.populate_survey_data_for_step(run_id, config)

    # Copy Stay Imp PVs For Survey Data
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply Stay Imp PVs On Survey Data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_STAY_SPV',
                              in_id='serial')

    # Update Survey Data with Stay Imp PV Output
    idm.update_survey_data_with_step_pv_output(config)

    # Retrieve data from SQL
    survey_data = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Calculate Stay Imputation
    survey_data_out = calculate_stay_imputation.do_ips_stay_imputation(
        survey_data, var_serial='SERIAL', num_levels=1, measure='mean')

    # Insert data to SQL
    db.insert_dataframe_into_table(config["temp_table"], survey_data_out)

    # Update Survey Data With Stay Imp Results
    idm.update_survey_data_with_step_results(config)

    # Store Survey Data With Stay Imp Results
    idm.store_survey_data_with_step_results(run_id, config)
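Every step in this module follows the same populate / copy-PVs / apply-PVs / calculate / store sequence seen above. A minimal sketch of that recurring shape, with the step-specific pieces passed in as callables; the generic_step helper and its parameter names are illustrative, not part of the codebase:

def generic_step(run_id, get_config, apply_pvs, calculate):
    # Illustrative skeleton only: get_config, apply_pvs and calculate stand in
    # for the step-specific configuration, PV application and calculation.
    config = get_config()
    idm.populate_survey_data_for_step(run_id, config)
    idm.copy_step_pvs_for_survey_data(run_id, config)
    apply_pvs()  # e.g. process_variables.process(dataset='survey', ...)
    idm.update_survey_data_with_step_pv_output(config)
    survey_data = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    survey_data_out = calculate(survey_data)
    db.insert_dataframe_into_table(config["temp_table"], survey_data_out)
    idm.update_survey_data_with_step_results(config)
    idm.store_survey_data_with_step_results(run_id, config)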
Example #2
0
def town_stay_expenditure_imputation_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Runs the town stay expenditure imputation steps of the ips process
    Params       : run_id - the id for the current run.
    Returns      : NA
    """

    # Load configuration variables
    config = ServicesConfiguration().get_town_and_stay_expenditure()

    # Populate Survey Data For TSE Imputation
    idm.populate_survey_data_for_step(run_id, config)

    # Copy TSE Imputation PVs For Survey Data
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply TSE Imputation PVs On Survey Data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_TOWN_STAY_SPV',
                              in_id='serial')

    # Update Survey Data with TSE Imputation PV Output
    idm.update_survey_data_with_step_pv_output(config)

    # Retrieve data from SQL
    survey_data = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Calculate TSE Imputation
    survey_data_out = calculate_town_and_stay_expenditure.do_ips_town_exp_imp(
        survey_data, var_serial="SERIAL", var_final_wt="FINAL_WT")

    # Insert data to SQL
    db.insert_dataframe_into_table(config["temp_table"], survey_data_out)

    # Update Survey Data With TSE Imputation Results
    idm.update_survey_data_with_step_results(config)

    # Store Survey Data With TSE Imputation Results
    idm.store_survey_data_with_step_results(run_id, config)
Example #3
0
def final_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Runs the final weight steps of the ips process
    Params       : run_id - the id for the current run.
    Returns      : NA
    """

    # Load configuration variables
    config = ServicesConfiguration().get_final_weight()

    # Populate Survey Data For Final Wt
    idm.populate_survey_data_for_step(run_id, config)

    # Retrieve data from SQL
    survey_data = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Calculate Final Weight
    survey_data_out, summary_data_out = \
        calculate_final_weight.do_ips_final_wt_calculation(survey_data,
                                                           serial_num='SERIAL',
                                                           shift_weight='SHIFT_WT',
                                                           non_response_weight='NON_RESPONSE_WT',
                                                           min_weight='MINS_WT',
                                                           traffic_weight='TRAFFIC_WT',
                                                           unsampled_weight='UNSAMP_TRAFFIC_WT',
                                                           imbalance_weight='IMBAL_WT',
                                                           final_weight='FINAL_WT')

    # Insert data to SQL
    db.insert_dataframe_into_table(config["temp_table"], survey_data_out)
    db.insert_dataframe_into_table(config["sas_ps_table"], summary_data_out)

    # Update Survey Data With Final Wt Results
    idm.update_survey_data_with_step_results(config)

    # Store Survey Data With Final Wt Results
    idm.store_survey_data_with_step_results(run_id, config)

    # Store Final Weight Summary
    idm.store_step_summary(run_id, config)
Example #4
0
def import_survey_data_into_database(survey_data_path, run_id):
    """
    Author       : (pinched from) Thomas Mahoney (modified by) Elinor Thorne
    Date         : (26/04/2018) 23/08/2018
    Purpose      : Loads the survey data into the 'SURVEY_SUBSAMPLE' table on the connected database.
    Parameters   : survey_data_path - path to the file containing the survey data.
                   run_id - the id for the current run.
    Returns      : NA
    Requirements : Datafile is of type '.csv', '.pkl' or '.sas7bdat'
    """

    start_time = time.time()

    # Check the survey_data_path's suffix and read the file with the appropriate method.
    # TODO: Swap with a reusable function. fares_imputation, imbalance_weight,
    # rail_imputation, spend_imputation, town_and_stay and unsampled_weight
    # originally read with pd.read_csv(survey_data_path, encoding='ANSI', dtype=str);
    # final_weight used pd.read_csv(survey_data_path).
    df_survey_data = pd.read_csv(survey_data_path, engine='python')

    # Add the generated run id to the dataset.
    df_survey_data['RUN_ID'] = pd.Series(run_id, index=df_survey_data.index)

    # Cleanses Survey Subsample table.
    db.delete_from_table(idm.SURVEY_SUBSAMPLE_TABLE, 'RUN_ID', '=', run_id)

    # Insert the imported data into the survey_subsample table on the database.
    # fast=False causes arithmetic error
    db.insert_dataframe_into_table(idm.SURVEY_SUBSAMPLE_TABLE, df_survey_data)

    # Print Import runtime to record performance.
    print("Import runtime: {}".format(
        time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))))
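The docstring's Requirements name three input formats, and the TODO above asks for a reusable reader. A possible shape for that helper, assuming suffix-based dispatch; the name read_survey_file is made up, while pd.read_csv, pd.read_pickle and pd.read_sas are the standard pandas readers:

import pandas as pd

def read_survey_file(survey_data_path: str) -> pd.DataFrame:
    # Dispatch on the file suffix, per the '.csv', '.pkl', '.sas7bdat' requirement.
    if survey_data_path.endswith('.csv'):
        return pd.read_csv(survey_data_path, engine='python')
    if survey_data_path.endswith('.pkl'):
        return pd.read_pickle(survey_data_path)
    if survey_data_path.endswith('.sas7bdat'):
        return pd.read_sas(survey_data_path)
    raise ValueError('Unsupported file type: ' + survey_data_path)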
Example #5
0
def traffic_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Runs the traffic weight steps of the ips process
    Params       : run_id - the id for the current run.
    Returns      : NA
    """

    # Load configuration variables
    config = ServicesConfiguration().get_traffic_weight()

    # Populate Survey Data For Traffic Wt
    idm.populate_survey_data_for_step(run_id, config)

    # Populate Traffic Data
    idm.populate_step_data(run_id, config)

    # Copy Traffic Wt PVs For Survey Data
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply Traffic Wt PV On Survey Data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_TRAFFIC_SPV',
                              in_id='serial')

    # Update Survey Data with Traffic Wt PV Output
    idm.update_survey_data_with_step_pv_output(config)

    # Copy Traffic Wt PVs For Traffic Data
    idm.copy_step_pvs_for_step_data(run_id, config)

    # Apply Traffic Wt PV On Traffic Data
    process_variables.process(dataset='traffic',
                              in_table_name='SAS_TRAFFIC_DATA',
                              out_table_name='SAS_TRAFFIC_PV',
                              in_id='REC_ID')

    # Update Traffic Data With Traffic Wt PV Output
    idm.update_step_data_with_step_pv_output(config)

    # Retrieve data from SQL
    survey_data = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    traffic_data = db.get_table_values(config["data_table"])

    # Calculate Traffic Weight
    output_data, summary_data = do_ips_trafweight_calculation_with_R(
        survey_data, traffic_data)

    # Insert data to SQL
    db.insert_dataframe_into_table(config["temp_table"], output_data)
    db.insert_dataframe_into_table(config["sas_ps_table"], summary_data)

    # Update Survey Data With Traffic Wt Results
    idm.update_survey_data_with_step_results(config)

    # Store Survey Data With Traffic Wt Results
    idm.store_survey_data_with_step_results(run_id, config)

    # Store Traffic Wt Summary
    idm.store_step_summary(run_id, config)
Example #6
0
def convert_dataframe_to_sql_format(table_name, dataframe):
    db.insert_dataframe_into_table(table_name, dataframe)
    return db.get_table_values(table_name)
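The write-then-read round trip exists so the returned frame carries whatever types the database assigns, rather than the dtypes pandas inferred in memory. A hypothetical usage; the table name here is assumed for illustration:

import pandas as pd

df = pd.DataFrame({'SERIAL': [1001, 1002], 'TRAFFIC_WT': [1.25, 0.75]})
df_db = convert_dataframe_to_sql_format('SAS_TRAFFIC_WT', df)
print(df_db.dtypes)  # dtypes now reflect the table's column types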
Example #7
0
def do_ips_trafweight_calculation_with_R(survey_data, trtotals):
    # clear the auxiliary table
    db.delete_from_table(SURVEY_TRAFFIC_AUX_TABLE)

    # drop aux tables and r created tables
    # cf.drop_table(POP_PROWVEC_TABLE)
    # cf.drop_table(R_TRAFFIC_TABLE)
    db.clear_memory_table(R_TRAFFIC_TABLE)
    db.clear_memory_table(POP_PROWVEC_TABLE)

    # inserts into survey_traffic_aux a.k.a. SURVEY_TRAFFIC_AUX_TABLE
    df_r_ges_input_imported = r_survey_input(survey_data)
    # inserts into POP_PROWVEC_TABLE
    df_mod_pop_totals_import = r_population_input(survey_data, trtotals)

    run_r_ges_script()

    # grab the data from the SQL table and return
    output_final_import = db.get_table_values(R_TRAFFIC_TABLE)

    ret_out = output_final_import[[SERIAL, TRAFFIC_WT]]

    # sort
    ret_out_sorted = ret_out.sort_values(SERIAL)
    ret_out_final = ret_out_sorted.reset_index(drop=True)

    # keep an unrounded copy of the df for generate_ips_tw_summary
    df_ret_out_final_not_rounded = ret_out_final.copy()

    # Round the weights to 3dp
    ret_out_final[TRAFFIC_WT] = ret_out_final[TRAFFIC_WT].apply(
        lambda x: round(x, 3))

    # #################################
    # Generate the summary table
    # #################################

    # perform calculation
    survey_data[TRAFFIC_DESIGN_WEIGHT_COLUMN] = survey_data[
        var_shiftWeight] * survey_data[var_NRWeight] * survey_data[
            var_minWeight]

    # Summarise the population totals over the strata
    df_PopTotals = trtotals.sort_values(STRATA)

    # Re-index the data frame
    df_PopTotals.index = range(df_PopTotals.shape[0])

    df_popTotals = df_PopTotals.groupby(STRATA)[TRAFFIC_TOTAL_COLUMN] \
        .agg([(TRAFFIC_TOTAL_COLUMN, 'sum')]) \
        .reset_index()

    # ensure unrounded df_ret_out_final_not_rounded is supplied
    df_summary_merge_sum_traftot = generate_ips_tw_summary(
        survey_data, df_ret_out_final_not_rounded, var_serialNum, GWeightVar,
        df_popTotals, minCountThresh)

    # update the output SQL tables
    db.insert_dataframe_into_table(OUTPUT_TABLE_NAME, ret_out_final)
    db.insert_dataframe_into_table(SUMMARY_TABLE_NAME,
                                   df_summary_merge_sum_traftot)

    return ret_out_final, df_summary_merge_sum_traftot
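For reference, the strata summary in the middle of this function reduces the population totals to one row per stratum. A toy illustration of that groupby pattern; the literal column names below are invented stand-ins for the STRATA and TRAFFIC_TOTAL_COLUMN constants used above:

import pandas as pd

trtotals_demo = pd.DataFrame({'STRATUM': ['A', 'A', 'B'],
                              'TRAFFICTOTAL': [10.0, 5.0, 7.0]})
pop_totals = (trtotals_demo.groupby('STRATUM')['TRAFFICTOTAL']
              .agg([('TRAFFICTOTAL', 'sum')])
              .reset_index())
# pop_totals has one row per stratum: A -> 15.0, B -> 7.0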
Example #8
0
    def insert(data: dict):
        data_frame = pd.DataFrame(data, index=[0])
        db.insert_dataframe_into_table(table, data_frame, if_exists)
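table and if_exists resolve in an enclosing scope here, so this reads like the inner function of a factory. A hedged reconstruction of that enclosing shape; make_inserter and its default are assumptions, not code from the repository:

import pandas as pd

def make_inserter(table: str, if_exists: str = 'append'):
    def insert(data: dict):
        # Build a one-row frame from a dict of scalars, hence index=[0].
        data_frame = pd.DataFrame(data, index=[0])
        db.insert_dataframe_into_table(table, data_frame, if_exists)
    return insert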
Example #9
0
def r_survey_input(survey_input: pd.DataFrame) -> pd.DataFrame:
    """
    Author       : David Powell
    Date         : 07/06/2018
    Purpose      : Creates input data that feeds into the R GES weighting
    Parameters   : survey_input - a data frame containing the survey data for
                   the processing month
    Returns      : A data frame containing the information needed for GES weighting
    Requirements : NA
    Dependencies : NA
    """

    # Load survey Data
    df_survey_input = survey_input

    # Sort input values
    sort1 = ['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART']

    df_survey_input_sorted = df_survey_input.sort_values(sort1)

    # Cleanse data
    df_survey_input_sorted.UNSAMP_REGION_GRP_PV.fillna(value=0, inplace=True)
    df_survey_input_sorted = df_survey_input_sorted[
        ~df_survey_input_sorted['UNSAMP_PORT_GRP_PV'].isnull()]
    df_survey_input_sorted = df_survey_input_sorted[
        ~df_survey_input_sorted['ARRIVEDEPART'].isnull()]

    # Create lookup. Group by and aggregate
    # lookup_dataframe = df_survey_input_copy
    lookup_dataframe = df_survey_input_sorted

    lookup_dataframe["count"] = ""
    lookup_dataframe = lookup_dataframe.groupby(
        ['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART']).agg({
            "count":
            'count'
        }).reset_index()

    # Cleanse data
    lookup_dataframe = lookup_dataframe.drop(["count"], axis=1)
    lookup_dataframe["T1"] = range(len(lookup_dataframe))
    lookup_dataframe["T1"] = lookup_dataframe["T1"] + 1

    # Merge lookup data in to source dataframe
    df_aux_variables = pd.merge(
        df_survey_input_sorted,
        lookup_dataframe,
        on=['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART'],
        how='left')

    # Create traffic design weight used within GES weighting
    values = df_aux_variables.SHIFT_WT * df_aux_variables.NON_RESPONSE_WT * df_aux_variables.MINS_WT * df_aux_variables.TRAFFIC_WT
    df_aux_variables['OOHDesignWeight'] = values
    df_aux_variables = df_aux_variables.sort_values(['SERIAL'])

    # Create input to pass into GES weighting
    df_r_ges_input = df_aux_variables[~df_aux_variables['T1'].isnull()]
    df_r_ges_input = df_r_ges_input[[
        'SERIAL', 'ARRIVEDEPART', 'PORTROUTE', 'SHIFT_WT', 'NON_RESPONSE_WT',
        'MINS_WT', 'UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV',
        'OOHDesignWeight', 'T1'
    ]]

    # Coerce the region grouping to numeric - added to match SAS output
    df_r_ges_input.UNSAMP_REGION_GRP_PV = pd.to_numeric(
        df_r_ges_input.UNSAMP_REGION_GRP_PV, errors='coerce')

    db.insert_dataframe_into_table("survey_unsamp_aux", df_r_ges_input)

    return df_aux_variables.drop(columns=['T1', 'OOHDesignWeight'])
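The empty-string "count" column above is only scaffolding for counting group members. An equivalent, more direct construction of the T1 lookup, shown as a sketch against the same dataframe rather than the code the pipeline actually runs:

keys = ['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART']
lookup = df_survey_input_sorted.groupby(keys).size().reset_index(name='n')[keys]
lookup['T1'] = range(1, len(lookup) + 1)  # T1 numbers the groups 1..n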
Example #10
0
def unsampled_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Runs the unsampled weight steps of the ips process
    Params       : run_id - the id for the current run.
    Returns      : None
    """

    # Load configuration variables
    config = ServicesConfiguration().get_unsampled_weight()

    # Populate Survey Data For Unsampled Wt
    idm.populate_survey_data_for_step(run_id, config)

    # Populate Unsampled Data
    idm.populate_step_data(run_id, config)

    # Copy Unsampled Wt PVs For Survey Data
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply Unsampled Wt PV On Survey Data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_UNSAMPLED_OOH_SPV',
                              in_id='serial')

    # Update Survey Data with Unsampled Wt PV Output
    idm.update_survey_data_with_step_pv_output(config)

    # Copy Unsampled Wt PVs For Unsampled Data
    idm.copy_step_pvs_for_step_data(run_id, config)

    # Apply Unsampled Wt PV On Unsampled Data
    process_variables.process(dataset='unsampled',
                              in_table_name='SAS_UNSAMPLED_OOH_DATA',
                              out_table_name='SAS_UNSAMPLED_OOH_PV',
                              in_id='REC_ID')

    # Update Unsampled Data With PV Output
    idm.update_step_data_with_step_pv_output(config)

    # Retrieve data from SQL
    survey_data = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    unsampled_data = db.get_table_values(config["data_table"])

    # Calculate Unsampled Weight
    output_data, summary_data = calculate_unsampled_weight.do_ips_unsampled_weight_calculation(
        df_surveydata=survey_data,
        serial_num='SERIAL',
        shift_weight='SHIFT_WT',
        nr_weight='NON_RESPONSE_WT',
        min_weight='MINS_WT',
        traffic_weight='TRAFFIC_WT',
        out_of_hours_weight="UNSAMP_TRAFFIC_WT",
        df_ustotals=unsampled_data,
        min_count_threshold=30)

    # Insert data to SQL
    db.insert_dataframe_into_table(config["temp_table"], output_data)
    db.insert_dataframe_into_table(config["sas_ps_table"], summary_data)

    # Update Survey Data With Unsampled Wt Results
    idm.update_survey_data_with_step_results(config)

    # Store Survey Data With Unsampled Wt Results
    idm.store_survey_data_with_step_results(run_id, config)

    # Store Unsampled Weight Summary
    idm.store_step_summary(run_id, config)
Example #11
0
def test_update_survey_data_with_step_results(step_name, temp_table,
                                              results_columns, prefix,
                                              database_connection):
    """
    # This test is parameterised. The values for the arguments of this test function
    # are taken from the parameters specified in pytest.mark.parametrize
    # see https://docs.pytest.org/en/latest/parametrize.html
    """

    # step_config and variables
    step_config = {
        "name": step_name,
        "temp_table": temp_table,
        "results_columns": results_columns
    }

    folder = '/update_survey_data_with_step_results'

    # Cleanse and set up test data/tables
    db.delete_from_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    sas_survey_subsample_input = pd.read_csv(
        TEST_DATA_DIR + folder + prefix +
        'sas_survey_subsample_test_input.csv',
        dtype=object)
    db.insert_dataframe_into_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE,
                                   sas_survey_subsample_input,
                                   database_connection,
                                   fast=False)

    db.delete_from_table(step_config["temp_table"])
    sas_shift_wt_input = pd.read_csv(TEST_DATA_DIR + folder + prefix +
                                     'temp_table_test_input.csv',
                                     dtype=object)
    db.insert_dataframe_into_table(step_config["temp_table"],
                                   sas_shift_wt_input,
                                   database_connection,
                                   fast=False)

    # Run function
    idm.update_survey_data_with_step_results(database_connection, step_config)

    # Get and format results
    results = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    results.to_csv(TEST_DATA_DIR + folder + prefix + 'actual_results.csv',
                   index=False)
    results = pd.read_csv(TEST_DATA_DIR + folder + prefix +
                          'actual_results.csv',
                          dtype=object)
    test_results = pd.read_csv(TEST_DATA_DIR + folder + prefix +
                               'expected_results.csv',
                               dtype=object)

    results.sort_values(by=["SERIAL"], inplace=True)
    results.index = range(0, len(results))

    test_results.sort_values(by=["SERIAL"], inplace=True)
    test_results.index = range(0, len(test_results))

    assert_frame_equal(results, test_results, check_dtype=False)

    # Assert temp tables had been cleansed in function
    result = db.get_table_values(step_config['temp_table'])
    assert len(result) == 0
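The docstring above points at pytest.mark.parametrize. An illustrative decorator showing how such a test is typically driven; the parameter tuples below are invented examples, not the module's real parameter list:

import pytest

@pytest.mark.parametrize('step_name, temp_table, results_columns, prefix', [
    ('SHIFT_WEIGHT', 'SAS_SHIFT_WT', ['[SHIFT_WT]'], '/shift_wt_'),
    ('TRAFFIC_WEIGHT', 'SAS_TRAFFIC_WT', ['[TRAFFIC_WT]'], '/traffic_wt_'),
])
def test_update_survey_data_with_step_results(step_name, temp_table,
                                              results_columns, prefix,
                                              database_connection):
    ...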
Example #12
0
def test_store_step_summary(database_connection):
    # step_config and variables
    step_config = {
        "ps_table":
        "PS_SHIFT_DATA",
        "sas_ps_table":
        "SAS_PS_SHIFT_DATA",
        "ps_columns": [
            "[RUN_ID]", "[SHIFT_PORT_GRP_PV]", "[ARRIVEDEPART]",
            "[WEEKDAY_END_PV]", "[AM_PM_NIGHT_PV]", "[MIGSI]",
            "[POSS_SHIFT_CROSS]", "[SAMP_SHIFT_CROSS]", "[MIN_SH_WT]",
            "[MEAN_SH_WT]", "[MAX_SH_WT]", "[COUNT_RESPS]", "[SUM_SH_WT]"
        ]
    }
    run_id = 'shift-wt-idm-test'
    folder = '/store_step_summary'

    # Set up test data/tables
    test_ps_data = pd.read_csv(TEST_DATA_DIR + folder +
                               '/shift_wt_sas_ps_shift_data_test_input.csv')
    db.insert_dataframe_into_table(step_config["sas_ps_table"], test_ps_data,
                                   database_connection)

    # Run function return results
    idm.store_step_summary(run_id, database_connection, step_config)
    sql = """
    SELECT * FROM {}
    WHERE RUN_ID = '{}'
    """.format(step_config["ps_table"], run_id)
    results = pd.read_sql(sql, database_connection)
    results.to_csv(TEST_DATA_DIR + folder + '/shift_wt_actual_results.csv',
                   index=False)

    # Get and format results
    results = pd.read_csv(TEST_DATA_DIR + folder +
                          '/shift_wt_actual_results.csv',
                          dtype=object)
    test_results = pd.read_csv(TEST_DATA_DIR + folder +
                               '/shift_wt_expected_results.csv',
                               dtype=object)

    results.sort_values(by=[
        'SHIFT_PORT_GRP_PV', 'ARRIVEDEPART', 'WEEKDAY_END_PV', 'AM_PM_NIGHT_PV'
    ],
                        inplace=True)
    results.index = range(0, len(results))

    test_results.sort_values(by=[
        'SHIFT_PORT_GRP_PV', 'ARRIVEDEPART', 'WEEKDAY_END_PV', 'AM_PM_NIGHT_PV'
    ],
                             inplace=True)
    test_results.index = range(0, len(test_results))

    assert_frame_equal(results, test_results, check_dtype=False)

    # Assert temp tables had been cleansed in function
    results = db.get_table_values(step_config['sas_ps_table'])
    assert len(results) == 0

    # Cleanse test inputs
    db.delete_from_table(step_config['ps_table'], 'RUN_ID', '=', run_id)
Example #13
0
def test_store_survey_data_with_step_results(step_name, nullify_pvs, ps_table,
                                             prefix, database_connection):
    """
    # This test is parameterised. The values for the arguments of this test function
    # are taken from the parameters specified in pytest.mark.parametrize
    # see https://docs.pytest.org/en/latest/parametrize.html
    """

    # step_config and variables
    step_config = {
        "name": step_name,
        "nullify_pvs": nullify_pvs,
        "ps_table": ps_table
    }
    run_id = 'store_survey_data_test'
    folder = '/store_survey_data_with_step_results'
    applicable_ps_tables = [
        "SHIFT_WEIGHT", "NON_RESPONSE", "MINIMUMS_WEIGHT", "TRAFFIC_WEIGHT",
        "UNSAMPLED_WEIGHT", "IMBALANCE_WEIGHT", "FINAL_WEIGHT"
    ]

    # Cleanse and delete test inputs
    db.delete_from_table(idm.SURVEY_SUBSAMPLE_TABLE, 'RUN_ID', '=', run_id)
    db.delete_from_table(step_config['ps_table'], 'RUN_ID', '=', run_id)

    # Set up records in SURVEY_SUBSAMPLE with above run_id
    survey_subsample_input = pd.read_csv(TEST_DATA_DIR + folder + prefix +
                                         'survey_subsample_test_input.csv',
                                         dtype=object)
    db.insert_dataframe_into_table(idm.SURVEY_SUBSAMPLE_TABLE,
                                   survey_subsample_input,
                                   database_connection,
                                   fast=False)

    # Set up records in SAS_SURVEY_SUBSAMPLE with above run_id
    sas_survey_subsample_input = pd.read_csv(TEST_DATA_DIR + folder + prefix +
                                             'sss_test_input.csv',
                                             dtype=object)
    db.insert_dataframe_into_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE,
                                   sas_survey_subsample_input,
                                   database_connection,
                                   fast=False)

    # Set up records in ps_table with above run_id
    if step_name in applicable_ps_tables:
        ps_shift_data_input = pd.read_csv(TEST_DATA_DIR + folder + prefix +
                                          'summary_table_test_input.csv',
                                          dtype=object)
        db.insert_dataframe_into_table(step_config['ps_table'],
                                       ps_shift_data_input,
                                       database_connection,
                                       fast=False)

    # Run function
    idm.store_survey_data_with_step_results(run_id, database_connection,
                                            step_config)

    # Assert tables were cleansed by function
    if step_name in applicable_ps_tables:
        sql = """
            SELECT * FROM {}
            WHERE RUN_ID = '{}'""".format(step_config['ps_table'], run_id)
        cur = database_connection.cursor()
        result = cur.execute(sql).fetchone()
        assert result is None

    result = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    assert len(result) == 0

    # Retrieve results produced by function
    sql = """
    SELECT * FROM {}
    WHERE RUN_ID = '{}'
    """.format(idm.SURVEY_SUBSAMPLE_TABLE, run_id)
    results = pd.read_sql(sql, database_connection)
    results.to_csv(TEST_DATA_DIR + folder + prefix + 'actual_results.csv',
                   index=False)

    # Get and format results
    results = pd.read_csv(TEST_DATA_DIR + folder + prefix +
                          'actual_results.csv',
                          dtype=object)
    test_results = pd.read_csv(TEST_DATA_DIR + folder + prefix +
                               'expected_result.csv',
                               dtype=object)

    results.sort_values(by=["SERIAL"], inplace=True)
    results.index = range(0, len(results))

    test_results.sort_values(by=["SERIAL"], inplace=True)
    test_results.index = range(0, len(test_results))

    assert_frame_equal(results, test_results, check_dtype=False)
Example #14
0
    def test_update_step_data_with_step_pv_output(self, database_connection):
        # step_config and variables
        step_config = {
            "pv_columns2":
            ["[SHIFT_PORT_GRP_PV]", "[WEEKDAY_END_PV]", "[AM_PM_NIGHT_PV]"],
            "pv_table":
            "SAS_SHIFT_PV",
            "data_table":
            "SAS_SHIFT_DATA",
            "temp_table":
            "SAS_SHIFT_WT",
            "sas_ps_table":
            "SAS_PS_SHIFT_DATA"
        }

        # Set up test data/tables
        test_shift_pv_data = pd.read_csv(
            UPDATE_STEP_DATA_WITH_STEP_PV_OUTPUT_PATH +
            'test_shift_pv_data.csv')

        # Get rec_id and amend test dataframe
        rec_id = self.get_rec_id("MAX", step_config["data_table"],
                                 database_connection)
        test_shift_pv_data = self.amend_rec_id(test_shift_pv_data,
                                               rec_id,
                                               ascend=False)

        db.insert_dataframe_into_table(step_config['pv_table'],
                                       test_shift_pv_data, database_connection)

        # run the test function
        idm.update_step_data_with_step_pv_output(database_connection,
                                                 step_config)

        # write the results back to csv, and read the csv back (this solves the data type matching issues)
        results = db.get_table_values(step_config['data_table'])

        temp_output = UPDATE_STEP_DATA_WITH_STEP_PV_OUTPUT_PATH + 'copy_update_step_data_with_step_pv_output.csv'
        results.to_csv(temp_output, index=False)
        results = pd.read_csv(temp_output)

        # get the unique REC_ID of the test_shift_pv_data
        rec_id = test_shift_pv_data["REC_ID"]

        # select all rows with matching updated rec_id
        results_1 = results[results['REC_ID'].isin(rec_id)]

        # create column list of pvs
        cols_temp = [
            item.replace("[", "") for item in step_config['pv_columns2']
        ]
        cols_to_keep = [item.replace("]", "") for item in cols_temp]
        cols_to_keep.insert(0, "REC_ID")

        # keep only the required columns from results_1 and importantly reset index and drop it
        results_2 = results_1[cols_to_keep]
        results_3 = results_2.reset_index(drop=True)

        # sort rows in test_shift_pv_data by REC_ID and importantly reset index and drop it
        sorted_test_shift_pv_data_1 = test_shift_pv_data.sort_values(
            by=['REC_ID'])
        sorted_test_shift_pv_data_2 = sorted_test_shift_pv_data_1.reset_index(
            drop=True)

        # check that the two dataframes match
        assert_frame_equal(results_3,
                           sorted_test_shift_pv_data_2,
                           check_names=False,
                           check_like=True,
                           check_dtype=False)

        # Assert temp tables had been cleanse in function
        results = db.get_table_values(step_config['pv_table'])
        assert len(results) == 0

        results = db.get_table_values(step_config['temp_table'])
        assert len(results) == 0

        results = db.get_table_values(idm.SAS_PROCESS_VARIABLES_TABLE)
        assert len(results) == 0

        results = db.get_table_values(step_config['sas_ps_table'])
        assert len(results) == 0
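Several tests above write results out to csv and immediately read the file back purely to push every column through read_csv's type inference. An in-memory equivalent that avoids the temporary file, offered as a sketch; the helper name is made up:

import io
import pandas as pd

def normalise_dtypes_via_csv(df: pd.DataFrame) -> pd.DataFrame:
    # Round-trip through csv text so dtypes match what read_csv would infer.
    buffer = io.StringIO()
    df.to_csv(buffer, index=False)
    buffer.seek(0)
    return pd.read_csv(buffer)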
Example #15
0
    def test_update_survey_data_with_step_pv_output_with_name_minimums_weight(
            self, database_connection):
        step_config = {
            'name':
            "MINIMUMS_WEIGHT",
            'spv_table':
            'SAS_MINIMUMS_SPV',
            "pv_columns": [
                "'MINS_FLAG_PV'", "'MINS_PORT_GRP_PV'", "'MINS_CTRY_GRP_PV'",
                "'MINS_NAT_GRP_PV'", "'MINS_CTRY_PORT_GRP_PV'"
            ],
            "temp_table":
            "SAS_MINIMUMS_WT",
            "sas_ps_table":
            "SAS_PS_MINIMUMS",
        }

        run_id = 'update-survey-pvs'

        # delete the data in the table so that we have no data in table for test
        db.delete_from_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
        db.delete_from_table(step_config['spv_table'])

        # read and insert into the database the survey data
        test_survey_data = pd.read_pickle(STEP_PV_OUTPUT_PATH +
                                          'update_survey_data_pvs.pkl')
        db.insert_dataframe_into_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE,
                                       test_survey_data, database_connection)

        # read and insert into the database the pvs
        test_nr_pv_data = pd.read_csv(STEP_PV_OUTPUT_PATH +
                                      'test_mw_pv_data.csv')
        db.insert_dataframe_into_table(step_config['spv_table'],
                                       test_nr_pv_data, database_connection)

        # call the test function
        idm.update_survey_data_with_step_pv_output(database_connection,
                                                   step_config)

        # get the newly updated table data write the results back to csv to read back and resolve formatting
        results = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

        # write the results back to csv, and read the csv back (this solves the data type matching issues)
        temp_output = STEP_PV_OUTPUT_PATH + 'update_survey_data_pvs_result_results.csv'
        results.to_csv(temp_output, index=False)
        results = pd.read_csv(temp_output)

        # remove the temporary written file
        os.remove(temp_output)

        # clean test data before actually testing results
        db.delete_from_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
        db.delete_from_table(step_config['spv_table'])

        # check ONLY updated pv columns are as expected in results, check NaN values are handled correctly
        stripped_pv_cols = [
            item.replace("'", "") for item in step_config['pv_columns']
        ]
        stripped_pv_cols.insert(0, 'SERIAL')  # add the SERIAL column
        test_dummy_1 = results[stripped_pv_cols]

        # get the SERIAL column values as a list, and select rows from updated data that match input data
        serials = test_nr_pv_data['SERIAL']
        test_dummy_2 = test_dummy_1[test_dummy_1['SERIAL'].isin(serials)]

        # check updated pv columns match the corresponding dummy values
        assert_frame_equal(test_dummy_2,
                           test_nr_pv_data,
                           check_dtype=False,
                           check_like=True)

        # check that the non-pv column values are still the same by dropping pv columns
        columns_to_drop = [
            item.replace("'", "") for item in step_config['pv_columns']
        ]
        new_res = results.drop(columns_to_drop, axis=1)
        new_test_res = test_survey_data.drop(columns_to_drop, axis=1)

        assert_frame_equal(new_res,
                           new_test_res,
                           check_dtype=False,
                           check_like=True)

        # check that spv_table has been deleted
        results_2 = db.get_table_values(step_config['spv_table'])
        assert len(results_2) == 0

        results = db.get_table_values(idm.SAS_PROCESS_VARIABLES_TABLE)
        assert len(results) == 0

        results = db.get_table_values(step_config["temp_table"])
        assert len(results) == 0

        results = db.get_table_values(step_config["sas_ps_table"])
        assert len(results) == 0
Example #16
0
def setup_pv():
    df = db.select_data('*', "PROCESS_VARIABLE_PY", 'RUN_ID', 'TEMPLATE')
    df['RUN_ID'] = run_id
    db.insert_dataframe_into_table('PROCESS_VARIABLE_PY', df)
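run_id is a free variable in setup_pv, so it only works where the surrounding module defines one. The same helper with the dependency made explicit, as a sketch:

def setup_pv(run_id):
    # Clone the TEMPLATE process variables and re-key them to this run.
    df = db.select_data('*', 'PROCESS_VARIABLE_PY', 'RUN_ID', 'TEMPLATE')
    df['RUN_ID'] = run_id
    db.insert_dataframe_into_table('PROCESS_VARIABLE_PY', df)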
Example #17
0
def test_populate_survey_data(name, delete_tables, nullify_pvs,
                              database_connection):
    # This test is parameterised. The values for the arguments of this test function
    # are taken from the parameters specified in pytest.mark.parametrize

    # Delete existing survey data from table where RUN_ID matches our test id
    db.delete_from_table(SURVEY_SUBSAMPLE_TABLE, 'RUN_ID', '=',
                         '9e5c1872-3f8e-4ae5-85dc-c67a602d011e')

    # Read the test data in from a csv file
    test_data = pd.read_csv(TEST_DATA_DIR +
                            "populate_survey_data/survey_subsample.csv",
                            dtype=object)

    # Insert the test data into survey_subsample table
    db.insert_dataframe_into_table(idm.SURVEY_SUBSAMPLE_TABLE, test_data)

    # Setup step configuration
    step_config = {
        'nullify_pvs': nullify_pvs,
        'name': name,
        'delete_tables': delete_tables
    }

    # Run test function
    idm.populate_survey_data_for_step(
        run_id='9e5c1872-3f8e-4ae5-85dc-c67a602d011e',
        conn=database_connection,
        step_configuration=step_config)

    # Get test_result from sas_survey_subsample table
    test_result = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Write the test results to a csv
    test_result.to_csv(TEST_DATA_DIR + "populate_survey_data/test_result.csv",
                       index=False)

    # Import the expected result (this result varies if the TRAFFIC_WEIGHT or UNSAMPLED_WEIGHT step is being tested)
    if name in ('TRAFFIC_WEIGHT', 'UNSAMPLED_WEIGHT'):
        expected_result = pd.read_csv(
            TEST_DATA_DIR +
            "populate_survey_data/populate_result_traffic_unsampled.csv")
    else:
        expected_result = pd.read_csv(
            TEST_DATA_DIR + "populate_survey_data/populate_result.csv")

    # Import the test result
    test_result = pd.read_csv(TEST_DATA_DIR +
                              "populate_survey_data/test_result.csv")

    # Sort the values by SERIAL
    expected_result = expected_result.sort_values(by='SERIAL')
    test_result = test_result.sort_values(by='SERIAL')

    # Reset the dataframe's indexes so correct rows are compared
    expected_result.index = range(0, len(expected_result))
    test_result.index = range(0, len(test_result))

    # Check all deleted tables are empty
    for table in step_config['delete_tables']:
        delete_result = db.get_table_values(table)
        assert delete_result.empty

    # Check all nullified columns are NULL
    for column in step_config['nullify_pvs']:
        column_name = column.replace('[', '').replace(']', '')
        result = db.select_data(column_name, idm.SURVEY_SUBSAMPLE_TABLE,
                                'RUN_ID',
                                "9e5c1872-3f8e-4ae5-85dc-c67a602d011e")
        assert result[column_name].isnull().sum() == len(result)

    # Check results match
    assert_frame_equal(expected_result,
                       test_result,
                       check_dtype=False,
                       check_like=True)
Example #18
0
def test_populate_step_data(table_name, data_table, insert_to_populate,
                            step_data, sas_step_data, result_data,
                            database_connection):
    # This test is parameterised. The values for the arguments of this test function
    # are taken from the parameters specified in pytest.mark.parametrize

    run_id = '9e5c1872-3f8e-4ae5-85dc-c67a602d011e'

    # Setup step configuration
    step_config = {
        "table_name": table_name,
        "data_table": data_table,
        "insert_to_populate": insert_to_populate,
    }

    # Clear existing test records from the shift_data table
    db.delete_from_table(step_config['table_name'], 'RUN_ID', '=', run_id)

    # Get test data from file
    test_data = pd.read_csv(TEST_DATA_DIR + "populate_step_data/" + step_data,
                            dtype=object)

    # Insert test data into table
    db.insert_dataframe_into_table(step_config["table_name"], test_data)

    # Run XML step which deletes old data from sas_survey_subsample and repopulates it with the new data
    idm.populate_step_data(run_id, database_connection, step_config)

    # Get test_result from (sas) external data table
    test_result = db.get_table_values(step_config['data_table'])

    # Write the test results to a csv
    test_result.to_csv(TEST_DATA_DIR + "populate_step_data/" + result_data,
                       index=False)

    # Import both the expected result and test result from the csv files
    expected_result = pd.read_csv(TEST_DATA_DIR + "populate_step_data/" +
                                  sas_step_data)
    test_result = pd.read_csv(TEST_DATA_DIR + "populate_step_data/" +
                              result_data)

    # Nullify the rec_id for comparison (this needs to be done because the expected result contains no rec_id)
    expected_result['REC_ID'] = ''
    test_result['REC_ID'] = ''

    # Sort records to match order
    if table_name == 'SHIFT_DATA':
        expected_result = expected_result.sort_values(
            by=['PORTROUTE', 'WEEKDAY'])
        test_result = test_result.sort_values(by=['PORTROUTE', 'WEEKDAY'])
    elif table_name == 'NON_RESPONSE_DATA':
        expected_result = expected_result.sort_values(by=[
            'PORTROUTE', 'WEEKDAY', 'ARRIVEDEPART', 'AM_PM_NIGHT',
            'SAMPINTERVAL', 'MIGTOTAL', 'ORDTOTAL'
        ])
        test_result = test_result.sort_values(by=[
            'PORTROUTE', 'WEEKDAY', 'ARRIVEDEPART', 'AM_PM_NIGHT',
            'SAMPINTERVAL', 'MIGTOTAL', 'ORDTOTAL'
        ])
    elif table_name == 'UNSAMPLED_OOH_DATA':
        expected_result = expected_result.sort_values(
            by=['PORTROUTE', 'REGION', 'ARRIVEDEPART', 'UNSAMP_TOTAL'])
        test_result = test_result.sort_values(
            by=['PORTROUTE', 'REGION', 'ARRIVEDEPART', 'UNSAMP_TOTAL'])
    elif table_name == 'TRAFFIC_DATA':
        expected_result = expected_result.sort_values(
            by=['PORTROUTE', 'ARRIVEDEPART', 'TRAFFICTOTAL', 'HAUL'])
        test_result = test_result.sort_values(
            by=['PORTROUTE', 'ARRIVEDEPART', 'TRAFFICTOTAL', 'HAUL'])

    # Reset the dataframe's indexes so correct rows are compared
    expected_result.index = range(0, len(expected_result))
    test_result.index = range(0, len(test_result))

    # Check results match
    assert_frame_equal(expected_result,
                       test_result,
                       check_dtype=False,
                       check_like=True)
Example #19
0
def non_response_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 26 April 2018 / 2 October 2018
    Purpose      : Runs the non response weight steps of the ips process
    Params       : run_id - the id for the current run.
    Returns      : NA
    """

    # Load configuration variables
    config = ServicesConfiguration().get_non_response()

    # Populate Survey Data For Non Response Wt
    idm.populate_survey_data_for_step(run_id, config)

    # Populate Non Response Data
    idm.populate_step_data(run_id, config)

    # Copy Non Response Wt PVs For Survey Data
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply Non Response Wt PVs On Survey Data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_NON_RESPONSE_SPV',
                              in_id='serial')

    # Update Survey Data with Non Response Wt PVs Output
    idm.update_survey_data_with_step_pv_output(config)

    # Copy Non Response Wt PVs for Non Response Data
    idm.copy_step_pvs_for_step_data(run_id, config)

    # Apply Non Response Wt PVs On Non Response Data
    process_variables.process(dataset='non_response',
                              in_table_name='SAS_NON_RESPONSE_DATA',
                              out_table_name='SAS_NON_RESPONSE_PV',
                              in_id='REC_ID')

    # Update NonResponse Data With PVs Output
    idm.update_step_data_with_step_pv_output(config)

    # Retrieve data from SQL
    survey_data = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    non_response_data = db.get_table_values(config["data_table"])

    # Calculate Non Response Weight
    survey_data_out, summary_data_out = \
        calculate_nonresponse_weight.do_ips_nrweight_calculation(survey_data,
                                                                 non_response_data,
                                                                 'NON_RESPONSE_WT',
                                                                 'SERIAL')

    # Insert data to SQL
    db.insert_dataframe_into_table(config["temp_table"], survey_data_out)
    db.insert_dataframe_into_table(config["sas_ps_table"], summary_data_out)

    # Update Survey Data With Non Response Wt Results
    idm.update_survey_data_with_step_results(config)

    # Store Survey Data With NonResponse Wt Results
    idm.store_survey_data_with_step_results(run_id, config)

    # Store Non Response Wt Summary
    idm.store_step_summary(run_id, config)
Example #20
0
def shift_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 26 April 2018 / 2 October 2018
    Purpose      : Runs the shift weight steps of the ips process
    Params       : run_id - the id for the current run.
    Returns      : NA
    """

    # Load configuration variables
    config = ServicesConfiguration().get_shift_weight()

    # Populate Survey Data For Shift Wt
    idm.populate_survey_data_for_step(run_id, config)

    # Populate Shift Data
    idm.populate_step_data(run_id, config)

    # Copy Shift Wt PVs For Survey Data
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply Shift Wt PVs On Survey Data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_SHIFT_SPV',
                              in_id='serial')

    # Update Survey Data with Shift Wt PV Output
    idm.update_survey_data_with_step_pv_output(config)

    # Copy Shift Wt PVs For Shift Data
    idm.copy_step_pvs_for_step_data(run_id, config)

    # Apply Shift Wt PVs On Shift Data
    process_variables.process(dataset='shift',
                              in_table_name='SAS_SHIFT_DATA',
                              out_table_name='SAS_SHIFT_PV',
                              in_id='REC_ID')

    # Update Shift Data with PVs Output
    idm.update_step_data_with_step_pv_output(config)

    # Retrieve data from SQL
    survey_data = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    shift_data = db.get_table_values(config["data_table"])

    # shift_data = sas_shift_schema.convert_dtype(shift_data)

    # Calculate Shift Weight
    survey_data_out, summary_data_out = \
        calculate_shift_weight.do_ips_shift_weight_calculation(survey_data,
                                                               shift_data,
                                                               serial_number='SERIAL',
                                                               shift_weight='SHIFT_WT')

    # Insert data to SQL
    db.insert_dataframe_into_table(config["temp_table"], survey_data_out)
    db.insert_dataframe_into_table(config["sas_ps_table"], summary_data_out)

    # Update Survey Data With Shift Wt Results
    idm.update_survey_data_with_step_results(config)

    # Store Survey Data With Shift Wt Results
    idm.store_survey_data_with_step_results(run_id, config)

    # Store Shift Wt Summary
    idm.store_step_summary(run_id, config)
Example #21
0
def r_population_input(survey_input: pd.DataFrame,
                       ustotals: pd.DataFrame) -> pd.DataFrame:
    """
    Author       : David Powell
    Date         : 07/06/2018
    Purpose      : Creates population data that feeds into the R GES weighting
    Parameters   : survey_input - a data frame containing the survey data for
                   the processing month
                   ustotals - a data frame containing population information for
                   the processing year
    Returns      : A data frame containing the information needed for GES weighting
    Requirements : NA
    Dependencies : NA
    """

    df_survey_input = survey_input
    df_us_totals = ustotals

    sort1 = ['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART']

    df_survey_input_lookup = df_survey_input.sort_values(sort1)

    # Cleanse data
    df_survey_input_lookup.UNSAMP_REGION_GRP_PV.fillna(value=0, inplace=True)

    df_survey_input_lookup = df_survey_input_lookup[
        ~df_survey_input_lookup['UNSAMP_PORT_GRP_PV'].isnull()]
    df_survey_input_lookup = df_survey_input_lookup[
        ~df_survey_input_lookup['ARRIVEDEPART'].isnull()]

    # Create lookup. Group by and aggregate. Allocates T_1 - T_n.
    lookup_dataframe = df_survey_input_lookup
    lookup_dataframe["count"] = ""
    lookup_dataframe = lookup_dataframe.groupby(
        ['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART']).agg({
            "count":
            'count'
        }).reset_index()

    # Cleanse data
    lookup_dataframe = lookup_dataframe.replace('NOTHING', np.NaN)
    lookup_dataframe = lookup_dataframe.drop(["count"], axis=1)
    lookup_dataframe["T1"] = range(len(lookup_dataframe))
    lookup_dataframe["T1"] = lookup_dataframe["T1"] + 1

    # Create unsampled design weight used within GES weighting
    df_survey_input['SHIFT_WT'] = df_survey_input.SHIFT_WT.astype(float)
    df_survey_input = df_survey_input.round({'SHIFT_WT': 3})
    values = df_survey_input.SHIFT_WT * df_survey_input.NON_RESPONSE_WT * df_survey_input.MINS_WT * df_survey_input.TRAFFIC_WT
    df_survey_input['OOHDesignWeight'] = values
    df_survey_input = df_survey_input.sort_values(sort1)

    df_survey_input = df_survey_input[df_survey_input.OOHDesignWeight > 0]
    df_survey_input = df_survey_input.fillna('NOTHING')

    df_prev_totals = df_survey_input.groupby(
        ['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART']).agg({
            "OOHDesignWeight":
            'sum'
        }).reset_index()

    df_prev_totals.rename(columns={'OOHDesignWeight': 'prevtotals'},
                          inplace=True)
    df_prev_totals = df_prev_totals.replace('NOTHING', np.NaN)
    df_prev_totals = df_prev_totals.sort_values(sort1)

    sort1 = ['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART']
    df_us_totals = df_us_totals.sort_values(sort1)
    df_us_totals = df_us_totals.fillna('NOTHING')

    df_pop_totals = df_us_totals.groupby(
        ['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART']).agg({
            "UNSAMP_TOTAL":
            'sum'
        }).reset_index()

    df_pop_totals.rename(columns={'UNSAMP_TOTAL': 'uplift'}, inplace=True)
    df_pop_totals = df_pop_totals.replace('NOTHING', np.NaN)
    df_pop_totals = df_pop_totals.sort_values(sort1)

    df_pop_totals = df_pop_totals.fillna('NOTHING')
    df_prev_totals = df_prev_totals.fillna('NOTHING')

    # Merge populations totals to create one dataframe lookup
    df_lifted_totals = pd.merge(
        df_prev_totals,
        df_pop_totals,
        on=['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART'],
        how='left')

    df_lifted_totals = df_lifted_totals.replace('NOTHING', np.NaN)
    df_lifted_totals['uplift'] = df_lifted_totals['uplift'].fillna(0)
    df_lifted_totals = df_lifted_totals.fillna(0)

    values = df_lifted_totals.prevtotals + df_lifted_totals.uplift
    df_lifted_totals['UNSAMP_TOTAL'] = values

    df_mod_totals = pd.merge(
        df_lifted_totals,
        lookup_dataframe,
        on=['UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV', 'ARRIVEDEPART'],
        how='left')

    df_mod_totals['C_group'] = 1
    df_mod_totals = df_mod_totals.drop(
        ['ARRIVEDEPART', 'UNSAMP_PORT_GRP_PV', 'UNSAMP_REGION_GRP_PV'], axis=1)

    # Pivot totals to a single-row vector (one T_n column per group)
    df_mod_totals = df_mod_totals.pivot_table(index='C_group',
                                              columns='T1',
                                              values='UNSAMP_TOTAL')

    df_mod_totals = df_mod_totals.add_prefix('T_')

    db.insert_dataframe_into_table('poprowvec_unsamp',
                                   df_mod_totals,
                                   if_exists='replace')

    return df_mod_totals
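The closing pivot_table reshapes one population total per T1 cell into a single-row vector whose columns are T_1..T_n, the layout the R GES script consumes. A toy illustration of that reshape, with invented values:

import pandas as pd

totals = pd.DataFrame({'C_group': [1, 1, 1],
                       'T1': [1, 2, 3],
                       'UNSAMP_TOTAL': [120.0, 80.0, 45.0]})
rowvec = totals.pivot_table(index='C_group', columns='T1',
                            values='UNSAMP_TOTAL').add_prefix('T_')
# rowvec is a single row with columns T_1, T_2, T_3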
Example #22
0
    def insert(d: pd.DataFrame):
        db.insert_dataframe_into_table(table, d, if_exists)