Example #1
def import_non_response(file_name, file_type, run_id):

    data_schema = non_response_schema.get_schema()
    # Convert CSV to dataframe and stage
    dataframe = pd.read_csv(file_name, engine="python", dtype=data_schema)

    dataframe.columns = dataframe.columns.str.upper()
    dataframe.columns = dataframe.columns.str.replace(' ', '')
    dataframe["RUN_ID"] = run_id
    dataframe.rename(columns={"DATASOURCE": "DATA_SOURCE_ID"}, inplace=True)

    datasource_id = file_type.value

    dataframe['DATA_SOURCE_ID'].replace(['Non Response'],
                                        datasource_id,
                                        inplace=True)

    sql = f"""
            DELETE FROM NON_RESPONSE_DATA
            WHERE RUN_ID = '{run_id}'
            """

    try:
        db.execute_sql_statement(sql)
        db.insert_dataframe_into_table('NON_RESPONSE_DATA', dataframe)
    except Exception as err:
        log.error(f"Cannot insert non_response dataframe into table: {err}")
        return None
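The staging pattern above (upper-case the headers, strip spaces, rename DATASOURCE, stamp the run id) is shared by the importers in this module. A minimal, self-contained sketch of just that normalisation step, using invented column names:

import pandas as pd

# Illustrative input only; the real CSV columns come from the non-response schema.
df = pd.DataFrame({"Data Source": ["Non Response"], "Port Route": [101]})

df.columns = df.columns.str.upper()           # 'DATA SOURCE', 'PORT ROUTE'
df.columns = df.columns.str.replace(' ', '')  # 'DATASOURCE', 'PORTROUTE'
df.rename(columns={"DATASOURCE": "DATA_SOURCE_ID"}, inplace=True)
df["RUN_ID"] = "9e5c1872"                     # placeholder run id

print(list(df.columns))  # ['DATA_SOURCE_ID', 'PORTROUTE', 'RUN_ID']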
def insert_dataframe_into_table(table_name: str,
                                dataframe: pandas.DataFrame,
                                if_exists='append') -> None:
    """
    Author       : Thomas Mahoney
    Date         : 02 Jan 2018
    Purpose      : Inserts a full dataframe into a SQL table
    Params       : table_name - the name of the target table in the sql database.
                   dataframe - the dataframe to be added to the selected table.
                   if_exists - pandas to_sql behaviour if the table already exists (default 'append').
    Returns      : None.
    Requirements : NA
    Dependencies : NA
    """

    dataframe = dataframe.where((pandas.notnull(dataframe)), None)
    dataframe.columns = dataframe.columns.astype(str)

    try:
        dataframe.to_sql(table_name,
                         con=connection_string,
                         if_exists=if_exists,
                         chunksize=5000,
                         index=False)
    except Exception as err:
        log.error(f"insert_dataframe_into_table failed: {err}")
        raise err
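A minimal usage sketch for the helper above, assuming the module-level connection_string points at a reachable database and the target table already exists (the dataframe contents are invented):

import pandas

staged = pandas.DataFrame({"RUN_ID": ["9e5c1872"], "NON_RESPONSE_COUNT": [3]})

# Appends by default; pass if_exists='replace' to rebuild the table instead.
insert_dataframe_into_table('NON_RESPONSE_DATA', staged)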
    def on_put(self, req: Request, resp: Response, run_id: str) -> None:
        # Start a run

        if self.workflow.in_progress():
            error = f"Can only run one instance of a workflow at a time, {run_id} rejected."
            log.error(error)
            raise falcon.HTTPError(falcon.HTTP_403, 'Concurrency Error', error)

        log.info("Starting calculations for RUN_ID: " + run_id)

        try:
            if not db.is_valid_run_id(run_id):
                result = {'status': "invalid job id: " + run_id}
                resp.status = falcon.HTTP_401
                resp.body = json.dumps(result)
                return

            thr = threading.Thread(target=self.workflow.run_calculations, args=(run_id,))

            thr.start()

            log.info(f"started job: {run_id}")

            result = {'status': "started job: " + run_id}
            resp.body = json.dumps(result)

        except ValueError:
            raise falcon.HTTPError(falcon.HTTP_400, 'Invalid JSON',
                                   'Could not decode the request body. The JSON was invalid.')
def get_sql_connection():
    """
    Author       : Thomas Mahoney / Nassir Mohammad (edits)
    Date         : 11 / 07 / 2018
    Purpose      : Establishes a connection to the SQL Server database and returns the connection object.
    Parameters   : None - the connection string is taken from module configuration.
    Returns      : a SQLAlchemy engine object (cached in the module-level 'eng').
    Requirements : NA
    Dependencies : NA
    """

    global eng

    if eng is not None:
        return eng

    try:
        eng = sqlalchemy.create_engine(connection_string)
        return eng
    except Exception as err:
        log.error(f"get_sql_connection failed: {err}")
        raise err
def import_unsampled(file_name, file_type, run_id):

    data_schema = unsampled_schema.get_schema()
    # Convert CSV to dataframe and stage
    dataframe = pd.read_csv(file_name, engine="python", dtype=data_schema)

    dataframe.columns = dataframe.columns.str.upper()
    dataframe.columns = dataframe.columns.str.replace(' ', '')
    dataframe["RUN_ID"] = run_id
    dataframe.rename(columns={"DATASOURCE": "DATA_SOURCE_ID"}, inplace=True)

    # replace "REGION" values with 0 if not an expected value
    dataframe['REGION'].replace(['None', "", ".", 'nan'], 0, inplace=True)

    datasource_id = file_type.value

    dataframe['DATA_SOURCE_ID'].replace(['Unsampled'],
                                        datasource_id,
                                        inplace=True)

    sql = f"DELETE FROM UNSAMPLED_OOH_DATA WHERE RUN_ID = '{run_id}'"

    try:
        db.execute_sql_statement(sql)
        db.insert_dataframe_into_table('UNSAMPLED_OOH_DATA', dataframe)
    except Exception as err:
        log.error(
            f"Cannot insert unsampled_data dataframe into database: {err}")
        return None
def execute_sql_statement(sql: str) -> None:
    try:
        conn = get_sql_connection()
        conn.execute(sql)
    except Exception as err:
        log.error(f"execute_sql_statement failed: {err}")
        raise err
Example #7
    def wrapper(*args, **kwargs):
        try:
            log.debug(f"Calling service: {func.__name__}")
            return func(*args, **kwargs)

        except Exception as err:
            error = f"Error calling service {func.__name__}. Error: {err}"
            log.error(error)
            raise falcon.HTTPError(falcon.HTTP_400, 'service error', error)
Example #8
    def on_post(self, req: Request, resp: Response) -> None:
        data = self.load_json_from_request(req)

        if 'RUN_ID' not in data:
            error = "No JSON payload or RUN_ID not stipulated in payload."
            log.error(error)
            raise falcon.HTTPError(falcon.HTTP_400, 'Invalid request', error)
        create_run(data)
        resp.status = falcon.HTTP_201
Example #9
    def on_post(self, req: Request, resp: Response, run_id: str) -> None:
        data = self.load_json_from_request(req)

        if 'RUN_ID' not in data:
            error = "No JSON payload or RUN_ID not stipulated in payload."
            log.error(error)
            raise falcon.HTTPError(falcon.HTTP_400, 'Invalid request', error)

        create_process_variables(data, run_id)
def update_survey_data_with_step_results(step_configuration):
    """
    Author       : Elinor Thorne
    Date         : May 2018
    Purpose      : Updates survey data with the results
    Parameters   : step_configuration - dict describing the step, including its name,
                   temp table and results columns
    Returns      : NA
    """

    valid_steps = [
        "SHIFT_WEIGHT", "NON_RESPONSE", "MINIMUMS_WEIGHT", "TRAFFIC_WEIGHT",
        "UNSAMPLED_WEIGHT", "FINAL_WEIGHT", "IMBALANCE_WEIGHT",
        "FARES_IMPUTATION", "REGIONAL_WEIGHTS", "TOWN_AND_STAY_EXPENDITURE",
        "RAIL_IMPUTATION", "STAY_IMPUTATION", "SPEND_IMPUTATION", "AIR_MILES"
    ]

    step = step_configuration["name"]

    if step not in valid_steps:
        log.error(
            "Invalid step in update_survey_data_with_step_results: likely a configuration error"
        )
        raise NameError("Invalid step")

    table = step_configuration["temp_table"]
    results_columns = step_configuration["results_columns"]

    dispatcher = {
        "SHIFT_WEIGHT": update_green,
        "NON_RESPONSE": update_green,
        "MINIMUMS_WEIGHT": update_green,
        "TRAFFIC_WEIGHT": update_green,
        "UNSAMPLED_WEIGHT": update_green,
        "FINAL_WEIGHT": update_green,
        "FARES_IMPUTATION": update_green,
        "REGIONAL_WEIGHTS": update_green,
        "TOWN_AND_STAY_EXPENDITURE": update_green,
        "AIR_MILES": update_green,
        "IMBALANCE_WEIGHT": update_imbalance_weights,
        "STAY_IMPUTATION": update_stay_imputation,
        "SPEND_IMPUTATION": update_spend_imputation
    }

    update_step_results = dispatcher.get(step)

    if update_step_results is not None:
        update_step_results(table, results_columns)
    else:
        update_others(table)

    db.delete_from_table(table)
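A sketch of the shape of step_configuration this dispatcher expects, based only on the keys read above ("name", "temp_table", "results_columns"); the table and column values are illustrative:

shift_weight_step = {
    "name": "SHIFT_WEIGHT",            # must be one of valid_steps
    "temp_table": "SAS_SHIFT_WT",      # illustrative temp table name
    "results_columns": ["SHIFT_WT"],   # illustrative results columns
}

update_survey_data_with_step_results(shift_weight_step)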
    def load_json_from_request(req: Request) -> dict:

        try:
            data = json.load(req.bounded_stream)
            if data is None:
                error = f"No data. The request was empty"
                log.error(error)
                raise falcon.HTTPError(falcon.HTTP_400, 'Invalid JSON', error)
            return data

        except ValueError:
            error = f"Could not decode the request body. The JSON was invalid."
            log.error(error)
            raise falcon.HTTPError(falcon.HTTP_400, 'Invalid JSON', error)
Example #12
def get_run():

    data = get_runs()

    if data.empty:
        error = "No run data."
        log.error(error)
        raise falcon.HTTPError(falcon.HTTP_400, 'Data Error', error)

    try:
        return data.to_json(orient='records')
    except ValueError:
        error = "Could not convert the run data to JSON."
        log.error(error)
        raise falcon.HTTPError(falcon.HTTP_400, 'Invalid JSON', error)
def get_table_values(table_name: str) -> pandas.DataFrame:
    """
    Author       : Thomas Mahoney
    Date         : 02 Jan 2018
    Purpose      : Extracts a full table into a pandas dataframe
    Params       : table_name - the name of the target table in the sql database.
    Returns      : Dataframe containing the extracted table data.
    Requirements : NA
    Dependencies : NA
    """

    try:
        return pandas.read_sql_table(table_name=table_name,
                                     con=connection_string)
    except Exception as err:
        log.error(f"get_table_values failed: {err}")
        raise err
Example #14
def login(user_name: str, password: str) -> None:

    data = get_users()

    user_credentials = data.loc[data['username'] == user_name]

    if user_credentials.empty:
        error = f"User, {user_name}, not found."
        log.error(error)
        raise falcon.HTTPError(falcon.HTTP_404, 'login error', error)

    password = b64decode(password.encode('ascii')).decode('ascii')

    if not check_password_hash(user_credentials['password'].values[0],
                               password):
        error = f"Invalid password."
        log.error(error)
        raise falcon.HTTPError(falcon.HTTP_401, 'login error', error)
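The caller is expected to send the password base64-encoded, since login() decodes it before checking the hash. A small sketch of the encoding step, with placeholder credentials:

from base64 import b64encode

encoded_password = b64encode("placeholder-password".encode('ascii')).decode('ascii')
login("placeholder_user", encoded_password)  # raises falcon.HTTPError on unknown user or bad password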
def delete_from_table(table_name: str,
                      condition1: str = None,
                      operator: str = None,
                      condition2: str = None,
                      condition3: str = None) -> None:
    """
    Author         : Elinor Thorne
    Date           : 7 Dec 2017
    Purpose        : Generic SQL query to delete contents of table
    Parameters     : table_name - name of table
                     condition1 - first condition / value
                     operator - comparison operator i.e
                     '=' Equal
                     '!=' Not Equal
                     '>' Greater than
                     '>=' Greater than or equal, etc
                     https://www.techonthenet.com/oracle/comparison_operators.php
                     condition2 - second condition / value
                     condition3 - third condition / value used for BETWEEN
                     ranges, i.e: "DELETE FROM table_name WHERE condition1
                     BETWEEN condition2 AND condition3"
    Returns         : None
    Requirements    : None
    Dependencies    : check_table(),
                      get_sql_connection,
    """

    if condition1 is None:
        query = ("DELETE FROM " + table_name)
    elif condition3 is None:
        query = ("DELETE FROM " + table_name + " WHERE " + condition1 + " " +
                 operator + " '" + condition2 + "'")
    else:
        query = ("DELETE FROM " + table_name + " WHERE " + condition1 + " " +
                 operator + " '" + condition2 + "'" + " AND " + condition3)

    try:
        conn = get_sql_connection()
        conn.execute(query)
    except Exception as err:
        traceback.print_exc()
        log.error(f"delete_from_table failed: {err}")
        raise err
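Usage sketches for the three call shapes the docstring describes; the table and column names below are illustrative, and the SQL each call builds is shown in the comment:

# DELETE FROM NON_RESPONSE_DATA
delete_from_table('NON_RESPONSE_DATA')

# DELETE FROM NON_RESPONSE_DATA WHERE RUN_ID = '9e5c1872'
delete_from_table('NON_RESPONSE_DATA', 'RUN_ID', '=', '9e5c1872')

# DELETE FROM TRAFFIC_DATA WHERE YEAR BETWEEN '2017' AND '2018'
# (condition3 is appended verbatim, so it carries its own quotes here)
delete_from_table('TRAFFIC_DATA', 'YEAR', 'BETWEEN', '2017', "'2018'")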
def drop_table(table_name: str) -> None:
    """
    Author        : Elinor Thorne
    Date          : 7 Dec 2017
    Purpose       : Generic SQL query to drop table
    Parameters    : table_name - name of table to drop
    Returns       : None
    Requirements  : None
    Dependencies  : check_table()
                  : get_sql_connection()
                  : database_logger()
    """

    try:
        conn = get_sql_connection()
        conn.execute("DROP TABLE IF EXISTS " + table_name)
    except Exception as err:
        log.error(f"drop_table failed: {err}")
        raise err
def clear_memory_table(table_name: str) -> None:
    """
    Author        : Elinor Thorne
    Date          : 7 Dec 2017
    Purpose       : Generic SQL query to convert a table to the MEMORY storage engine
    Parameters    : table_name - name of table to convert
    Returns       : None
    Requirements  : None
    Dependencies  : check_table()
                  : get_sql_connection()
                  : database_logger()
    """

    try:
        conn = get_sql_connection()
        conn.execute(f"ALTER TABLE {table_name} ENGINE=MEMORY")
    except Exception as err:
        log.error(f"Clear memory_table failed: {err}")
        raise err
def select_data(column_name: str, table_name: str, condition1: str,
                condition2: str) -> Optional[pandas.DataFrame]:
    """
    Author        : Elinor Thorne
    Date          : 21 Dec 2017
    Purpose       : Uses SQL query to retrieve values from database
    Parameters    : column_name, table_name, condition1, condition2, i.e:
                  : "SELECT column_name FROM table_name WHERE condition1 = condition2" (no 'AND'/'OR' clause)
    Returns       : Data Frame containing the query results
    Requirements  : None
    """

    query = f"""
        SELECT {column_name} 
        FROM {table_name}
        WHERE {condition1} = '{condition2}'
        """

    try:
        return pandas.read_sql_query(query, con=connection_string)
    except Exception as err:
        log.error(f"select_data failed: {err}")
        raise err
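A usage sketch; given the query template above, this reads a single column filtered on an equality condition (the run id value is a placeholder):

# SELECT PROCESS_VARIABLE_ID FROM PROCESS_VARIABLES WHERE RUN_ID = '9e5c1872'
pv_ids = select_data('PROCESS_VARIABLE_ID', 'PROCESS_VARIABLES', 'RUN_ID', '9e5c1872')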
def get_process_variables(run_id=None):
    data = get_pv()

    # Decode HTML-escaped comparison operators in the PV definitions
    data['PV_DEF'] = data['PV_DEF'].str.replace('&lt;',
                                                '<').str.replace('&gt;', '>')

    if data.empty:
        error = f"PROCESS_VARIABLES table is empty."
        log.error(error)
        raise falcon.HTTPError(falcon.HTTP_400, 'Data Error', error)

    if run_id:
        data = data.loc[data['RUN_ID'] == run_id]

        if data.empty:
            error = f"Run id, {run_id}, is not in the PROCESS_VARIABLES table."
            log.error(error)
            raise falcon.HTTPError(falcon.HTTP_400, 'Data Error', error)

    data.sort_values('PROCESS_VARIABLE_ID', inplace=True)
    data.index = range(0, len(data))
    output = data.to_json(orient='records')

    return output
Example #20
def modify_values(row, pvs, dataset):
    """
    Author       : Thomas Mahoney
    Date         : 27 / 03 / 2018
    Purpose      : Applies the PV rules to the specified dataframe on a row by row basis.
    Parameters   : row - the row of a dataframe passed to the function through the 'apply' statement.
                   pvs - a collection of pv names and statements to be applied to the dataframe's rows.
                   dataset - an identifier used in the executed pv statements.
    Returns      : a modified row to be reinserted into the dataframe.
    Requirements : this function must be called through a pandas apply statement.
    Dependencies : NA
    """

    for pv in pvs:
        code = pv[1]
        try:
            exec(code)
        except ValueError:
            log.error(f"ValueError on PV: {pv[0]}, code: {code}")
            raise ValueError

        except KeyError:
            log.error(f"KeyError on PV: {pv[0]}, code: {code}")
            raise KeyError

        except TypeError:
            log.error(f"TypeError on PV: {pv[0]}, code: {code}")
            raise TypeError

        except SyntaxError:
            log.error(f"SyntaxError on PV: {pv[0]}, code: {code}")
            raise SyntaxError

    if dataset in ('survey', 'shift'):
        row['SHIFT_PORT_GRP_PV'] = str(row['SHIFT_PORT_GRP_PV'])[:10]

    return row
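Per the docstring, modify_values only works through a row-wise pandas apply. A sketch of how it would be driven, assuming pvs is a list of (pv_name, pv_code) pairs whose code assigns into the local variable row (the PV code and dataframe below are invented):

import pandas as pd

pvs = [("shift_port_grp_pv", "row['SHIFT_PORT_GRP_PV'] = row['PORTROUTE']")]
df = pd.DataFrame({"PORTROUTE": [101, 102]})

# axis=1 passes each row as a Series; args supplies the PVs and dataset identifier.
df = df.apply(modify_values, axis=1, args=(pvs, 'survey'))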
def do_ips_shift_weight_calculation(df_surveydata, df_shiftsdata,
                                    serial_number, shift_weight):
    """
    Author       :  Richmond Rice / Nassir Mohammad
    Date         :  May 2018
    Purpose      :  Generates shift weights (design weights/initial weights) for each type
                    of IPS traffic.  Runs the shift factor and crossings factor functions.
                    Uses the data frames they return to calculate the surveydata and summary data sets.
    Parameters   :  df_surveydata = the IPS survey records for the period.
                    df_shiftsdata = SAS data set holding the number of possible shifts / total crossings by stratum
                    serial_number = variable holding the name of the record serial number field
                    shift_weight = variable holding the name of the shift weight field
    Returns      :  Data frames: (final_output_data, final_summary_data)
    Requirements :  logging
    Dependencies :  Function - calculate_ips_shift_factor()
                    Function - calculate_ips_crossing_factor()
    """

    # Calculate the Shift Factor for the given data sets
    df_totsampshifts, df_possshifts, df_surveydata_sf = calculate_ips_shift_factor(
        df_shiftsdata, df_surveydata)
    # Calculate the Crossings Factor for the given data sets
    df_totsampcrossings, df_surveydata_merge = calculate_ips_crossing_factor(
        df_shiftsdata, df_surveydata_sf)

    # The various column sets used for setting columns, sorting columns,
    # aggregating by, merging data frames.
    colset1 = SHIFTS_STRATA + [MIG_SI_COLUMN]

    colset2 = SHIFTS_STRATA

    colset3 = SHIFTS_SUB_STRATA

    colset4 = SHIFTS_STRATA + [
        MIG_SI_COLUMN, POSSIBLE_COUNT_COLUMN, SAMPLED_COUNT_COLUMN,
        MIN_WEIGHT_COLUMN, AVERAGE_WEIGHT_COLUMN, MAX_WEIGHT_COLUMN,
        COUNT_COLUMN, WEIGHT_SUM_COLUMN
    ]

    colset5 = [serial_number, shift_weight]

    # Make all column headers upper case
    df_surveydata_merge.columns = df_surveydata_merge.columns.str.upper()
    df_possshifts.columns = df_possshifts.columns.str.upper()
    df_totsampcrossings.columns = df_totsampcrossings.columns.str.upper()
    df_totsampshifts.columns = df_totsampshifts.columns.str.upper()

    # --------------------------------------------------------------------
    # Check for any missing shift factors by extracting incorrect values
    # --------------------------------------------------------------------
    df_shift_flag = df_surveydata_merge[df_surveydata_merge[FLAG_COLUMN] == 1]
    df_shift_flag = df_shift_flag[df_shift_flag[FACTOR_COLUMN].isnull()]

    # Collect data outside of specified threshold
    threshold_string = ""
    for index, record in df_shift_flag.iterrows():
        threshold_string += "___||___" \
                            + df_shift_flag.columns[0] + " : " + str(record[0])

    if len(df_shift_flag) > 0:
        log.error('Case(s) contain no shift factor(s):' + threshold_string)
    else:
        df_surveydata_merge.loc[df_surveydata_merge[FACTOR_COLUMN].isnull() &
                                (df_surveydata_merge[FLAG_COLUMN] != 1),
                                FACTOR_COLUMN] = 1
        log.debug('Contains shift factor(s)')

    # --------------------------------------------------------------------
    # Check for missing crossings factor by extracting incorrect values
    # --------------------------------------------------------------------
    df_crossings_flag = df_surveydata_merge[
        df_surveydata_merge[CROSSING_FLAG_COLUMN] == 1]
    df_crossings_flag = df_crossings_flag[
        df_crossings_flag[CROSSING_FACTOR_COLUMN].isnull()]

    # Collect data outside of specified threshold

    if len(df_crossings_flag) > 0:
        threshold_string = ""
        for index, record in df_crossings_flag.iterrows():
            threshold_string += "___||___" \
                                + df_crossings_flag.columns[0] + " : " + str(record[0])
        log.error('Case(s) contain no crossings factor(s):' + threshold_string)
    else:
        df_surveydata_merge.loc[
            df_surveydata_merge[CROSSING_FACTOR_COLUMN].isnull() &
            (df_surveydata_merge[CROSSING_FLAG_COLUMN] != 1),
            CROSSING_FACTOR_COLUMN] = 1
        log.debug('Contains crossings factor(s)')

    # --------------------------------------------------------------------
    # Check for invalid shift data by extracting incorrect values
    # --------------------------------------------------------------------
    df_invalid_shifts = df_surveydata_merge[
        df_surveydata_merge[FACTOR_COLUMN] < 0]

    df_possible_shifts = pd.merge(df_shift_flag,
                                  df_invalid_shifts,
                                  on=['SERIAL'],
                                  how='left')

    # Collect data outside of specified threshold

    if len(df_possible_shifts) > 0:
        threshold_string = ""
        for index, record in df_possible_shifts.iterrows():
            threshold_string += "___||___" \
                                + df_possible_shifts.columns[0] + " : " + str(record[0])
        log.error('Case(s) has an invalid number of possible shifts' +
                  threshold_string)

    # Check for invalid crossings data by extracting incorrect values.
    df_invalid_crossings = df_surveydata_merge[
        df_surveydata_merge[CROSSING_FACTOR_COLUMN] < 0]

    df_possible_crossings = pd.merge(df_crossings_flag,
                                     df_invalid_crossings,
                                     on=['SERIAL'],
                                     how='left')

    # Collect data outside of specified threshold

    if len(df_possible_crossings) > 0:
        threshold_string = ""
        for index, record in df_possible_crossings.iterrows():
            threshold_string += "___||___" \
                                + df_possible_crossings.columns[0] + " : " + str(record[0])
        log.error('Case(s) has an invalid number of total crossings' +
                  threshold_string)

    # Check for missing migration sampling intervals by extracting incorrect values.
    df_missing_migsi = df_surveydata_merge[
        df_surveydata_merge['MIGSI'].isnull()]

    # Collect data outside of specified threshold

    if len(df_missing_migsi) > 0:
        threshold_string = ""
        for index, record in df_missing_migsi.iterrows():
            threshold_string += "___||___" \
                                + df_missing_migsi.columns[0] + " : " + str(record[0])
        log.error('Case(s) missing migration sampling interval' +
                  threshold_string)

    # --------------------------------------------------------------------
    # Calculate shift weight: PS - add round to match expected in test?
    # --------------------------------------------------------------------

    df_surveydata_merge[shift_weight] = df_surveydata_merge[
        FACTOR_COLUMN] * df_surveydata_merge[
            CROSSING_FACTOR_COLUMN] * df_surveydata_merge[MIG_SI_COLUMN]

    # df_surveydata_merge[shift_weight] = round(
    #     df_surveydata_merge[FACTOR_COLUMN] * df_surveydata_merge[CROSSING_FACTOR_COLUMN] * df_surveydata_merge[
    #         MIG_SI_COLUMN], 3)

    # --------------------------------------------------------------------
    # produce shift weight summary output
    # --------------------------------------------------------------------

    # Sort surveydata
    df_surveydata_merge_sorted = df_surveydata_merge.sort_values(colset1)

    # Group by the necessary columns and aggregate df_surveydata_merge shift weight
    df_surveydata_merge_sorted_grouped = \
        df_surveydata_merge_sorted.groupby(SHIFTS_STRATA + [MIG_SI_COLUMN])[shift_weight].agg({
            COUNT_COLUMN: 'count',
            WEIGHT_SUM_COLUMN: 'sum',
            MIN_WEIGHT_COLUMN: 'min',
            AVERAGE_WEIGHT_COLUMN: 'mean',
            MAX_WEIGHT_COLUMN: 'max'
        })

    # Flatten summary columns to single row after aggregation
    df_surveydata_merge_sorted_grouped = df_surveydata_merge_sorted_grouped.reset_index(
    )

    # PS: round column
    df_surveydata_merge_sorted_grouped[WEIGHT_SUM_COLUMN] = \
        df_surveydata_merge_sorted_grouped[WEIGHT_SUM_COLUMN].round(3)
    df_surveydata_merge_sorted_grouped[MIN_WEIGHT_COLUMN] = \
        df_surveydata_merge_sorted_grouped[MIN_WEIGHT_COLUMN].round(3)
    df_surveydata_merge_sorted_grouped[AVERAGE_WEIGHT_COLUMN] = \
        df_surveydata_merge_sorted_grouped[AVERAGE_WEIGHT_COLUMN].round(3)
    df_surveydata_merge_sorted_grouped[MAX_WEIGHT_COLUMN] = \
        df_surveydata_merge_sorted_grouped[MAX_WEIGHT_COLUMN].round(3)

    # --------------------------------------------------------------------
    # Merge possible shifts to summary
    # --------------------------------------------------------------------

    # Merge possible shifts to summary
    df_summary = pd.merge(df_surveydata_merge_sorted_grouped,
                          df_possshifts,
                          on=colset2,
                          how='outer')
    df_summary = df_summary.rename(
        columns={'NUMERATOR': POSSIBLE_COUNT_COLUMN})

    # Merge totsampcrossings to summary
    df_summary = pd.merge(df_summary,
                          df_totsampcrossings,
                          on=colset2,
                          how='outer')
    df_summary = df_summary.rename(
        columns={'DENOMINATOR': SAMPLED_COUNT_COLUMN})

    # Merge totsampshifts to summary
    df_summary = pd.merge(df_summary,
                          df_totsampshifts,
                          on=colset2,
                          how='outer')
    df_summary = df_summary.rename(columns={'DENOMINATOR': 'TEMP'})

    # Merge total sample crossings and total sample shifts to single column via addition
    df_summary[SAMPLED_COUNT_COLUMN] = df_summary[SAMPLED_COUNT_COLUMN].fillna(
        0) + df_summary.TEMP.fillna(0)

    df_summary = df_summary.drop(['TEMP'], axis=1)

    # Sort summaries
    df_summary_2 = df_summary.sort_values(colset2)

    # Re-index the data frames
    df_summary_2.index = range(df_summary_2.shape[0])

    # --------------------------------------------------------------------
    # Produce summary high
    # --------------------------------------------------------------------

    # Sort survey data
    df_surveydata_merge_3 = df_surveydata_merge.sort_values(colset3)

    # Group by the necessary columns and aggregate df_surveydata_merge shift weight
    df_summary_high = df_surveydata_merge_3.groupby(colset3)[shift_weight].agg(
        {
            COUNT_COLUMN: 'count',
            WEIGHT_SUM_COLUMN: 'sum',
            MIN_WEIGHT_COLUMN: 'min',
            AVERAGE_WEIGHT_COLUMN: 'mean',
            MAX_WEIGHT_COLUMN: 'max'
        })

    # Flatten summary high columns to single row after aggregation
    df_summary_high = df_summary_high.reset_index()

    # PS: round column
    df_summary_high[COUNT_COLUMN] = df_summary_high[COUNT_COLUMN].round(3)
    df_summary_high[AVERAGE_WEIGHT_COLUMN] = df_summary_high[
        AVERAGE_WEIGHT_COLUMN].round(3)
    df_summary_high[MIN_WEIGHT_COLUMN] = df_summary_high[
        MIN_WEIGHT_COLUMN].round(3)
    df_summary_high[MAX_WEIGHT_COLUMN] = df_summary_high[
        MAX_WEIGHT_COLUMN].round(3)

    # Append total sample crossings and total sample shifts
    df_totsampshifts_appended = df_totsampshifts.append(df_totsampcrossings)

    # Re-index the data frame
    df_totsampshifts_appended.index = range(df_totsampshifts_appended.shape[0])

    # Sort total sample shifts
    df_totsampshifts_1 = df_totsampshifts_appended.sort_values(colset3)

    # Group by the necessary columns and aggregate df_totsampshifts shift weight
    df_summary_high_sampled = df_totsampshifts_1.groupby(
        colset3)['DENOMINATOR'].agg([(SAMPLED_COUNT_COLUMN, 'sum')])

    # Flatten summary high sampled columns to single row after aggregation
    df_summary_high_sampled = df_summary_high_sampled.reset_index()

    # Left merge summary high with summary high sampled
    df_summary_high_1 = pd.merge(df_summary_high,
                                 df_summary_high_sampled,
                                 on=SHIFTS_SUB_STRATA,
                                 how='left')

    # Append summary and summary high
    df_summary_3 = pd.concat([df_summary_high_1, df_summary_2])

    # Set summary columns
    df_summary_4 = df_summary_3[colset4]
    df_summary_5 = df_summary_4.sort_values([SUMMARY_KEY_COLUMN],
                                            ascending=True,
                                            kind='mergesort')
    df_summary_5.index = range(df_summary_5.shape[0])

    # replace 0 with nan to match SAS
    df_summary_5[SAMPLED_COUNT_COLUMN].replace(0, np.nan, inplace=True)

    # Set surveydata columns
    df_surveydata_merge_output = df_surveydata_merge_3[colset5]
    df_surveydata_merge_output_2 = df_surveydata_merge_output.sort_values(
        ['SERIAL'])

    # re-index the dataframe
    df_surveydata_merge_output_2.index = range(
        df_surveydata_merge_output_2.shape[0])

    final_output_data = df_surveydata_merge_output_2
    final_summary_data = df_summary_5

    # Create shift weight threshold data sets
    df_min_sw_check = df_summary_2[
        df_summary_2[SAMPLED_COUNT_COLUMN].notnull()
        & (df_summary_2[MIN_WEIGHT_COLUMN] < int(MINIMUM_WEIGHT_THRESHOLD))]
    df_max_sw_check = df_summary_2[
        df_summary_2[SAMPLED_COUNT_COLUMN].notnull()
        & (df_summary_2[MAX_WEIGHT_COLUMN] > int(MAXIMUM_WEIGHT_THRESHOLD))]

    # Merge shift weight threshold data sets
    df_sw_thresholds_check = pd.merge(df_min_sw_check,
                                      df_max_sw_check,
                                      on=colset1,
                                      how='outer')

    # Collect data outside of specified threshold

    if len(df_sw_thresholds_check) > 0:
        threshold_string = ""
        for index, record in df_sw_thresholds_check.iterrows():
            threshold_string += "___||___" \
                                + df_sw_thresholds_check.columns[0] + " : " + str(record[0]) + " | " \
                                + df_sw_thresholds_check.columns[1] + " : " + str(record[1]) + " | " \
                                + df_sw_thresholds_check.columns[2] + " : " + str(record[2]) + " | " \
                                + df_sw_thresholds_check.columns[3] + " : " + str(record[3])
        log.warning('Shift weight outside thresholds for: ' + threshold_string)

    return final_output_data, final_summary_data
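The shift weight computed above is the product of the shift factor, the crossings factor and the migration sampling interval for the case. A worked numeric sketch with invented values:

# Invented values for illustration only.
shift_factor = 12.5       # e.g. possible shifts / sampled shifts for the stratum
crossings_factor = 1.0    # defaulted to 1 where no crossings factor applies
migration_si = 2.0        # migration sampling interval (MIGSI)

shift_wt = shift_factor * crossings_factor * migration_si
print(round(shift_wt, 3))  # 25.0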
def do_ips_minweight_calculation(df_surveydata, serial_num, shift_weight,
                                 nr_weight, min_weight):
    """
    Author       : James Burr
    Date         : Jan 2018
    Purpose      : Performs the calculation of minimums weights
    Parameters   : df_surveydata - dataframe containing the survey data
                 : serial_num - name of the column containing the serial number
                 : shift_weight - name of the column containing calculated shift_wt values
                 : nr_weight - name of the column containing calculated non_response_wt values
                 : min_weight - name of the column to contain calculated mins_wt values
    Returns      : df_out, containing a list of serial numbers with the corresponding calculated mins_wt values
                 : df_summary, containing a summary of supporting variables related to mins_wt.
    Requirements : NA
    Dependencies : NA
    """

    df_surveydata_new = df_surveydata[df_surveydata[shift_weight].notnull()]

    df_surveydata_new = df_surveydata_new[
        df_surveydata_new[nr_weight].notnull()]

    df_surveydata_new["MINS_CTRY_GRP_PV"].fillna(0, inplace=True)

    df_surveydata_new['SWNRwght'] = df_surveydata_new[
        shift_weight] * df_surveydata_new[nr_weight]

    df_surveydata_sorted = df_surveydata_new.sort_values(STRATA)

    # Summarise the minimum responses by the strata
    df_mins = df_surveydata_sorted[df_surveydata_sorted[MINIMUM_FLAG_COLUMN] ==
                                   1]

    df_mins.reset_index(inplace=True)

    df_summin = df_mins.groupby(STRATA)['SWNRwght'].agg({
        PRIOR_WEIGHT_MINIMUM_COLUMN:
        'sum',
        MINIMUM_COUNT_COLUMN:
        'count'
    })

    df_summin.reset_index(inplace=True)

    # Summarise only full responses by strata
    df_fulls = df_surveydata_sorted[df_surveydata_sorted[MINIMUM_FLAG_COLUMN]
                                    == 0]

    df_sumfull = df_fulls.groupby(STRATA)['SWNRwght'].agg({
        PRIOR_WEIGHT_FULL_COLUMN:
        'sum',
        FULL_RESPONSE_COUNT_COLUMN:
        'count'
    })

    df_sumfull.reset_index(inplace=True)

    # Summarise the mig slot interviews by the strata
    df_migs = df_surveydata_sorted[df_surveydata_sorted[MINIMUM_FLAG_COLUMN] ==
                                   2]

    df_summig = df_migs.groupby(STRATA)['SWNRwght'].agg(
        {"sumPriorWeightMigs": 'sum'})

    df_summig.reset_index(inplace=True)

    # Calculate the minimum weight by the strata
    df_summin.sort_values(STRATA)
    df_sumfull.sort_values(STRATA)
    df_summig.sort_values(STRATA)

    df_summary = pd.merge(df_sumfull, df_summig, on=STRATA, how='outer')

    df_summary = df_summary.merge(df_summin, on=STRATA, how='outer')

    df_check_prior_gross_fulls = df_summary[
        df_summary[PRIOR_WEIGHT_FULL_COLUMN] <= 0]

    # Collect data outside of specified threshold
    threshold_string = ""
    for index, record in df_check_prior_gross_fulls.iterrows():
        threshold_string += "___||___" \
                            + df_check_prior_gross_fulls.columns[0] + " : " + str(record[0])

    if not df_check_prior_gross_fulls.empty and not df_summig.empty:
        log.error('Error: No complete or partial responses' + threshold_string)
    else:
        df_summary[min_weight] = np.where(
            df_summary[PRIOR_WEIGHT_FULL_COLUMN] > 0,
            (df_summary[PRIOR_WEIGHT_MINIMUM_COLUMN] +
             df_summary[PRIOR_WEIGHT_FULL_COLUMN]) /
            df_summary[PRIOR_WEIGHT_FULL_COLUMN], 1)

    # Replace missing values with 0
    df_summary[PRIOR_WEIGHT_MINIMUM_COLUMN].fillna(0, inplace=True)
    df_summary[PRIOR_WEIGHT_FULL_COLUMN].fillna(0, inplace=True)
    df_summary["sumPriorWeightMigs"].fillna(0, inplace=True)

    df_summary[PRIOR_WEIGHT_ALL_COLUMN] = df_summary[PRIOR_WEIGHT_MINIMUM_COLUMN] + \
                                          df_summary[PRIOR_WEIGHT_FULL_COLUMN] + \
                                          df_summary["sumPriorWeightMigs"]

    df_summary = df_summary.sort_values(STRATA)

    df_summary[min_weight] = np.where(
        df_summary[PRIOR_WEIGHT_FULL_COLUMN] > 0,
        ((df_summary[PRIOR_WEIGHT_MINIMUM_COLUMN] +
          df_summary[PRIOR_WEIGHT_FULL_COLUMN]) /
         df_summary[PRIOR_WEIGHT_FULL_COLUMN]), df_summary[min_weight])

    df_surveydata_sorted.fillna(0, inplace=True)

    # This merge creates two mins_wt columns, x and y.
    df_out = df_summary.merge(df_surveydata_sorted, on=STRATA, how='outer')

    # Remove empty mins_wt_y column and rename mins_wt_x to mins_wt
    df_out = df_out.drop(min_weight + '_y', axis=1)

    df_out.rename(index=str,
                  columns={min_weight + '_x': min_weight},
                  inplace=True)

    df_out.sort_values(serial_num)

    df_test_pre = pd.DataFrame(columns=[min_weight, MINIMUM_FLAG_COLUMN])

    df_test_post_1 = pd.DataFrame(columns=[min_weight, MINIMUM_FLAG_COLUMN])

    df_test_post_2 = pd.DataFrame(columns=[min_weight, MINIMUM_FLAG_COLUMN])

    df_test_pre[min_weight] = df_out[min_weight]

    df_test_pre[MINIMUM_FLAG_COLUMN] = df_out[MINIMUM_FLAG_COLUMN]

    # Set mins_wt to either 0 or 1 conditionally, then calculate the postweight value
    df_out[min_weight] = np.where(df_out[MINIMUM_FLAG_COLUMN] == 1.0, 0,
                                  df_out[min_weight])

    df_test_post_1[min_weight] = df_out[min_weight]

    df_test_post_1[MINIMUM_FLAG_COLUMN] = df_out[MINIMUM_FLAG_COLUMN]

    df_out[min_weight] = np.where(df_out[MINIMUM_FLAG_COLUMN] == 2.0, 1,
                                  df_out[min_weight])

    df_test_post_2[min_weight] = df_out[min_weight]

    df_test_post_2[MINIMUM_FLAG_COLUMN] = df_out[MINIMUM_FLAG_COLUMN]

    df_out['SWNRMINwght'] = df_out[shift_weight] * \
                            df_out[nr_weight] * \
                            df_out[min_weight]

    df_out_sliced = df_out[df_out[MINIMUM_FLAG_COLUMN] != 1]
    df_postsum = df_out_sliced.groupby(STRATA)['SWNRMINwght'].agg({
        POST_WEIGHT_COLUMN:
        'sum',
        CASES_CARRIED_FORWARD_COLUMN:
        'count'
    })

    df_postsum.reset_index(inplace=True)

    df_postsum.sort_values(STRATA)

    # Merge the updated dataframe with specific columns from GNR.
    df_summary = df_summary.merge(df_postsum, on=STRATA, how='outer')

    df_summary.drop(["sumPriorWeightMigs"], axis=1, inplace=True)

    df_summary.sort_values(STRATA, inplace=True)

    # Perform data validation
    df_fulls_below_threshold = df_summary[
        df_summary[FULL_RESPONSE_COUNT_COLUMN] < 30]
    df_mins_below_threshold = df_summary[df_summary[MINIMUM_COUNT_COLUMN] > 0]

    df_merged_thresholds = df_fulls_below_threshold.merge(
        df_mins_below_threshold, how='inner')
    df_merged_thresholds = df_merged_thresholds[STRATA]

    # Collect data outside of specified threshold
    threshold_string = ""
    for index, record in df_merged_thresholds.iterrows():
        threshold_string += "___||___" \
                            + df_merged_thresholds.columns[0] + " : " + str(record[0]) + " | " \
                            + df_merged_thresholds.columns[1] + " : " + str(record[1])
    if len(df_merged_thresholds) > 0:
        log.warning('Minimums weight outside thresholds for: ' +
                    threshold_string)

    df_out = df_out[[serial_num, min_weight]]

    # This block of rounding was largely used to test and to bring the results closer in line with the SAS results.
    # They can be removed if desired in order to produce a new standard test set.
    df_out[min_weight] = df_out[min_weight].round(3)
    columns_to_round = [
        PRIOR_WEIGHT_ALL_COLUMN, PRIOR_WEIGHT_FULL_COLUMN,
        PRIOR_WEIGHT_MINIMUM_COLUMN, min_weight, POST_WEIGHT_COLUMN
    ]
    df_summary[columns_to_round] = df_summary[columns_to_round].round(3)

    df_out = df_out.sort_values(serial_num)

    df_summary["MINS_CTRY_GRP_PV"] = df_summary["MINS_CTRY_GRP_PV"].replace(
        0, float('nan'))

    return df_out, df_summary
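Where a stratum has any fully responding cases, the minimums weight above reduces to (prior weight of minimum responses + prior weight of full responses) / prior weight of full responses. A worked numeric sketch with invented values:

# Invented values for illustration only.
sum_prior_weight_min = 40.0    # summed shift_wt * non_response_wt over minimum responses
sum_prior_weight_full = 160.0  # summed shift_wt * non_response_wt over full responses

mins_wt = (sum_prior_weight_min + sum_prior_weight_full) / sum_prior_weight_full
print(round(mins_wt, 3))  # 1.25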
Example #23
def do_ips_nrweight_calculation(survey_data, non_response_data,
                                non_response_weight_column, var_serial):
    """
    Author       : James Burr
    Date         : Jan 2018
    Purpose      : Performs calculations to find the nonresponse weight.
    Parameters   : survey_data = the IPS survey records for the period.
                 : non_response_data = SAS data set holding migrant non-response totals and
                 : ineligible totals by strata
                 : non_response_weight_column = Variable holding the name of the non-resp. weight field
                 : var_serial = Variable holding the name of the record number field
    Returns      : df_out - dataframe containing calculated values for non_response_weight
                 : df_summary - dataframe containing a list of various columns, including the calculated non_response_wt
    Requirements : NA
    Dependencies : NA
    """

    # drop NON_RESPONSE_WT column in survey data at start (this matches SAS log)
    if 'NON_RESPONSE_WT' in survey_data.columns:
        survey_data = survey_data.drop(columns=['NON_RESPONSE_WT'])

    # Formatting and fudgery
    # non_response_data['NR_PORT_GRP_PV'] = pd.to_numeric(non_response_data['NR_PORT_GRP_PV'], errors='coerce')
    # non_response_data['WEEKDAY_END_PV'] = pd.to_numeric(non_response_data['WEEKDAY_END_PV'], errors='coerce')
    # non_response_data.replace('None', np.nan, inplace=True)

    df_nonresponsedata_sorted = non_response_data.sort_values(SHIFTS_STRATA)

    survey_data['NR_PORT_GRP_PV'].fillna(0, inplace=True)
    survey_data['ARRIVEDEPART'].fillna(0, inplace=True)
    survey_data['WEEKDAY_END_PV'].fillna(0, inplace=True)
    df_surveydata_sorted = survey_data.sort_values(SHIFTS_STRATA)

    df_psw = df_surveydata_sorted.groupby(SHIFTS_STRATA)[PSW_COLUMN].agg(
        {PSW_COLUMN: 'mean'})

    # Flattens the column structure
    df_psw = df_psw.reset_index()

    # Only keep rows that exist in df_nonresponsedata_sorted
    df_grossmignonresp = pd.merge(df_nonresponsedata_sorted,
                                  df_psw,
                                  on=SHIFTS_STRATA,
                                  how='left')

    # Add gross values using the primary sampling weight and add two new columns
    # to df_grossmignonresp
    df_grossmignonresp['SHIFT_WT'].fillna(0, inplace=True)
    df_grossmignonresp['grossmignonresp'] = df_grossmignonresp[
        PSW_COLUMN] * df_grossmignonresp[NR_TOTALS_COLUMN]

    df_grossmignonresp['grossordnonresp'] = df_grossmignonresp[
        PSW_COLUMN] * df_grossmignonresp[NON_MIG_TOTALS_COLUMN]

    # Validate that non-response totals can be grossed
    df_migtotal_not_zero = df_grossmignonresp[
        df_grossmignonresp[NR_TOTALS_COLUMN] != 0]

    # TODO: Return error
    if len(df_migtotal_not_zero[
            df_migtotal_not_zero['grossmignonresp'].isnull()]) > 0:
        log.error('Unable to gross up non-response total.')

    # Summarise over non-response strata
    df_grossmignonresp = df_grossmignonresp.sort_values(NON_RESPONSE_STRATA)

    df_summignonresp = df_grossmignonresp.groupby(NON_RESPONSE_STRATA).agg({
        'grossmignonresp':
        'sum',
        'grossordnonresp':
        'sum'
    })

    # Flattens the column structure after adding the new grossmignonresp and grossordnonresp columns
    df_summignonresp = df_summignonresp.reset_index()

    df_summignonresp = df_summignonresp.rename(
        columns={'grossordnonresp': 'grossinelresp'})

    # Calculate the grossed number of respondents over the non-response strata

    # Use only records in which NR_FLAG_PV is 0
    df_surveydata_sliced = df_surveydata_sorted.loc[
        df_surveydata_sorted[NR_FLAG_COLUMN] == 0]

    df_surveydata_sliced = df_surveydata_sliced.sort_values(
        NON_RESPONSE_STRATA)

    # Create two new columns as aggregations of SHIFT_WT
    df_sumresp = df_surveydata_sliced.groupby(
        NON_RESPONSE_STRATA)[PSW_COLUMN].agg({
            GROSS_RESP_COLUMN: 'sum',
            RESP_COUNT_COLUMN: 'count'
        })

    # Flattens the column structure after adding the new gross_resp and count_resps columns
    df_sumresp = df_sumresp.reset_index()

    # Calculate the grossed number of T&T non-respondents of the non-response strata

    # Use only records from the survey dataset where the NR_FLAG_PV is 1, then sort
    df_surveydata_sliced = df_surveydata_sorted.loc[
        df_surveydata_sorted[NR_FLAG_COLUMN] == 1]

    df_surveydata_sliced = df_surveydata_sliced.sort_values(
        NON_RESPONSE_STRATA)

    # Create new column using the sum of ShiftWt
    df_sumordnonresp = df_surveydata_sliced.groupby(
        NON_RESPONSE_STRATA)[PSW_COLUMN].agg({'grossordnonresp': 'sum'})

    # Flattens the column structure after adding the new grossordnonresp column
    df_sumordnonresp = df_sumordnonresp.reset_index()

    # Sort values in the three dataframes required for the next calculation
    df_sumordnonresp = df_sumordnonresp.sort_values(NON_RESPONSE_STRATA)

    df_sumresp = df_sumresp.sort_values(NON_RESPONSE_STRATA)

    df_summignonresp = df_summignonresp.sort_values(NON_RESPONSE_STRATA)

    # Use the calculated data frames to calculate the non-response weight

    # Merge previously sorted dataframes into one, ensuring all rows from summignonresp are kept
    df_gnr = df_summignonresp.merge(df_sumresp,
                                    on=NON_RESPONSE_STRATA,
                                    how='outer')

    df_gnr = df_gnr.sort_values(NON_RESPONSE_STRATA)

    df_gnr = df_gnr.merge(df_sumordnonresp, on=NON_RESPONSE_STRATA, how='left')

    # Replace all NaN values in columns with zero's
    df_gnr['grossmignonresp'].fillna(0, inplace=True)
    df_gnr['grossinelresp'].fillna(0, inplace=True)
    df_gnr['grossordnonresp'].fillna(0, inplace=True)

    # Add in two new columns with checks to prevent division by 0
    df_gnr[GNR_COLUMN] = np.where(
        df_gnr[GROSS_RESP_COLUMN] != 0, df_gnr['grossordnonresp'] +
        df_gnr['grossmignonresp'] + df_gnr['grossinelresp'], 0)

    df_gnr[non_response_weight_column] = np.where(
        df_gnr[GROSS_RESP_COLUMN] != 0,
        (df_gnr[GNR_COLUMN] + df_gnr[GROSS_RESP_COLUMN]) /
        df_gnr[GROSS_RESP_COLUMN], np.NaN)

    df_gross_resp_is_zero = df_gnr[df_gnr[GROSS_RESP_COLUMN] == 0]

    # Collect data outside of specified threshold
    threshold_string = ""
    for index, record in df_gross_resp_is_zero.iterrows():
        threshold_string += "___||___" \
                            + df_gross_resp_is_zero.columns[0] + " : " + str(record[0])

    if len(df_gross_resp_is_zero) > 0:
        log.error('Gross response is 0.' + threshold_string)

    # Sort df_gnr and df_surveydata ready for producing summary
    df_gnr = df_gnr.sort_values(NON_RESPONSE_STRATA)

    # Ensure only complete or partial responses are kept
    df_surveydata_sorted = df_surveydata_sorted.loc[
        df_surveydata_sorted[NR_FLAG_COLUMN] == 0]

    # Produce summary by merging survey data and gnr data together, then sort
    df_out = df_surveydata_sorted.merge(df_gnr[NON_RESPONSE_STRATA +
                                               [non_response_weight_column]],
                                        on=NON_RESPONSE_STRATA,
                                        how='left')

    df_out = df_out.sort_values(NON_RESPONSE_STRATA)

    # Create and add three new columns calculated using SHIFT_WT
    df_summary = df_out.groupby(SHIFTS_STRATA)[PSW_COLUMN].agg({
        MEAN_SW_COLUMN:
        'mean',
        RESP_COUNT_COLUMN:
        'count',
        PRIOR_SUM_COLUMN:
        'sum'
    })

    # Flatten column structure
    df_summary.reset_index(inplace=True)

    # Create and add one new column calculated using 'non_response_wt' in a
    # different dataframe due to difficulty in creating all four new columns
    # simultaneously in a single dataframe
    df_summary_nr = df_out.groupby(
        SHIFTS_STRATA)[non_response_weight_column].agg(
            {MEAN_NRW_COLUMN: 'mean'})

    # Flatten column structure
    df_summary_nr.reset_index(inplace=True)

    # Merge all four new columns into the same dataframe
    df_summary = df_summary.merge(df_summary_nr, on=SHIFTS_STRATA, how='outer')

    # Merge the updated dataframe with specific columns from GNR.
    df_summary = df_gnr[NON_RESPONSE_STRATA +
                        [GNR_COLUMN, GROSS_RESP_COLUMN]].merge(
                            df_summary, on=NON_RESPONSE_STRATA, how='outer')

    # Calculate new non_response_wt value if condition is met
    df_out[non_response_weight_column] = np.where(
        df_out[MIG_FLAG_COLUMN] == 0,
        (df_out[non_response_weight_column] * df_out[TAND_TSI_COLUMN]) /
        df_out[MIG_SI_COLUMN], df_out[non_response_weight_column])

    # Perform data validation
    df_count_below_threshold = df_summary[df_summary[RESP_COUNT_COLUMN] > 0]
    df_gnr_below_threshold = df_summary[df_summary[GNR_COLUMN] > 0]

    df_merged_thresholds = df_count_below_threshold.merge(
        df_gnr_below_threshold, how='inner')

    df_merged_thresholds = df_merged_thresholds[
        df_merged_thresholds[RESP_COUNT_COLUMN] < 30]

    df_merged_thresholds = df_merged_thresholds[NON_RESPONSE_STRATA]

    # Collect data outside of specified threshold
    threshold_string = ""
    for index, record in df_merged_thresholds.iterrows():
        threshold_string += "___||___" \
                            + df_merged_thresholds.columns[0] + " : " + str(record[0]) + " | " \
                            + df_merged_thresholds.columns[1] + " : " + str(record[1])
    if len(df_merged_thresholds) > 0:
        log.warning('Respondent count below minimum threshold for : ' +
                    threshold_string)

    # Reduce output to just key value pairs
    df_out = df_out[[var_serial, non_response_weight_column]]

    return df_out, df_summary
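The core formula above is non_response_wt = (gnr + gross_resp) / gross_resp, where gnr is the grossed total of ordinary, migrant and ineligible non-response. A worked numeric sketch with invented values:

# Invented values for illustration only.
gross_ord_non_resp = 30.0
gross_mig_non_resp = 5.0
gross_inel_resp = 15.0
gross_resp = 200.0

gnr = gross_ord_non_resp + gross_mig_non_resp + gross_inel_resp
non_response_wt = (gnr + gross_resp) / gross_resp
print(round(non_response_wt, 3))  # 1.25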