Example #1
def validate_mapping(in_df, out_df, metric):
    ''' Tests the mapping output to verify results. Stops the process
        if discrepancies are found between the input and output.
    '''
    uid_cols = md.get_uid_cols(3)
    test_results = {'missing entries': [], 'new entries': []}
    in_df = md.stdz_col_formats(in_df)
    out_df = md.stdz_col_formats(out_df)
    test_df = pd.merge(in_df[uid_cols], out_df[uid_cols],
                       on=uid_cols, how='outer', indicator=True)
    if test_df['_merge'].eq("left_only").any():
        test_results['missing entries'] = \
            test_df.loc[test_df._merge.eq("left_only"), uid_cols].to_dict()
    if test_df['_merge'].eq("right_only").any():
        test_results['new entries'] = \
            test_df.loc[test_df._merge.eq("right_only"), uid_cols].to_dict()
    if len(out_df) != len(in_df):
        test_results['missing or extra uids'] = "fail"
    pt.verify_metric_total(in_df, out_df, metric, "mapping test")
    tests.checkTestResults(test_results, 'validate mapping', displayOnly=False)
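The validation above hinges on pandas' outer-merge indicator pattern: merging the input and output uids with indicator=True labels every row as "left_only" (lost during mapping), "right_only" (newly created), or "both". A minimal, self-contained sketch of that pattern, using a made-up registry_index key:

import pandas as pd

in_df = pd.DataFrame({'registry_index': [1, 2, 3]})
out_df = pd.DataFrame({'registry_index': [2, 3, 4]})
merged = pd.merge(in_df, out_df, on='registry_index',
                  how='outer', indicator=True)
# rows present in the input but missing from the output, and vice versa
print(merged.loc[merged['_merge'].eq('left_only'), 'registry_index'].tolist())   # [1]
print(merged.loc[merged['_merge'].eq('right_only'), 'registry_index'].tolist())  # [4]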
Example #2
def get_age_frmat_map(frmat_type):
    ''' Loads the map indicating which age groups need splitting
    '''
    if frmat_type == "im_frmat":
        resource = pd.read_csv(
            utils.get_path('im_frmat_map', process="mi_dataset"))
    elif frmat_type == "frmat":
        resource = pd.read_csv(
            utils.get_path('frmat_map', process="mi_dataset"))
    else:
        # avoid a NameError below when an unknown type is passed
        raise ValueError("unrecognized frmat_type: {}".format(frmat_type))
    resource = md.stdz_col_formats(
        resource, additional_float_stubs=['age_specific', 'age_split'])
    return resource
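The shape of the loaded resource is not shown here; purely as an illustration, assume the frmat map pairs an frmat code and age group with an age_split flag (the column names are assumptions, loosely echoing the additional_float_stubs above). Selecting the rows that require splitting would then look like:

import pandas as pd
from io import StringIO

# hypothetical map contents, assumed for illustration only
frmat_map = pd.read_csv(StringIO(
    "frmat,age,age_split\n"
    "9,2,1\n"
    "9,3,0\n"))
needs_split = frmat_map.loc[frmat_map['age_split'].eq(1)]
print(needs_split)  # only the (frmat 9, age 2) row requires splitting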
Example #3
def manage_no_split(df, metric_name, uid_cols, this_dataset):
    ''' Finalizes and saves the dataset when no split is needed
    '''
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    df = md.stdz_col_formats(df)
    df[metric_name] = df[metric_name].fillna(value=0)
    final_df = df.copy(deep=True)
    # collapse of remaining under-5 and 80+ ages is currently disabled:
    # final_df = md.collapse_youngAndOld_ages(df, uids_noAge, metric_name)
    md.complete_prep_step(final_df, this_dataset,
                          is_pop=(metric_name == "pop"))
    return None
def disaggregate_acause(df, ds_instance):
    ''' Description: Returns a dataframe in which metric values have been
            distributed across all associated acauses
        How it Works: Utilizes the create_metric_weights function to reshape
            the df so that acause is in long form. Adds proportions to
            each acause by observation, then applies those proportions to
            split the input metric value across the attributed acauses.
            Finally, collapses to re-combine data to single datapoints by
            gbd_cause and acause.
            NOTE: this process drops the 'cause' and 'cause_name' columns.

    '''
    metric = ds_instance.metric
    # Ensure that 'acause' is not listed as a uid
    uids_noAcause = [c for c in md.get_uid_cols(5) if 'acause' not in c]
    acause_cols = [a for a in df.columns if 'acause' in a]
    all_uids = md.get_uid_cols(5)
    needs_split = (df['acause2'].notnull() & ~df['acause2'].isin([""]))
    # copy the subsets so later in-place renames/drops do not hit a view
    to_split = df.loc[needs_split, :].copy()
    no_split = df.loc[~needs_split, :].copy()
    # If no split is needed, simply return the dataframe with a renamed acause1
    if len(to_split) == 0:
        df.rename(columns={'acause1': 'acause'}, inplace=True)
        acause_cols.remove('acause1')
        df.drop(labels=acause_cols, axis=1, inplace=True)
        return (df)
    print("disaggregating acause...")
    # create weights used for splitting
    weight_df = pp.create_metric_weights(df, all_uids, ds_instance)
    # calculate proportions based on the weights
    proportions_df = pp.add_proportions(weight_df, uids_noAcause)
    # adjust by proportions
    is_split = to_split.merge(proportions_df)
    is_split['split_value'] = is_split[metric]
    is_split.loc[:, metric] = is_split['proportion'] * is_split['split_value']
    is_split = md.stdz_col_formats(is_split)
    #
    no_split.rename(columns={'acause1': 'acause'}, inplace=True)
    acause_cols.remove('acause1')
    no_split.drop(labels=acause_cols, axis=1, inplace=True)
    #
    output = pd.concat([no_split, is_split])
    pt.verify_metric_total(df, output, metric, "disaggregate acause")
    return (output.loc[:, no_split.columns.tolist()])
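At its core, the acause split is a proportional redistribution: each observation's metric value is multiplied by a per-acause proportion that sums to 1 within the observation, so the total is preserved. A toy sketch of that arithmetic, with made-up weights standing in for the output of pp.create_metric_weights and pp.add_proportions:

import pandas as pd

df = pd.DataFrame({
    'uid': [1, 1],
    'acause': ['neo_liver', 'neo_stomach'],
    'weight': [30.0, 10.0],
    'cases': [100.0, 100.0],   # the input total repeated on each acause row
})
df['proportion'] = df['weight'] / df.groupby('uid')['weight'].transform('sum')
df['cases'] = df['cases'] * df['proportion']
print(df[['acause', 'cases']])   # 75.0 and 25.0; the original 100 is preserved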
Example #5
def run_split_pop(this_dataset):
    ''' Splits population data reported in combined age groups
    '''
    print("   splitting population...")
    uid_cols = md.get_uid_cols(prep_type_id=4, is_pop=True)
    df = this_dataset.load_pop(prep_type_id=1)
    # update column names (some old 00_pop files still have registry_id column 
    #   instead of registry_index)
    if "registry_id" in df.columns:  
        df.rename(columns={'registry_id': 'registry_index'}, inplace=True)
    # Exit if no population present
    if len(df) == 0:
        return None
    # Temporarily reshape and update the dataframe until the input is no
    #   longer in Stata format
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    df = dft.wide_to_long(df,
                          stubnames='pop',
                          i=uids_noAge,
                          j='age',
                          drop_others=True)
    df = df.loc[df.age != 1, :]  # drop 'all ages'
    pop_cols = [p for p in df.columns.values if "pop" in p]
    # fillna on a .loc slice would not modify df; assign the result instead
    df[pop_cols] = df[pop_cols].fillna(value=0)
    df = md.stdz_col_formats(df)
    if not (pt.has_combinedSex(df) |
            pt.has_age_unk(df, "pop") |
            pt.has_nonStdAge(df)
            ):
        manage_no_split(df, "pop", uid_cols, this_dataset)
    else:
        # attach location information needed to generate weights
        uid_cols += ['location_id', 'country_id', 'year']
        df = md.add_location_ids(df)
        df = modeled_locations.add_country_id(df)
        # data with no country_id have no population estimates.
        #   global estimate should be used to generate weights
        df.loc[df['country_id'] == 0, 'location_id'] = 1
        # add mean year to facilitate merge with population weights
        df = add_year_id(df)
        # Split data
        manage_split(df, "pop", uid_cols, this_dataset)
    return None
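dft.wide_to_long is project code, but the reshape it performs mirrors pandas' built-in wide_to_long. A sketch under the assumption that the wide population file stores age-stubbed columns pop1, pop2, ...:

import pandas as pd

wide = pd.DataFrame({'registry_index': ['r1'], 'sex': [1],
                     'pop1': [1000.0], 'pop2': [800.0]})
long = pd.wide_to_long(wide, stubnames='pop',
                       i=['registry_index', 'sex'], j='age').reset_index()
print(long)   # one row per (registry_index, sex, age) with a single 'pop' column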
def main(dataset_id, data_type_id):
    ''' Disaggregates uids that are mapped to multiple gbd causes, including
            garbage codes, Kaposi Sarcoma, and non-melanoma skin cancer
    '''
    # prep_step 5 = cause_disaggregation
    this_dataset = md.MI_Dataset(dataset_id, 5, data_type_id)
    input_data = this_dataset.load_input()
    metric = this_dataset.metric
    uid_cols = md.get_uid_cols(5)
    input_data = input_data.loc[
        ~input_data['age'].isin([26, 3, 4, 5, 6, 91, 92, 93, 94]), :]
    # Format and add observation numbers
    formatted_input = prep_for_disagg(input_data.copy(), uid_cols, metric)
    # Disaggregate
    disaggregated_df = core.disaggregate_acause(formatted_input, this_dataset)
    # update uid columns to account for reshaped acause
    uid_cols = [u for u in uid_cols if 'acause' not in u] + ['acause']
    #
    kaposi_df = core.redist_kaposi(disaggregated_df, metric, uid_cols)
    if data_type_id == 2:
        adjusted_df = core.redist_nmsc_gc(kaposi_df, metric)
    else:
        adjusted_df = kaposi_df
    final_df = core.map_remaining_garbage(adjusted_df, data_type_id)
    # run test functions and save output
    pt.verify_metric_total(input_data, adjusted_df, metric,
                           "cause disaggregation module")
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(6)
    final_df = md.stdz_col_formats(final_df)
    final_df = dft.collapse(final_df,
                            by_cols=output_uids,
                            func='sum',
                            combine_cols=metric)
    # save
    md.complete_prep_step(final_df, this_dataset)
    print("Acause disaggregated")
Example #7
def main(dataset_id, data_type_id):
    ''' Maps data with the following steps:
            1) imports input data
            2) runs mapping function
            3) expands icd codes to fill garbage acause(n) where necessary
            4) applies sex and cause restrictions
    '''
    this_dataset = md.MI_Dataset(dataset_id, 3, data_type_id)
    metric_name = this_dataset.metric
    uid_cols = md.get_uid_cols(3)
    input_data = this_dataset.load_input().reset_index(drop=True)
    input_data.loc[input_data['im_frmat'].isnull() &
                   input_data['frmat'].eq(9), 'im_frmat'] = 9
    df = md.stdz_col_formats(input_data)
    # Ensure that there is no "all age" data.
    df = df.loc[df['age'] != 1, :]
    # Map data and apply restrictions
    mapped_df = map_data(df, this_dataset)
    restricted_df = cm.restrict_causes(mapped_df,
                                       cause_col='gbd_cause',
                                       data_type_id=data_type_id,
                                       restrict_age=False)
    md.complete_prep_step(restricted_df, this_dataset)
    print("mapping complete.\n")
def run(dataset_id, data_type_id, uid):
    ''' Preps data for recalculation then recalculates as necessary
    '''
    this_dataset = md.MI_Dataset(dataset_id, 2, data_type_id)
    dataset_name = this_dataset.name
    metric = this_dataset.metric
    input_file = run_sr.get_sr_file(this_dataset, "sr_input")
    # Exit if output already exists
    output_file = run_sr.get_sr_file(this_dataset, 'split_output', uid)
    print(output_file)
    if os.path.exists(output_file):
        print("     output file found for uid " + str(uid))
        return None
    #
    negative_data_ok = is_exception(dataset_id, data_type_id)
    error_folder = utils.get_path("mi_input", base='j_temp')
    subcause_issue_file = '{}/subcause_issue/{}_{}_uid_{}.txt'.format(
        error_folder, dataset_name, data_type_id, uid)
    exception_file = '{}/negative_data/{}_{}_uid_{}.csv'.format(
        error_folder, dataset_name, data_type_id, uid)
    for d in [subcause_issue_file, exception_file, error_folder]:
        utils.ensure_dir(d)
    #
    print("    removing subtotals from uid {}...".format(uid))
    # add data for the given uid
    df = pd.read_hdf(input_file, 'split_{}'.format(uid))
    # Create a list of possible codes so that decimal subcauses are only added 
    #   if available
    input_cause_list = sorted(df['orig_cause'].unique().tolist())
    # create a dictionary for codes in the selected uid and attach the uid's 
    #   data
    uid_subset = {}
    input_data = {}
    # process decimals first and ranges last to ensure that nested causes are 
    #   removed
    for c in sorted(df['orig_cause'].unique().tolist()):
        uid_subset[c] = {}
        input_data[c] = {}
        uid_subset[c]['codes'] = []
        uid_subset[c]['subcodes'] = []
        if "-" not in c and "," not in c:
            uid_subset[c]['codes'].append(c)
            # add subcodes to 'subcode' key
            for subcode in sorted(df.loc[df['orig_cause'] == c, 'cause']
                                  .dropna().unique().tolist()):
                if subcode != c:
                    uid_subset[c]['subcodes'].append(subcode)
            # if none of the subcodes appear in the list, set the cause as a 
            #   subcode of itself (prevents the addition of unused decimal 
            #   causes)
            if not len(uid_subset[c]['subcodes']):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
            elif (not any('{}.'.format(sub[:3]) in check 
                    for check in input_cause_list 
                        for sub in uid_subset[c]['subcodes'])):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
        else:
            for code in sorted(df.loc[df['orig_cause'].eq(c), 'cause']
                               .dropna().unique().tolist()):
                uid_subset[c]['codes'].append(code)
                uid_subset[c]['subcodes'].append(code)
        # create other lists associated with the cause and add the metric data
        uid_subset[c]['subcauses_remaining'] = []
        uid_subset[c]['codes_removed'] = []
        uid_subset[c]['causes_removed'] = []
        uid_subset[c]['data'] = df.loc[df['cause'].eq(c),
                                        ['age', metric]].set_index('age')
        input_data[c]['data'] = uid_subset[c]['data']
        input_data[c]['codes'] = uid_subset[c]['codes']
    # Determine subcauses and highest number of causes remaining (how many 
    #   subcauses are contained within each cause)
    uid_set = set_subcauses(uid_subset, subcause_issue_file)
    highest_level = determine_highest_level(uid_set)
    # remove lowest level codes from parent causes
    if highest_level == 0:
        print('     no subcauses present.')
    else:
        subcauses_removed = True
        while subcauses_removed:
            uid_set, subcauses_removed = remove_subcauses(
                uid_set, uid, exception_file)
            # remove duplicates
            uid_set = remove_duplicates(uid_set)
            # re-set subcauses and num_subcause_remaining
            uid_set, highest_level = set_subcauses(
                uid_set, subcause_issue_file,)
            print("     subcauses removed.")
    # Prepare Output
    print("saving output...")
    output = pd.DataFrame(
        columns=['cause', 'codes_remaining', 'codes_removed', 'age', metric])
    for c in uid_set:
        # format cause information
        cause_data = pd.DataFrame(
            columns=['cause', 'codes_remaining', 'codes_removed'])
        cause_data.loc[0, ['cause']] = c
        # if nothing was removed, or there was only a single cause, or all of 
        #   the input codes are still present, set the codes remaining as the 
        #   cause
        if (not len(uid_set[c]['codes_removed']) or 
            ("-" not in c and "," not in c) or 
            set(input_data[c]['codes']) <= set(uid_set[c]['codes'])):
            cause_data.loc[0, ['codes_remaining']] = c
        else:
            cause_data.loc[0, ['codes_remaining']] = ','.join(
                convert_to_range(uid_set[c]['codes']))
        cause_data.loc[0, ['codes_removed']] = ','.join(
            convert_to_range(uid_set[c]['codes_removed']))
        # format output data
        output_data = uid_set[c]['data']
        output_data['age'] = output_data.index
        output_data['cause'] = c
        orig_data = input_data[c]['data']
        orig_data['age'] = orig_data.index
        orig_data = orig_data.rename(
            columns={metric: 'orig_metric_value'})
        orig_data['cause'] = c
        # combine and add to output
        final = pd.merge(output_data, cause_data, on='cause')
        final = pd.merge(final, orig_data, on=['cause', 'age'])
        output = pd.concat([output, final])
    # Create output dataset
    output['uniqid'] = uid
    # Update encoding (bug fix to work around pandas to_stata issue)
    output = md.stdz_col_formats(output, additional_float_stubs='uniqid')
    # Export results
    output.to_csv(output_file, index=False)
    print('\n Done!')
    time.sleep(1)
    return None
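The subtotal-removal logic above tracks which component codes fall inside each reported code range; once the components are known, removing a subtotal is a subtraction that keeps each case or death counted exactly once. A stripped-down sketch of that idea with made-up ICD-style codes and counts:

import pandas as pd

data = pd.DataFrame({'cause': ['C18-C20', 'C18', 'C19'],
                     'deaths': [50.0, 30.0, 15.0]})
is_range = data['cause'].str.contains('-')
component_total = data.loc[~is_range, 'deaths'].sum()
# what remains in the range after removing the separately reported components
data.loc[is_range, 'deaths'] -= component_total   # 50 - 45 = 5
print(data)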