def redist_nmsc_gc(df, metric):
    ''' Splits non-melanoma skin cancer data proportionately into subcauses
        ------
        Inputs:
            df : a mortality-incidence input dataset at stage 5
            metric : one of ['pop', 'cases', 'deaths']
        Returns:
            df itself (unchanged) when none of its acause values appear in
            the NMSC proportions table. Otherwise a new DataFrame with the
            same columns as df in which every matching row has been merged
            with the proportions (on sex, acause and age), its metric
            multiplied by 'proportion' and its acause replaced by
            'mapped_cause'.
        Raises:
            AssertionError if any row that should be split has no matching
            proportion
    '''
    # local import so that this function does not depend on the module header
    import pandas as pd

    # load the proportions without modifying the frame handed back by pp, so
    #   that a cached/shared proportions table is never altered
    nmsc_props = pp.get_nmsc_proportions().rename(columns={'cause': 'acause'})
    nmsc_props = nmsc_props.drop(labels=['coding_system'], axis=1)
    # subset data to split. exit if no split necessary
    needs_split = df['acause'].isin(nmsc_props['acause'])
    to_split = df.loc[needs_split, :]
    no_split = df.loc[~needs_split, :]
    if len(to_split) == 0:
        return (df)
    print("disaggregating nmsc data...")
    # merge with proportions to split causes
    is_split = to_split.merge(nmsc_props,
                              on=['sex', 'acause', 'age'],
                              how='left',
                              indicator=True)
    # a single unmatched row would end up with a missing proportion (and so a
    #   missing metric value), therefore fail if ANY row could not be matched
    assert not is_split['_merge'].isin(["left_only"]).any(), \
        "Error during merge with NMSC proportions"
    # apply proportions
    is_split.loc[:, 'acause'] = is_split['mapped_cause']
    is_split['split_value'] = is_split[metric]
    is_split.loc[:, metric] = is_split['proportion'] * is_split['split_value']
    # DataFrame.append no longer exists in pandas >= 2.0
    output = pd.concat([no_split, is_split])
    pt.verify_metric_total(df, output, metric, 'NMSC')
    # drop the helper columns that were added by the merge
    return (output.loc[:, no_split.columns.tolist()])
Example #2
0
def split_unknown_age(dataset, wgt_df, uid_cols, metric):
    ''' Redistributes unknown-age data across the known ages
        --- Inputs ---
        dataset : DataFrame
                pandas dataframe. rows with age == 26 are handled as
                unknown-age data
        wgt_df  : DataFrame
                pandas dataframe with a 'wgt' column. merged onto the
                known-age rows before proportions are calculated
        uid_cols : list
                column names identifying observations. 'age' is removed from
                the list and re-added as the last entry
        metric  : string
                possible values: ['pop', 'cases', 'deaths']
        --- Returns ---
        DataFrame with the columns uid_cols + [metric] in which the metric of
        each known-age row has been increased by its share ('proportion') of
        the unknown-age data, and with 'frmat' set to 131
    '''
    assert 'wgt' in wgt_df.columns, "Error: no wgt column sent in wgt_df"
    # Ensure that 'age' is listed only once, as the last uid
    uids_noAge = [u for u in uid_cols if 'age' != u]
    uid_cols = uids_noAge + ['age']
    # Separate the unknown age data (age == 26) from the known ages
    unknown_age = dataset.loc[dataset['age'] == 26,
                              uids_noAge + [metric]].copy()
    unknown_age.rename(columns={metric: 'unknown_age_data'}, inplace=True)
    known_age = dataset.loc[dataset['age'] != 26, :].copy()
    # Attach weights and calculate the proportion of each known age
    with_weights = known_age.merge(wgt_df)
    astest.test_weights(known_age, with_weights)
    prop_df = pp.add_proportions(with_weights, uids_noAge)
    # NOTE(review): this is an inner merge, so known-age rows without a
    #   corresponding unknown-age row are dropped here - confirm that callers
    #   always supply an age == 26 row for every uid
    to_distribute = prop_df.merge(unknown_age)
    # assign the filled column back. an in-place fillna on a .loc selection
    #   is not guaranteed to modify the parent dataframe
    to_distribute['unknown_age_data'] = \
        to_distribute['unknown_age_data'].fillna(value=0)
    to_distribute['orig_data'] = to_distribute[metric].copy()
    to_distribute.loc[:, metric] += to_distribute['unknown_age_data'].multiply(
        to_distribute['proportion'])
    # copy so that the assignment below writes to an independent dataframe
    output = to_distribute[uid_cols + [metric]].copy()
    output.loc[:, 'frmat'] = 131
    pt.verify_metric_total(dataset, output, metric, "split unknown age")
    return (output)
Example #3
0
def split_age(dataset, wgt_df, metric, uid_cols, replacement_col='gbd_age'):
    ''' Splits aggregate ages
        --- Inputs ---
        dataset : DataFrame
                pandas dataframe
                must contain "obs" column indicating the observation number
                NOTE: missing 'im_frmat' values of the passed dataframe are
                filled with 9
        wgt_df  : DataFrame
                    pandas dataframe containing expected numbers for the metric at each age
        metric  : string
                possible values: ['pop', 'cases', 'deaths']
        uid_cols : list
                column names indicating unique identifiers for the wgt_df (eg. cause, location_id)
        replacement_col: string
                name of the column containing the new age values in the wgt_df
                (accepted for compatibility. not used within this function)
        --- Returns ---
        DataFrame containing the rows that did not need splitting together
        with the split rows. 'im_frmat' is set to 9 for all rows and 'frmat'
        is set to 131 wherever it is not 9
    '''
    # local import so that this function does not depend on the module header
    import pandas as pd

    assert 'wgt' in wgt_df.columns, "Error: no wgt column sent in wgt_df"
    # Ensure that 'age' is not listed as a uid, since it's being split
    #   (this drops every uid whose name contains 'age')
    uids_noAge = [u for u in uid_cols if 'age' not in u]
    # format codes that do not require splitting
    standard_age_frmats = [0]
    standard_age_im_frmats = [1, 2, 8, 9]
    # fill missing im_frmat with a standard im_frmat (9). the column is
    #   assigned back because an in-place fillna on a .loc selection is not
    #   guaranteed to modify the dataframe
    dataset['im_frmat'] = dataset['im_frmat'].fillna(value=9)
    dataset = add_missing_ages(dataset, uids_noAge, metric)
    # for each format type, mark which age categories need to be split per the
    #   corresponding format map. split only those categories. can split
    #   multiple age formats at once. do not split if frmat==9 (unknown) or
    #   if age is 26
    not_age_26 = ~dataset['age'].isin([26])
    nonstandard_frmat = ~dataset['frmat'].isin(standard_age_frmats + [9])
    nonstandard_im_frmat = ~dataset['im_frmat'].isin(standard_age_im_frmats)
    dataset['need_split'] = 0
    dataset.loc[nonstandard_frmat & not_age_26, 'need_split'] = 1
    # the comparison must be parenthesized: '&' binds more tightly than '!=',
    #   so without parentheses the three conditions are not combined as
    #   intended
    dataset.loc[nonstandard_im_frmat
                & (dataset['frmat'] != 9)
                & not_age_26, 'need_split'] = 1
    # Split age for each format type
    df = dataset.loc[dataset.need_split == 1, :].copy(deep=True)
    for frmat_type in ['frmat', 'im_frmat']:
        df = apply_age_spilt_proportions(df, frmat_type, wgt_df, uid_cols,
                                         metric)
    # recombine with the data that did not need to be split
    unadjusted = dataset.loc[dataset.need_split == 0, :].copy(deep=True)
    # DataFrame.append no longer exists in pandas >= 2.0
    output = pd.concat([unadjusted, df])
    del output['need_split']
    # update the format columns to reflect that ages are now split
    output.loc[:, 'im_frmat'] = 9
    output.loc[output['frmat'] != 9, 'frmat'] = 131
    pt.verify_metric_total(dataset, output, metric, "split age")
    return (output)
def disaggregate_acause(df, ds_instance):
    ''' Description: Returns a dataframe in which metric values have been
            distributed across all associated acauses
        How it Works: Utilizes the create_metric_weights function to reshape
            the df so that acause is in long form. Adds proportions to
            each acause by observation, then applies those proportions to
            split the input metric value across the attributed acauses.
        Inputs:
            df : DataFrame with columns 'acause1', 'acause2' (and possibly
                further columns whose name contains 'acause'). rows with a
                non-empty 'acause2' are split
            ds_instance : dataset object. its 'metric' attribute names the
                column to split. it is also passed to create_metric_weights
        Returns:
            a new DataFrame in which 'acause1' has been renamed to 'acause'
            and all other acause columns have been removed. the input df is
            not modified
        Raises:
            ValueError if df has no 'acause1' column
    '''
    # local import so that this function does not depend on the module header
    import pandas as pd

    metric = ds_instance.metric
    all_uids = md.get_uid_cols(5)
    # uids without any of the acause columns
    uids_noAcause = [c for c in all_uids if 'acause' not in c]
    # every acause column except acause1 is removed from the unsplit data
    extra_acause_cols = [a for a in df.columns if 'acause' in a]
    extra_acause_cols.remove('acause1')
    needs_split = (df['acause2'].notnull() & ~df['acause2'].isin([""]))
    to_split = df.loc[needs_split, :]
    no_split = df.loc[~needs_split, :]
    # If no split needed, simply return the dataframe with a renamed acause1
    if len(to_split) == 0:
        renamed = df.rename(columns={'acause1': 'acause'})
        return (renamed.drop(labels=extra_acause_cols, axis=1))
    print("disaggregating acause...")
    # create weights used for splitting
    weight_df = pp.create_metric_weights(df, all_uids, ds_instance)
    # calculate proportions based on the weights
    proportions_df = pp.add_proportions(weight_df, uids_noAcause)
    # adjust by proportions
    is_split = to_split.merge(proportions_df)
    is_split['split_value'] = is_split[metric]
    is_split.loc[:, metric] = is_split['proportion'] * is_split['split_value']
    is_split = md.stdz_col_formats(is_split)
    # format the unsplit data. no_split is a selection of df, so new objects
    #   are created rather than renaming/dropping in place
    no_split = no_split.rename(columns={'acause1': 'acause'})
    no_split = no_split.drop(labels=extra_acause_cols, axis=1)
    # recombine (DataFrame.append no longer exists in pandas >= 2.0)
    output = pd.concat([no_split, is_split])
    pt.verify_metric_total(df, output, metric, "disaggregate acause")
    return (output.loc[:, no_split.columns.tolist()])
Example #5
0
def apply_age_spilt_proportions(input_df, frmat_type, wgt_df, uid_cols,
                                metric):
    ''' combines weights with population to calculate proportions by which
            combined age groups are to be split, then splits data by those
            proportions
        --- Inputs ---
        input_df : DataFrame
                data to split. a 'dataset_id' column is dropped if present
        frmat_type : string
                name of the format column passed to mark_ages_to_be_split
                (callers in this file pass 'frmat' or 'im_frmat')
        wgt_df  : DataFrame
                expected values ("weights") merged onto the expanded ages
        uid_cols : list
                column names identifying observations. 'age' is removed from
                the list and re-added as the last entry
        metric  : string
                name of the column containing the values to split
        --- Returns ---
        DataFrame with the same columns as input_df (without 'dataset_id').
        If no rows are marked for expansion the unsplit copy of the input is
        returned
    '''
    # local import so that this function does not depend on the module header
    import pandas as pd

    # remove dataset_id if present in dataframe
    split_input = input_df.copy()
    if 'dataset_id' in split_input.columns:
        del split_input['dataset_id']
    # merge with the age format map and get an expanded dataframe with the ages
    #   to be split
    uids_noAge = [u for u in uid_cols if 'age' != u]
    uid_cols = uids_noAge + ['age']
    marked_df = mark_ages_to_be_split(split_input, frmat_type, uid_cols,
                                      metric)
    to_expand = marked_df.loc[marked_df['to_expand'].eq(1), :].copy()
    if len(to_expand) == 0:
        return (split_input)
    # merge with expected values ("weights"). the original age is kept as
    #   'split_age' and 'gbd_age' becomes the new 'age'
    to_expand.rename(columns={
        'age': 'split_age',
        'gbd_age': 'age'
    },
                     inplace=True)
    weighted_df = to_expand.merge(wgt_df)
    astest.test_weights(to_expand, weighted_df)
    # calculate proportions
    to_split = pp.add_proportions(weighted_df, uids_noAge + ['split_age'])
    # adjust by proportions
    to_split.loc[:, 'split_value'] = to_split[metric]
    to_split.loc[:, metric] = to_split['proportion'] * to_split['split_value']
    # collapse, then update format types of split data
    #   (DataFrame.append no longer exists in pandas >= 2.0)
    recombined_df = pd.concat(
        [to_split, marked_df.loc[marked_df['to_expand'] == 0, :]])
    adjusted_df = dft.collapse(recombined_df,
                               by_cols=uid_cols,
                               func='sum',
                               combine_cols=metric)
    astest.compare_pre_post_split(split_input, adjusted_df, metric)
    adjusted_df.loc[:, 'need_split'] = 1
    pt.verify_metric_total(split_input, adjusted_df, metric,
                           "apply age proportions")
    return (adjusted_df[split_input.columns.values])
def redist_kaposi(df, metric, uid_cols):
    ''' Adjusts Kaposi Sarcoma data to account for HIV-attributed cases.
        ------
        Inputs:
            df : a mortality-incidence input dataset at stage 5
            metric : one of ['pop', 'cases', 'deaths']
            uid_cols : list indicating column-set that uniquely identifies observations
        Returns:
            df itself (unchanged) when none of its acause values appear in
            the Kaposi proportions table. Otherwise a new DataFrame with the
            same columns as df. For uids with a proportion whose year range
            contains the data year, the metric is multiplied by 'proportion'
            and acause is replaced by 'target'. Kaposi data that match no
            year range are returned unchanged
        Raises:
            AssertionError if any Kaposi row has no matching proportion
    '''
    # load the proportions without modifying the frame handed back by pp, so
    #   that a cached/shared proportions table is never altered
    kaposi_prop = pp.get_kaposi_proportions().rename(
        columns={'cause': 'acause'})
    kaposi_prop = kaposi_prop.drop(labels=['coding_system'], axis=1)
    # subset data to split. exit if no split necessary
    is_kaposi = df['acause'].isin(kaposi_prop['acause'].unique())
    to_split = df.loc[is_kaposi, :]
    no_split = df.loc[~is_kaposi, :]
    if len(to_split) == 0:
        return (df)
    print("disaggregating kaposi sarcoma data...")
    # merge with weights
    to_split = to_split.merge(kaposi_prop,
                              on=['sex', 'acause', 'age'],
                              how='left',
                              indicator=True)
    assert not to_split['_merge'].isin(["left_only"]).any(), \
        "Error: Not all Kaposi data could be merged with proportions"
    # Mark those data that are both kaposi and have the correct year range.
    to_split = add_year_id(to_split)
    within_range = ((to_split['year'] >= to_split['year_start']) &
                    (to_split['year'] <= to_split['year_end']))
    # 'match' has to be 0 (not missing) outside of the year range. otherwise
    #   the group maximum is missing for uids without any match, the
    #   isin([0]) selection below finds nothing and those data are dropped
    to_split['match'] = within_range.astype(int)
    split_groups = to_split.groupby(uid_cols, as_index=False)['match'].max()
    # Split only marked data. the merge uses the shared columns (including
    #   'match'), so only rows within the year range are kept
    is_split = to_split.merge(split_groups[split_groups['match'].isin([1])],
                              how='inner')
    is_split['split_value'] = is_split[metric]
    is_split.loc[:, metric] = is_split['proportion'] * is_split['split_value']
    is_split.loc[:, 'acause'] = is_split['target']
    # Format kaposi data that did not meet any year range criteria. the merge
    #   with the proportions repeated these rows, so reduce them to the
    #   original columns and drop the repeats
    cant_split = to_split.merge(split_groups[split_groups['match'].isin([0])],
                                how='inner')
    cant_split = cant_split.loc[:, no_split.columns.tolist()].drop_duplicates()
    output = pd.concat([no_split, is_split, cant_split])
    pt.verify_metric_total(df, output, metric, 'kaposi_redist')
    return (output.loc[:, no_split.columns.tolist()])
Example #7
0
def validate_mapping(in_df, out_df, metric):
    ''' Compares the mapping output with the mapping input and hands the
            findings to tests.checkTestResults
        ------
        Inputs:
            in_df : dataframe as it was before mapping
            out_df : dataframe as it is after mapping
            metric : name of the column whose total is verified
        Checks:
            - uids present in in_df but absent from out_df ('missing entries')
            - uids present in out_df but absent from in_df ('new entries')
            - differing number of rows ('missing or extra uids')
            - metric total (through pt.verify_metric_total)
        NOTE(review): checkTestResults is called with displayOnly=False,
            which presumably stops the run on failure - confirm
    '''
    id_cols = md.get_uid_cols(3)
    findings = {'missing entries': [], 'new entries': []}
    in_df = md.stdz_col_formats(in_df)
    out_df = md.stdz_col_formats(out_df)
    # the outer merge labels each uid with the side(s) on which it exists
    comparison = in_df[id_cols].merge(out_df[id_cols],
                                      on=id_cols,
                                      how='outer',
                                      indicator=True)
    checks = (('missing entries', "left_only"),
              ('new entries', "right_only"))
    for label, side in checks:
        flagged = comparison['_merge'].eq(side)
        if flagged.any():
            findings[label] = comparison.loc[flagged, id_cols].to_dict()
    if len(in_df) != len(out_df):
        findings['missing or extra uids'] = "fail"
    pt.verify_metric_total(in_df, out_df, metric, "mapping test")
    tests.checkTestResults(findings, 'validate mapping', displayOnly=False)
def main(dataset_id, data_type_id):
    ''' Runs cause disaggregation (prep step 5) for one dataset: splits uids
            that are mapped to multiple gbd causes, redistributes Kaposi
            Sarcoma, redistributes non-melanoma skin cancer (only when
            data_type_id is 2), maps the remaining garbage, then collapses
            and saves the result
        ------
        Inputs:
            dataset_id : id of the dataset to process
            data_type_id : id of the data type. passed to MI_Dataset and
                map_remaining_garbage
    '''
    # prep_step 5 = cause_disaggregation
    dataset = md.MI_Dataset(dataset_id, 5, data_type_id)
    raw_input = dataset.load_input()
    metric = dataset.metric
    wide_uids = md.get_uid_cols(5)
    # remove the ages that are not processed by this step
    excluded_ages = [26, 3, 4, 5, 6, 91, 92, 93, 94]
    is_excluded = raw_input['age'].isin(excluded_ages)
    raw_input = raw_input.loc[~is_excluded, :]
    # Format and add observation numbers
    prepared = prep_for_disagg(raw_input.copy(), wide_uids, metric)
    # Disaggregate
    by_acause = core.disaggregate_acause(prepared, dataset)
    # the acause columns have been reshaped into a single 'acause' column
    long_uids = [u for u in wide_uids if 'acause' not in u]
    long_uids.append('acause')
    # redistribute special causes
    after_kaposi = core.redist_kaposi(by_acause, metric, long_uids)
    adjusted = (core.redist_nmsc_gc(after_kaposi, metric)
                if data_type_id == 2 else after_kaposi)
    mapped = core.map_remaining_garbage(adjusted, data_type_id)
    # run test functions and save output
    pt.verify_metric_total(raw_input, adjusted, metric,
                           "cause disaggregation module")
    # collapse to incorporate newly-split data
    final_uids = md.get_uid_cols(6)
    mapped = md.stdz_col_formats(mapped)
    mapped = dft.collapse(mapped,
                          by_cols=final_uids,
                          func='sum',
                          combine_cols=metric)
    # save
    md.complete_prep_step(mapped, dataset)
    print("Acause disaggregated")