Example #1
def main(dataset_id, data_type_id):
    ''' Maps data with the following steps:
            1) imports input data
            2) runs mapping function
            3) expands icd codes to fill garbage acause(n) where necessary
            4) applies sex and cause restrictions
    '''
    this_dataset = md.MI_Dataset(dataset_id, 3, data_type_id)
    metric_name = this_dataset.metric
    uid_cols = md.get_uid_cols(3)
    input_data = this_dataset.load_input().reset_index(drop=True)
    input_data.loc[input_data['im_frmat_id'].isnull()
                   & input_data['frmat_id'].eq(9), 'im_frmat_id'] = 9
    df = md.stdz_col_formats(input_data)
    #  Ensure that there is no "all age" data. Remove this line after updating
    #   the preceding steps to stop using the old cancer age formats
    df = df.loc[df['age'] != 1, :]
    # Map data and apply restrictions
    mapped_df = map_data(df, this_dataset)
    restricted_df = cm.restrict_causes(mapped_df,
                                       cause_col='gbd_cause',
                                       data_type_id=data_type_id,
                                       restrict_age=False)
    md.complete_prep_step(restricted_df, this_dataset)
    print("mapping complete.\n")
Example #2
def disaggregate_acause(df, ds_instance):
    ''' Description: Returns a dataframe in which metric values have been
            distributed across all associated acauses
        How it Works: Utilizes the create_metric_weights function to reshape
            the df so that acause is in long form. Adds proportions to
            each acause by observation, then applies those proportions to
            split the input metric value across the attributed acauses.
            Finally, collapses to re-combine data to single datapoints by
            gbd_cause and acause.
            NOTE: this process drops the 'cause' and 'cause_name' columns.

    '''
    # Build uid lists with and without the acause columns
    metric = ds_instance.metric
    uids_noAcause = [c for c in md.get_uid_cols(5) if 'acause' not in c]
    acause_cols = [a for a in df.columns if 'acause' in a]
    all_uids = md.get_uid_cols(5)
    needs_split = (df['acause2'].notnull() & ~df['acause2'].isin([""]))
    to_split = df.loc[needs_split, :]
    no_split = df.loc[~needs_split, :]
    # If no split is needed, simply return the dataframe with a renamed acause1
    if len(to_split) == 0:
        df.rename(columns={'acause1': 'acause'}, inplace=True)
        acause_cols.remove('acause1')
        df.drop(labels=acause_cols, axis=1, inplace=True)
        return (df)
    print("disaggregating acause...")
    # create weights used for splitting
    weight_df = pp.create_metric_weights(df, all_uids, ds_instance)
    # calculate proportions based on the weights
    proportions_df = pp.add_proportions(weight_df, uids_noAcause)
    # adjust by proportions
    is_split = to_split.merge(proportions_df)
    is_split['split_value'] = is_split[metric]
    is_split.loc[:, metric] = is_split['proportion'] * is_split['split_value']
    is_split = md.stdz_col_formats(is_split)
    #
    no_split.rename(columns={'acause1': 'acause'}, inplace=True)
    acause_cols.remove('acause1')
    no_split.drop(labels=acause_cols, axis=1, inplace=True)
    #
    output = no_split.append(is_split)
    pt.verify_metric_total(df, output, metric, "disaggregate acause")
    return (output.loc[:, no_split.columns.tolist()])
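pp.create_metric_weights and pp.add_proportions are project helpers whose internals are not shown here, but the core arithmetic of a proportional split can be sketched with plain pandas. A toy example under that assumption, with 'obs' standing in for the uid columns:

import pandas as pd

# one observation with 10 deaths attributed to two candidate acauses
long_df = pd.DataFrame({'obs': [1, 1],
                        'acause': ['neo_liver', 'neo_stomach'],
                        'deaths': [10.0, 10.0],
                        'wgt': [3.0, 1.0]})
# proportion of each acause within its observation
long_df['proportion'] = (long_df['wgt']
                         / long_df.groupby('obs')['wgt'].transform('sum'))
# apply the proportions to split the metric across acauses
long_df['deaths'] = long_df['deaths'] * long_df['proportion']
print(long_df[['obs', 'acause', 'deaths']])  # 7.5 and 2.5; the total of 10 is preserved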
Example #3
def main(dataset_id, data_type_id, split_num):
    ''' Worker entry point for one redistribution split: loads the split,
            redistributes data where possible, and saves the result
    '''
    # Load input
    metric_dict = {'2': 'cases', '3': 'deaths'}
    this_dataset = md.MI_Dataset(dataset_id, 6, data_type_id)
    metric_name = this_dataset.metric
    rdp_input = manager.get_rdp_file(this_dataset, 'rdp_input')
    input_data = pd.read_hdf(rdp_input, 'split_{}'.format(split_num))

    # rename sex_id until rdp package names are updated
    input_data.rename(columns={'sex_id': 'sex'}, inplace=True)

    # Redistribute data where possible
    if not manager.needs_rdp(input_data, this_dataset):
        print("    no redistribution needed for ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        save_worker_output(input_data, this_dataset, split_num)
        return (input_data)
    else:
        print("    redistributing ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        # Add maps to enable RDP
        input_data.rename(columns={'uid': 'split_group'}, inplace=True)
        mapped = add_location_hierarchy_info(input_data)
        # RDP cannot run without location metadata, and should not run for hiv
        #   Set aside those data
        skip_rdp_mask = cannot_redistribute(mapped)
        set_aside = mapped.loc[skip_rdp_mask, input_data.columns.tolist()]
        to_redistribute = mapped.loc[~skip_rdp_mask, :]
        # Redistribute remaining data
        if to_redistribute.any().any():
            rdp_results = run_rdp_core(to_redistribute, this_dataset,
                                       split_num)
            # Recombine
            if set_aside.any().any():
                rdp_results = rdp_results.append(set_aside, ignore_index=True)
            to_finalize = rdp_results
        else:
            print("    No data to redistribute. Finalizing.")
            to_finalize = input_data.rename(columns={'cause': 'acause'})
        output_cols = md.get_uid_cols(7)
        # rename sex_id until rdp packages get updated
        output_cols = ['sex' if x == 'sex_id' else x for x in output_cols]

        to_finalize = cm.correct_causes(to_finalize)
        finalized_df = dft.collapse(to_finalize,
                                    by_cols=output_cols,
                                    stub=metric_name)
        # Check totals (note: because of data precision, data before and after
        #   may not be precisely equivalent)
        diff = finalized_df[metric_name].sum() - input_data[metric_name].sum()
        assert abs(diff/input_data[metric_name].sum()) < 5, \
                    "Difference from input after rdp is too large"
        save_worker_output(finalized_df, this_dataset, split_num)
        return (finalized_df)
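The mask-and-set-aside flow above (exclude rows that cannot or should not be redistributed, process the rest, then recombine and check totals) is independent of the rdp machinery. A condensed sketch of that flow, with a hypothetical skip condition and a no-op standing in for run_rdp_core:

import pandas as pd

df = pd.DataFrame({'cause': ['C50', 'hiv', 'C34'], 'deaths': [5.0, 2.0, 3.0]})
skip_mask = df['cause'].eq('hiv')      # e.g. rdp should not run for hiv
set_aside = df.loc[skip_mask]
processed = df.loc[~skip_mask].copy()  # run_rdp_core would transform these rows
result = pd.concat([processed, set_aside], ignore_index=True)
# totals before and after should agree within a small tolerance
assert abs(result['deaths'].sum() - df['deaths'].sum()) < 1e-6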
Example #4
def update_redistributed_acause(df, ds_instance, split_num):
    ''' Returns dataframe (df) after merging with maps to update cause information
        -- Maps:
            decimal cause map : used to revert cause names to decimal form
            cause map : used to validate output causes
    '''
    metric_name = ds_instance.metric
    output_uids = md.get_uid_cols(7)
    output_uids = ['sex' if x == 'sex_id' else x for x in output_uids]

    #

    def manage_rdp_remnants(df, temp_folder, split_num, metric):
        ''' Verifies whether any garbage codes remain unmapped after
                redistribution, and reports them
        '''
        # Collect any codes that did not merge with the cause map
        rdp_error = ((df['acause'].isnull() | (df['_merge'] == 'left_only'))
                     & df[ds_instance.metric].isin([np.nan, 0]))
        rdp_error_list = sorted(df.loc[rdp_error, 'cause'].unique().tolist())
        if len(rdp_error_list):
            print("The following causes are not in the cause map:")
            print(rdp_error_list)
        return (None)

    # Convert acause back to cancer cause
    code_format_updates = {  # not necessary once rdp uses the map in the cancer db
        'C0': 'C00',
        'C1': 'C01',
        'C2': 'C02',
        'C3': 'C03',
        'C4': 'C04',
        'C4A': 'C04',
        'C5': 'C05',
        'C6': 'C06',
        'C7': 'C07',
        'C8': 'C08',
        'C9': 'C09',
        'neo_other': 'neo_other_cancer'
    }
    for key, value in code_format_updates.items():
        df.loc[df['acause'] == key, 'acause'] = value
    # Merge with cause map
    df.rename(columns={'acause': 'cause'}, inplace=True)  # rename to merge with the cause map
    cause_map = cm.load_rdp_cause_map(ds_instance.data_type_id)
    df = df.merge(cause_map,
                  how='left',
                  on=['coding_system', 'cause'],
                  indicator=True)
    # Check that all data were mapped to cause
    manage_rdp_remnants(df, ds_instance.temp_folder, split_num, metric_name)
    # Reformat to merge data with original source
    df = df.loc[:, output_uids + [metric_name]]
    final_df = dft.collapse(df, by_cols=output_uids, stub=metric_name)
    return (final_df)
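The left merge with indicator=True is what makes the remnant check possible: rows tagged 'left_only' had no counterpart in the cause map. A standalone sketch of the technique with a toy map:

import pandas as pd

data = pd.DataFrame({'coding_system': ['ICD10', 'ICD10'],
                     'cause': ['C00', 'ZZZ'],
                     'deaths': [4.0, 1.0]})
cause_map = pd.DataFrame({'coding_system': ['ICD10'],
                          'cause': ['C00'],
                          'acause': ['neo_lip_oral']})
merged = data.merge(cause_map, how='left',
                    on=['coding_system', 'cause'], indicator=True)
unmapped = merged.loc[merged['_merge'] == 'left_only', 'cause'].unique()
print(sorted(unmapped))  # ['ZZZ'] -- a code missing from the cause map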
Example #5
def main(dataset_id, data_type_id):
    ''' Runs cause disaggregation for the dataset: disaggregates acause,
            redistributes kaposi sarcoma (and nmsc garbage for incidence),
            maps remaining garbage, then collapses and saves the result
    '''
    # prep_step 5 = cause_disaggregation
    this_dataset = md.MI_Dataset(dataset_id, 5, data_type_id)
    input_data = this_dataset.load_input()
    metric = this_dataset.metric
    uid_cols = md.get_uid_cols(5)
    input_data = input_data.loc[~input_data['age'].isin(
        [26, 3, 4, 5, 6, 91, 92, 93, 94]), :]
    # Format and add observation numbers
    formatted_input = prep_for_disagg(input_data.copy(), uid_cols, metric)
    # Disaggregate
    disaggregated_df = core.disaggregate_acause(formatted_input, this_dataset)
    # update uid columns to account for reshaped acause
    uid_cols = [u for u in uid_cols if 'acause' not in u] + ['acause']
    #
    kaposi_df = core.redist_kaposi(disaggregated_df, metric, uid_cols)
    if data_type_id == 2:
        adjusted_df = core.redist_nmsc_gc(kaposi_df, metric)
    else:
        adjusted_df = kaposi_df
    final_df = core.map_remaining_garbage(adjusted_df, data_type_id)
    # run test functions and save output
    pt.verify_metric_total(input_data, adjusted_df,
                           metric, "cause disaggregation module")
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(6)
    final_df = md.stdz_col_formats(final_df)
    final_df = dft.collapse(final_df,
                            by_cols=output_uids,
                            func='sum',
                            combine_cols=metric
                            )
    # save
    md.complete_prep_step(final_df, this_dataset)
    print("Acause disaggregated")
Example #6
def run_split_metric(this_dataset):
    ''' Splits metric data by age and sex where needed; otherwise passes
            the data through unchanged
    '''
    print("   splitting metric data...")
    uid_cols = md.get_uid_cols(4)
    metric_name = this_dataset.metric
    input_data = this_dataset.load_input()
    # save if no splitting is needed
    if not (pt.has_combinedSex(input_data) |
            pt.has_age_unk(input_data, metric_name) |
            pt.has_nonStdAge(input_data)):
        manage_no_split(input_data, metric_name, uid_cols, this_dataset)
    else:
        manage_split(input_data, metric_name, uid_cols, this_dataset)
    return(None)
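The pt.has_* predicates decide the branch. Their implementations are not shown, but the comments in manage_split (Example #10) indicate that sex_id 3 and 9 mark combined/unknown sex, so a plausible stand-in for one of them looks like this (hypothetical helper, not the project's code):

import pandas as pd

def has_combined_sex(df):
    # hypothetical stand-in for pt.has_combinedSex: sex_id 3 and 9
    # mark aggregated/unknown sex per the manage_split comments
    return df['sex_id'].isin([3, 9]).any()

df = pd.DataFrame({'sex_id': [1, 2, 3], 'cases': [1.0, 2.0, 3.0]})
print(has_combined_sex(df))  # True -> data would be routed to manage_split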
Example #7
File: run_sr.py Project: cheth-rowe/ihmexp
def reformat_input(df, ds_instance):
    ''' Collapse and reshape input data from standardize_format output
    '''
    metric_name = ds_instance.metric
    uid_cols = md.get_uid_cols(2)
    wide_uid_cols = [u for u in uid_cols if 'age' not in u]
    uids_noCause = [u for u in uid_cols if 'cause' not in u]
    df.loc[df['im_frmat_id'].isnull() & df['frmat_id'].isin([9]), 'im_frmat_id'] = 9
    df = md.stdz_col_formats(df)
    df = dft.collapse(df, by_cols=wide_uid_cols, func='sum', stub=metric_name)
    df = dft.wide_to_long(df, stubnames=metric_name,
                          i=wide_uid_cols, j='age')
    df = df.groupby(uid_cols, as_index=False)[metric_name].sum()
    df[metric_name].fillna(value=0, inplace=True)
    df = md.stdz_col_formats(df)
    df = dft.make_group_id_col(df, uids_noCause, id_col='uniqid')
    return(df)
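dft.wide_to_long is a project wrapper, but the reshape it performs resembles pandas' own pd.wide_to_long: age-suffixed metric columns become one metric column keyed by an age value. A standalone sketch with toy wide-format data:

import pandas as pd

wide = pd.DataFrame({'registry_index': ['r1'],
                     'sex_id': [1],
                     'cases3': [5.0],    # cases in age group 3
                     'cases4': [7.0]})   # cases in age group 4
long = pd.wide_to_long(wide, stubnames='cases',
                       i=['registry_index', 'sex_id'], j='age').reset_index()
print(long)  # one row per (registry_index, sex_id, age), single 'cases' column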
Example #8
def run_split_pop(this_dataset):
    ''' Splits population data by age and sex where needed before saving
    '''
    print("   splitting population...")
    uid_cols = md.get_uid_cols(prep_type_id=4, is_pop=True)
    df = this_dataset.load_pop(prep_type_id=1)
    if "registry_id" in df.columns:  # some old 00_pop files still have registry_id column instead of registry_index
        df.rename(columns={'registry_id': 'registry_index'}, inplace=True)
    # Exit if no population present
    if len(df) == 0:
        return(None)
    # Temporarily reshape and update the dataframe until the input is no
    #   longer in STATA format
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    df = dft.wide_to_long(df,
                          stubnames='pop',
                          i=uids_noAge,
                          j='age',
                          drop_others=True)
    df = df.loc[df.age != 1, :]  # drop 'all ages'
    pop_cols = [p for p in df.columns.values if "pop" in p]
    df[pop_cols] = df[pop_cols].fillna(0)  # assign back: fillna(inplace=True) on a .loc slice fills a copy
    df = md.stdz_col_formats(df)
    if not (pt.has_combinedSex(df) |
            pt.has_age_unk(df, "pop") |
            pt.has_nonStdAge(df)
            ):
        manage_no_split(df, "pop", uid_cols, this_dataset)
    else:
        # replace missing data with zeroes
        uid_cols += ['location_id', 'country_id', 'year']
        df = md.add_location_ids(df)
        df = modeled_locations.add_country_id(df)
        # data with no country_id have no population estimates.
        #   global estimate should be used to generate weights
        df.loc[df['country_id'] == 0, 'location_id'] = 1
        # add mean year to facilitate merge with population weights
        df = gct.add_year_id(df)
        # Split data
        manage_split(df, "pop", uid_cols, this_dataset)
    return(None)
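One pandas pitfall in this function is worth isolating: calling fillna(..., inplace=True) on a .loc column selection fills a copy, not the original frame, so the zero-fill must be assigned back (as in the corrected line above). A minimal demonstration:

import pandas as pd
import numpy as np

df = pd.DataFrame({'pop': [np.nan, 2.0]})
df.loc[:, ['pop']].fillna(value=0, inplace=True)  # fills a copy; no effect
print(df['pop'].isnull().any())                   # True -- still missing
df[['pop']] = df[['pop']].fillna(0)               # assign back instead
print(df['pop'].isnull().any())                   # False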
Example #9
def validate_mapping(in_df, out_df, metric):
    ''' Tests the mapping output to verify results.  
    '''
    uid_cols = md.get_uid_cols(3)
    test_results = {'missing entries': [], 'new entries': []}
    in_df = md.stdz_col_formats(in_df)
    out_df = md.stdz_col_formats(out_df)
    test_df = pd.merge(in_df[uid_cols],
                       out_df[uid_cols],
                       on=uid_cols,
                       how='outer',
                       indicator=True)
    if test_df['_merge'].isin(["left_only"]).any():
        test_results['missing entries'] = test_df.loc[
            test_df._merge == "left_only", uid_cols].to_dict()
    if test_df['_merge'].isin(["right_only"]).any():
        test_results['new entries'] = test_df.loc[
            test_df._merge == "right_only", uid_cols].to_dict()
    if len(out_df) != len(in_df):
        test_results['missing or extra uids'] = "fail"
    pt.verify_metric_total(in_df, out_df, metric, "mapping test")
    tests.checkTestResults(test_results, 'validate mapping', displayOnly=False)
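The validation hinges on an outer merge with indicator=True over the uid columns: 'left_only' rows are uids lost during mapping, 'right_only' rows are uids that appeared from nowhere. A standalone sketch:

import pandas as pd

in_df = pd.DataFrame({'registry_index': ['r1', 'r2']})
out_df = pd.DataFrame({'registry_index': ['r1', 'r3']})
test_df = in_df.merge(out_df, on='registry_index', how='outer', indicator=True)
missing = test_df.loc[test_df['_merge'] == 'left_only', 'registry_index']
new = test_df.loc[test_df['_merge'] == 'right_only', 'registry_index']
print(list(missing), list(new))  # ['r2'] ['r3']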
Example #10
def manage_split(df, metric_name, uid_cols, this_dataset):
    ''' Converts age and sex categories in the df to those used by the cancer prep process.
        1) Adds obs number
        2) Splits aggregated ages
        3) Combines disaggregated ages
        4) Splits unknown age category
        5) Splits aggregated/unknown sex category
    '''
    is_pop = bool(metric_name == "pop")
    df[metric_name].fillna(value=0, inplace=True)
    split_df = df.copy()
    # add observation number by group, without age
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    split_df = core.add_missing_ages(split_df, uids_noAge, metric_name)
    obs_numbers = split_df[uids_noAge].drop_duplicates()
    obs_numbers['obs'] = obs_numbers.reset_index().index
    split_df = split_df.merge(obs_numbers)

    uid_cols.append('obs')
    # generate cause_weight
    if is_pop:
        cause_wgts = pp.gen_pop_wgts("age_wgt", df['location_id'].unique())
    else:
        # create weights used for splitting
        cause_wgts = pp.create_metric_weights(split_df, uid_cols, this_dataset)

        # collapse to get one weight per observation, dropping redundant entries beforehand
        cause_wgts.drop_duplicates(subset=['obs','age','sex_id'], inplace=True)
        cause_wgts = dft.collapse(cause_wgts, by_cols=['obs', 'age', 'sex_id'],
                                  func='sum', combine_cols='wgt')
    # split
    if pt.has_nonStdAge(split_df):
        print("      splitting non-standard age...")
        split_df = core.split_age(dataset=split_df,
                                  wgt_df=cause_wgts,
                                  metric=metric_name,
                                  uid_cols=uid_cols)
    # collapse remaining under5 and 80+ ages
    # split_df =  md.collapse_youngAndOld_ages(split_df, uid_cols, metric_name)
    # redistribute "unknown age" data according to the current distribution of cases/deaths
    if pt.has_age_unk(split_df, metric_name):
        print("      splitting unknown age...")
        # create weights
        split_df = core.split_unknown_age(dataset=split_df,
                                          wgt_df=cause_wgts,
                                          metric=metric_name,
                                          uid_cols=uid_cols)
    # check for errors. If more than 3 metric totals are greater than .0001% different, alert user
    at.compare_pre_post_split(split_df, df, metric_name)
    # split sex = 3 and sex = 9 data
    if pt.has_combinedSex(split_df):
        print("      splitting sex...")
        if metric_name == "pop":
            sex_split_prop = pp.gen_pop_wgts(
                "sex_wgt", df['location_id'].unique())
        else:
            sex_split_prop = pp.create_sex_weights(cause_wgts,
                                                   uid_vars=['obs', 'age'],
                                                   metric=metric_name)
        split_df = core.split_sex(split_df,
                                  sex_split_prop,
                                  uid_cols,
                                  metric=metric_name)
    # collapse remaining under-5 and 80+ ages
    # final_df =  md.collapse_youngAndOld_ages(split_df, uid_cols, metric_name)
    final_df = split_df.copy(deep=True)
    final_df = final_df.loc[~final_df['age'].isin([26]), :]
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(5, is_pop)
    final_df = dft.collapse(final_df,
                            by_cols=output_uids,
                            func='sum',
                            combine_cols=metric_name
                            )
    # save and exit
    md.complete_prep_step(final_df, this_dataset, is_pop)
    return(None)
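The observation-number step near the top (one integer id per unique combination of the non-age uids) can be isolated into a small sketch; the drop_duplicates/reset_index/merge sequence is the same pattern used above, here with toy data:

import pandas as pd

df = pd.DataFrame({'registry_index': ['r1', 'r1', 'r2'],
                   'sex_id': [1, 1, 2],
                   'cases': [2.0, 3.0, 4.0]})
uids_no_age = ['registry_index', 'sex_id']
obs_numbers = df[uids_no_age].drop_duplicates()
obs_numbers['obs'] = obs_numbers.reset_index().index  # 0, 1, ...
df = df.merge(obs_numbers)
print(df)  # rows sharing a uid combination share an 'obs' id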
Example #11
File: run_sr.py Project: cheth-rowe/ihmexp
def submit_sr(calc_df, this_dataset):
    ''' Splits data based on subtotal-recalculation requirement and submits
            jobs as needed to recalculate subtotals. Then returns a re-combined
            dataset with subtotals recalculated
    '''
    def submission_req(df, uid): 
        ''' Returns boolean indicating whether data are to be submitted, 
                qualified by whether subtotals are present and whether any 
                component codes exist that could enable recalculation
        '''
        uid_test = df[df['uniqid'].eq(uid)]
        meets_requirement = bool(has_subtotals(uid_test, 'orig_cause')
                                 and components_present(uid_test))
        return(meets_requirement)

    def output_file_func(id):
        ''' Returns the split_output file path for a given job id
                (passed to get_results)
        '''
        return(get_sr_file(this_dataset, 'split_output', id[0]))
    
    #
    output_uids = md.get_uid_cols(3)
    metric_name = this_dataset.metric
    job_header = "cnSR_{}_{}".format(dataset_id, data_type_id)
    sr_input_file = get_sr_file(this_dataset, "sr_input")
    worker_script = utils.get_path("subtotal_recalculation_worker",
                                                        process="mi_dataset")
    # convert components to string to enable save in hdf file
    uniqid_map = calc_df[output_uids + ['uniqid', 'orig_cause']
                         ].copy().drop_duplicates()
    submitted_data, unsubmitted_data = cup.split_submission_data(calc_df, 
                                        group_id_col='uniqid',
                                        submission_requirement=submission_req, 
                                        hdf_file=sr_input_file,
                                        regenerate_hdf=False)
    if len(submitted_data) == 0:
        final_results = unsubmitted_data
    else:
        uid_list = submitted_data['uniqid'].unique().tolist()
        sr_jobs = cup.generate_prep_workers(worker_script,
                                    list_of_uids=uid_list,
                                    ds_instance=this_dataset,
                                    job_header=job_header,
                                    is_resubmission=is_resubmission)
        output_files = cup.get_results(sr_jobs, 
                                    output_file_func,
                                    parent_process_name="sr",
                                    noisy_checker=True,
                                    add_resubmission_argument=is_resubmission,
                                    wait_time=5)
        # Re-combine compiled results with the set-aside data, before collapsing
        #   and testing
        results = pe.read_files(output_files)
        results.rename(columns={'cause':'orig_cause','codes_remaining':'cause'},
                       inplace=True)
        results = md.stdz_col_formats(results, additional_float_stubs='uniqid')
        results = results.merge(uniqid_map, how='outer', indicator=True)
        assert results['_merge'].isin(["both", "right_only"]).all(), \
            "Error merging with uids"
        del results['_merge']
        # entries with blank "cause" could not be corrected. replace with the 
        #   original aggregate (will be handled by cause recalculation and rdp).
        results.loc[results['cause'].eq(""), 'cause'] = results['orig_cause']
        #  drop causes that were zeroed in subtotal recalculation 
        results['total'] = results.groupby(output_uids)[metric_name].transform('sum')
        results = results.loc[results['total'].ne(0) &
                                results[metric_name].notnull(), :]
        final_results = results.append(unsubmitted_data)
    # Re-combine with data that were not split
    final_results = dft.collapse(final_results, by_cols=output_uids,
                                    combine_cols=this_dataset.metric)
    return(final_results)
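The zero-total filter near the end relies on groupby().transform, which broadcasts each group's sum back onto its rows so the frame can be filtered without collapsing it. A self-contained sketch:

import pandas as pd

df = pd.DataFrame({'uid': ['a', 'a', 'b'],
                   'deaths': [0.0, 0.0, 5.0]})
df['total'] = df.groupby('uid')['deaths'].transform('sum')
kept = df.loc[df['total'].ne(0) & df['deaths'].notnull()]
print(kept)  # only the 'b' rows survive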
Example #12
def map_data(input_df, this_dataset):
    ''' Returns the input dataframe with attached cause mapping
        Process:
            1) Declares mapping order, then applies maps in that order
                NOTE: there will be no need to iterate this step once 
                    custom codes are mapped at the input stage
            2) Verifies map application 
    '''
    print("Applying maps...")
    # Define the order in which maps will be merged with the data
    map_order = {1: 'dataset_id', 2: 'country_id', 3: 'coding_system'}

    # Define the columns used to merge the maps with the data
    merge_cols = ['coding_system', 'cause', 'cause_name']
    uid_cols = [c for c in md.get_uid_cols(3) if c not in merge_cols]
    # Prepare for merge with cause map
    unmapped_df = md.add_location_ids(input_df)

    check_cols = [
        'cause', 'cause_name', 'registry_index', 'sex_id', 'year_start',
        'year_end'
    ]
    check_duplicates(unmapped_df, check_cols)

    unmapped_df['dataset_id'] = this_dataset.dataset_id
    # Generate inputs for iterative mapping
    cause_map = cm.load_cause_map(this_dataset.data_type_id)
    map_cols = ['coding_system', 'cause', 'cause_name', 'gbd_cause'] +\
                [a for a in cause_map.columns if 'acause' in a]
    mapped_df = pd.DataFrame()
    # Apply maps for CUSTOM coding systems.
    #   Note that this should run once with only 'coding_system' used to merge
    for o in map_order.values():
        # set() ensures that each column appears only once in the list
        subset_cols = list(set(map_cols + [o]))
        subset_entries = (cause_map.coding_system.str.contains("CUSTOM")
                          & ~pd.isnull(cause_map[o]) & (cause_map[o] != 0))
        map_subset = cause_map.loc[subset_entries, subset_cols]
        if len(map_subset) > 0:
            # Note: index is reset and then set to preserve index values of the unmapped_df dataframe
            #       in the newly_mapped dataframe
            newly_mapped = unmapped_df.reset_index().merge(
                map_subset, how='inner',
                on=merge_cols + [o]).set_index('index')
            mapped_df = mapped_df.append(newly_mapped)
            unmapped_df.drop(list(newly_mapped.index), axis=0, inplace=True)
            cause_map.drop(list(map_subset.index), axis=0, inplace=True)
    # Apply maps by non-CUSTOM coding_system
    final_map = cause_map[map_cols].copy()  # copy to avoid writing to a slice of cause_map below
    # capitalize ICCC3 codes in the cause map to match the causes
    iccc3 = final_map['coding_system'] == "ICCC3"
    final_map.loc[iccc3, 'cause'] = final_map.loc[iccc3, 'cause'].str.title()

    newly_mapped = unmapped_df.reset_index().merge(
        final_map, how='inner', on=merge_cols).set_index('index')
    check_duplicates(newly_mapped, check_cols)
    mapped_df = mapped_df.append(newly_mapped)
    unmapped_df.drop(list(newly_mapped.index), axis=0, inplace=True)
    # Test output
    assert len(unmapped_df) == 0, \
        "ERROR: could not map all causes. Missing the following: \ncause: {} \ncause_name: {}".format(
            unmapped_df.cause.unique(), unmapped_df.cause_name.unique()
        )
    mt.validate_mapping(input_df, mapped_df, this_dataset.metric)

    # special consideration for Germ cell tumors, trophoblastic tumors, and neoplasms of gonads (ICCC code: X)
    # dropping both sexes for now until we have proportions set up
    mapped_df.loc[(mapped_df['acause1'] == "gender_specific") &
                  (mapped_df['sex_id'] == 2),
                  ['gbd_cause', 'acause1']] = "neo_ovarian"
    mapped_df.loc[(mapped_df['acause1'] == "gender_specific") &
                  (mapped_df['sex_id'] == 1),
                  ['gbd_cause', 'acause1']] = "neo_testicular"

    # Replace non-icd causes with "CUSTOM"
    # NOTE: this section should be moved to a different script or deleted altogether
    mapped_df.loc[~mapped_df.coding_system.isin(["ICD10", "ICD9_detail"]),
                  "coding_system"] = "CUSTOM"
    mapped_df.drop(labels=['country_id', 'dataset_id', 'location_id'],
                   axis=1,
                   inplace=True)
    # Return
    return (mapped_df)
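The reset_index()/set_index('index') pairing in map_data preserves the original row labels through an inner merge, which is what lets matched rows be dropped from unmapped_df by index on each pass. A condensed sketch of that mechanism:

import pandas as pd

unmapped = pd.DataFrame({'cause': ['C00', 'C01', 'XYZ']})
cause_map = pd.DataFrame({'cause': ['C00', 'C01'],
                          'gbd_cause': ['neo_lip_oral', 'neo_lip_oral']})
newly_mapped = (unmapped.reset_index()
                        .merge(cause_map, how='inner', on='cause')
                        .set_index('index'))
unmapped = unmapped.drop(list(newly_mapped.index))
print(newly_mapped)  # mapped rows keep their original index labels
print(unmapped)      # only 'XYZ' remains for later passes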