def save_procedure_inputs(df, acause, location_id):
    ''' Formats and saves procedure data for upload into the epi database
    '''
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    epi_estimate_cols = ['mean', 'lower', 'upper']
    data = df.loc[:, uid_cols + draw_cols].copy()
    # apply formatting
    data.loc[data['age_group_id'].isin([33, 44, 301]), 'age_group_id'] = 235
    data = dft.collapse(data, by_cols=uid_cols, stub='draw')
    epi_df = epi_upload.format_draws_data(data)
    epi_df = epi_upload.convert_to_rate(epi_df, epi_estimate_cols, location_id)
    # Add metadata
    epi_df['measure'] = 'incidence'
    epi_df['unit_type'] = "Person*year"
    epi_df['extractor'] = getuser()
    epi_df['location_id'] = location_id
    # Finalize and export
    me_table = nd.load_me_table()
    for me_id in epi_df['modelable_entity_id'].unique():
        print("me_id " + str(me_id) + " sequela split")
        bundle_id = int(me_table.loc[me_table['modelable_entity_id'].eq(me_id),
                                     'bundle_id'].item())
        this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
        this_output = epi_upload.EpiUploadDataframe(this_output).data
        # Save output without testing (epi formatter has already tested data per
        #   epi specs)
        # add location_id to enable save_outputs
        this_output['location_id'] = location_id
        nd.save_outputs("dismod_inputs",
                        this_output,
                        acause,
                        bundle_id,
                        skip_testing=True)
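
The loop above depends on the internal nd and epi_upload helpers. A minimal, self-contained sketch of just the me-to-bundle lookup and per-me subsetting (the table contents below are invented for illustration):

import pandas as pd

# hypothetical stand-in for nd.load_me_table()
me_table = pd.DataFrame({'modelable_entity_id': [1234, 5678],
                         'bundle_id': [11, 22]})
epi_df = pd.DataFrame({'modelable_entity_id': [1234, 1234, 5678],
                       'mean': [0.1, 0.2, 0.3]})
for me_id in epi_df['modelable_entity_id'].unique():
    bundle_id = int(me_table.loc[me_table['modelable_entity_id'].eq(me_id),
                                 'bundle_id'].item())
    this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
    print(me_id, bundle_id, len(this_output))
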
def prep_for_disagg(df, uid_cols, metric):
    ''' Returns the dataset with updated coding systems, re-combining data in
            preparation for cause disaggregation
    '''
    df.loc[df.coding_system != "ICD9_detail", 'coding_system'] = 'ICD10'
    df = dft.collapse(df, by_cols=uid_cols, func='sum', stub=metric)
    return (df)
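
dft.collapse is an internal helper; for this step it acts like a groupby-sum. A minimal pandas sketch of the same recode-and-recombine logic under that assumption (column values invented):

import pandas as pd

df = pd.DataFrame({'coding_system': ['ICD10', 'ICD9_detail', 'ICD10_part'],
                   'cause': ['C50', '174', 'C50'],
                   'cases': [10, 5, 3]})
# recode everything except detailed ICD9 to ICD10 ...
df.loc[df.coding_system != 'ICD9_detail', 'coding_system'] = 'ICD10'
# ... then re-combine rows that now share the same uids
df = df.groupby(['coding_system', 'cause'], as_index=False)['cases'].sum()
print(df)  # the two C50 rows collapse into a single row with 13 cases
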
def calc_total_prevalence(df, uid_cols):
    ''' Calculates a prevalence "total" value to be uploaded for troubleshooting
    '''
    sum_df = df.loc[df['me_tag'].isin([
        'primary_phase', 'controlled_phase', 'metastatic_phase',
        'terminal_phase'
    ])].copy()
    sum_df.loc[:, 'me_tag'] = "computational_total"
    sum_df = dft.collapse(sum_df, by_cols=uid_cols, stub='prev')
    return (df.append(sum_df))
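
The same pattern in plain pandas, assuming dft.collapse sums the prev draws: sum the four phase rows into one "computational_total" row and append it (one invented draw column shown):

import pandas as pd

df = pd.DataFrame({
    'location_id': [1, 1, 1, 1],
    'me_tag': ['primary_phase', 'controlled_phase',
               'metastatic_phase', 'terminal_phase'],
    'prev_0': [0.010, 0.020, 0.005, 0.001],
})
phases = ['primary_phase', 'controlled_phase',
          'metastatic_phase', 'terminal_phase']
total = df.loc[df['me_tag'].isin(phases)].copy()
total['me_tag'] = 'computational_total'
total = total.groupby(['location_id', 'me_tag'], as_index=False)['prev_0'].sum()
result = pd.concat([df, total], ignore_index=True)
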
Example #4
def update_redistributed_acause(df, ds_instance, split_num):
    ''' Returns dataframe (df) after merging with maps to update cause information
        -- Maps:
            decimal cause map : used to revert cause names to decimal form
            cause map : used to validate output causes
    '''
    metric_name = ds_instance.metric
    output_uids = md.get_uid_cols(7)

    def manage_rdp_remnants(df, temp_folder, split_num, metric):
        ''' Verifies whether any garbage codes remain unmapped after
                redistribution
        '''
        # Get any codes that didn't merge and save them
        rdp_error = ((df['acause'].isnull() | (df['_merge'] == 'left_only'))
                     & df[ds_instance.metric].isin([np.nan, 0]))
        rdp_error_list = sorted(df.loc[rdp_error, 'cause'].unique().tolist())
        if len(rdp_error_list):
            print("The following causes are not in the cause map:")
            print(rdp_error_list)
        return (None)

    # Convert acause back to cancer cause
    code_format_updates = {
        'C0': 'C00',
        'C1': 'C01',
        'C2': 'C02',
        'C3': 'C03',
        'C4': 'C04',
        'C4A': 'C04',
        'C5': 'C05',
        'C6': 'C06',
        'C7': 'C07',
        'C8': 'C08',
        'C9': 'C09',
        'neo_other': 'neo_other_cancer'
    }
    for key, value in code_format_updates.items():
        df.loc[df['acause'] == key, 'acause'] = value
    # Merge with cause map
    df.rename(columns={'acause': 'cause'}, inplace=True)
    cause_map = cm.load_rdp_cause_map(ds_instance.data_type_id)
    df = df.merge(cause_map,
                  how='left',
                  on=['coding_system', 'cause'],
                  indicator=True)
    # Check that all data were mapped to cause
    manage_rdp_remnants(df, ds_instance.temp_folder, split_num, metric_name)
    # Reformat to merge data with original source
    df = df.loc[:, output_uids + [metric_name]]
    final_df = dft.collapse(df, by_cols=output_uids, stub=metric_name)
    return (final_df)
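
The merge-with-indicator pattern above is easy to verify in isolation. A self-contained sketch (map contents invented) showing how 'left_only' rows expose codes that are missing from the cause map:

import pandas as pd

data = pd.DataFrame({'coding_system': ['ICD10', 'ICD10'],
                     'cause': ['C50', 'C99'],   # C99 is deliberately unmapped
                     'deaths': [7, 2]})
cause_map = pd.DataFrame({'coding_system': ['ICD10'],
                          'cause': ['C50'],
                          'acause': ['neo_breast']})
merged = data.merge(cause_map, how='left',
                    on=['coding_system', 'cause'], indicator=True)
unmapped = sorted(merged.loc[merged['_merge'] == 'left_only', 'cause'].unique())
if unmapped:
    print("The following causes are not in the cause map:", unmapped)
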
Example #5
def submit_rdp(input_data, this_dataset, is_resubmission):
    ''' Returns full dataset after redistribution.
        Separates data by submission requirement before submitting rdp for
        only those data that require it
    '''
    def submission_requirement(df, uid):
        return needs_rdp(df[df['uid'] == uid], this_dataset)

    def output_file_function(job_id):
        return get_rdp_file(this_dataset,
                            which_file='split_output',
                            splitNum=job_id[2])

    # create a list of the uids that require redistribution and set aside a
    #   dataframe of the uids that do not require redistribution
    rdp_code_location = utils.get_path("redistribution",
                                       base="code_repo",
                                       process="mi_dataset")
    worker_script = rdp_code_location + "/rdp_worker.py"
    output_uids = md.get_uid_cols(7)
    header = "cncRDP_{}_{}".format(this_dataset.dataset_id,
                                   this_dataset.data_type_id)
    rdp_input_file = get_rdp_file(this_dataset, which_file='rdp_input')
    prepped_df = prep_input(input_data, this_dataset)
    submitted_data, unsubmitted_data = cup.split_submission_data(
        prepped_df, 'uid', submission_requirement, rdp_input_file)
    uid_list = submitted_data['uid'].unique().tolist()
    rdp_job_dict = cup.generate_prep_workers(worker_script,
                                             list_of_uids=uid_list,
                                             ds_instance=this_dataset,
                                             job_header=header,
                                             is_resubmission=is_resubmission,
                                             pace_interval=0.05)
    output_files = cup.get_results(rdp_job_dict,
                                   output_file_function,
                                   parent_process_name="rdp",
                                   noisy_checker=is_resubmission,
                                   add_resubmission_argument=is_resubmission,
                                   wait_time=5)
    # Re-combine compiled results with the set-aside data, before collapsing
    #   and testing
    final_results = pe.append_files(output_files)
    final_results = final_results.append(unsubmitted_data)
    # Re-set all 'under 5' data, then collapse to combine it with any existing
    #       'under 5' data
    final_results.loc[final_results['age'].lt(7) |
                      (final_results['age'].gt(90)
                       & final_results['age'].lt(95)), 'age'] = 2
    final_results = dft.collapse(final_results,
                                 by_cols=output_uids,
                                 combine_cols=this_dataset.metric)
    return (final_results)
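
cup.split_submission_data is internal, but its apparent contract (partition a dataframe by a per-uid predicate) can be sketched with plain pandas; the predicate below is an invented stand-in for needs_rdp:

import pandas as pd

df = pd.DataFrame({'uid': [1, 1, 2, 3],
                   'cause': ['C50', 'C99', 'C50', 'ZZZ'],
                   'cases': [5, 1, 4, 2]})

def requires_rdp(group_df):
    # stand-in: any group containing a garbage-like code needs redistribution
    return group_df['cause'].isin(['C99', 'ZZZ']).any()

required = [u for u in df['uid'].unique()
            if requires_rdp(df[df['uid'] == u])]
submitted_data = df[df['uid'].isin(required)]
unsubmitted_data = df[~df['uid'].isin(required)]
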
Example #6
def main(dataset_id, data_type_id, split_num):
    ''' Runs redistribution (rdp) on a single split of the dataset where
        needed, then saves the worker output
    '''
    # Load input
    this_dataset = md.MI_Dataset(dataset_id, 6, data_type_id)
    metric_name = this_dataset.metric
    rdp_input = manager.get_rdp_file(this_dataset, 'rdp_input')
    input_data = pd.read_hdf(rdp_input, 'split_{}'.format(split_num))
    # Redistribute data where possible
    if not manager.needs_rdp(input_data, this_dataset):
        print("    no redistribution needed for ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        save_worker_output(input_data, this_dataset, split_num)
        return (input_data)
    else:
        print("    redistributing ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        # Add maps to enable RDP
        input_data.rename(columns={'uid': 'split_group'}, inplace=True)
        mapped = add_location_hierarchy_info(input_data)
        # RDP cannot run without location metadata, and should not run for hiv
        #   Set aside those data
        skip_rdp_mask = cannot_redistribute(mapped)
        set_aside = mapped.loc[skip_rdp_mask, input_data.columns.tolist()]
        to_redistribute = mapped.loc[~skip_rdp_mask, :]
        # Redistribute remaining data
        if not to_redistribute.empty:
            rdp_results = run_rdp_core(to_redistribute, this_dataset,
                                       split_num)
            # Recombine
            if not set_aside.empty:
                rdp_results = rdp_results.append(set_aside, ignore_index=True)
            to_finalize = rdp_results
        else:
            print("    No data to redistribute. Finalizing.")
            to_finalize = input_data.rename(columns={'cause': 'acause'})
        output_cols = md.get_uid_cols(7)
        to_finalize = cm.correct_causes(to_finalize)
        finalized_df = dft.collapse(to_finalize,
                                    by_cols=output_cols,
                                    stub=metric_name)
        # Check totals (note: because of data precision, data before and after
        #   may not be precisely equivalent)
        diff = finalized_df[metric_name].sum() - input_data[metric_name].sum()
        assert abs(diff/input_data[metric_name].sum()) < 0.01, \
                    "Difference from input after rdp is too large"
        save_worker_output(finalized_df, this_dataset, split_num)
        return (finalized_df)
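
The closing check uses a relative (1%) rather than absolute tolerance because redistribution reshuffles values and draw-level floating-point arithmetic will not reproduce the input total exactly. In isolation, with invented totals:

pre_total = 1000.0      # metric total before rdp
post_total = 999.4      # metric total after rdp
diff = post_total - pre_total
assert abs(diff / pre_total) < 0.01, \
    "Difference from input after rdp is too large"
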
Example #7
def apply_age_spilt_proportions(input_df, frmat_type, wgt_df, uid_cols,
                                metric):
    ''' Combines weights with population to calculate the proportions by which
            combined age groups are to be split, then splits the data by those
            proportions
    '''
    # remove dataset_id if present in dataframe
    split_input = input_df.copy()
    if 'dataset_id' in split_input.columns:
        del split_input['dataset_id']
    # merge with the age format map and get an expanded dataframe with the ages
    #   to be split
    uids_noAge = [u for u in uid_cols if 'age' != u]
    uid_cols = uids_noAge + ['age']
    marked_df = mark_ages_to_be_split(split_input, frmat_type, uid_cols,
                                      metric)
    to_expand = marked_df.loc[marked_df['to_expand'].eq(1), :].copy()
    if len(to_expand) == 0:
        return (split_input)
    # merge with expected values ("weights")
    to_expand.rename(columns={'age': 'split_age', 'gbd_age': 'age'},
                     inplace=True)
    weighted_df = to_expand.merge(wgt_df)
    astest.test_weights(to_expand, weighted_df)
    # calculate proportions
    to_split = pp.add_proportions(weighted_df, uids_noAge + ['split_age'])
    # adjust by proportions
    to_split.loc[:, 'split_value'] = to_split[metric]
    to_split.loc[:, metric] = to_split['proportion'] * to_split['split_value']
    # collapse, then update format types of split data
    recombined_df = to_split.append(
        marked_df.loc[marked_df['to_expand'] == 0, :])
    adjusted_df = dft.collapse(recombined_df,
                               by_cols=uid_cols,
                               func='sum',
                               combine_cols=metric)
    astest.compare_pre_post_split(split_input, adjusted_df, metric)
    adjusted_df.loc[:, 'need_split'] = 1
    pt.verify_metric_total(split_input, adjusted_df, metric,
                           "apply age proportions")
    return (adjusted_df[split_input.columns.values])
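
The proportion step in the middle (pp.add_proportions is internal) amounts to normalizing weights within each aggregate age group and scaling the metric by the result. A self-contained sketch with invented numbers:

import pandas as pd

# one aggregate group (ages 0-9) expanded to two target ages, each row
# carrying the aggregate total plus an expected-value weight
to_split = pd.DataFrame({'location_id': [1, 1],
                         'split_age': ['0-9', '0-9'],
                         'age': ['0-4', '5-9'],
                         'cases': [20.0, 20.0],
                         'wgt': [3.0, 1.0]})
group = ['location_id', 'split_age']
to_split['proportion'] = (to_split['wgt'] /
                          to_split.groupby(group)['wgt'].transform('sum'))
to_split['cases'] = to_split['cases'] * to_split['proportion']
print(to_split[['age', 'cases']])  # 15 and 5; the group total of 20 is preserved
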
Example #8
def calc_prevalence(sequela_framework, mort_df, acause):
    ''' Calculates sequela prevalence by applying sequela durations to the
        merged survival and mortality estimates
    '''
    print("    calculating prevalence...")
    prev_cols = nd.get_columns('prevalence')
    mort_cols = nd.get_columns('mortality')
    surv_uids = nd.nonfatalDataset("survival", acause).uid_cols
    prev_uids = nd.nonfatalDataset("prevalence", acause).uid_cols
    # Create the prevalence estimation frame from the survival and mortality 
    #       frames
    mrg_df = pd.merge(sequela_framework, mort_df)
    df = mrg_df[surv_uids + ['me_tag']].copy()
    # Calculate prevalence of each sequela by multiplying sequela duration
    #     by the number of people surviving for only that duration
    df[prev_cols] = mrg_df[mort_cols].mul(mrg_df['sequela_duration'], axis=0)
    df = dft.collapse(df, combine_cols=prev_cols,
                      by_cols=prev_uids, func='sum')
    df.loc[:, prev_cols] = df[prev_cols] / 12  # convert to years
    assert not df.isnull().any().any(), \
        "Null values found while calculating prevalence for {}".format(acause)
    return(df)
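
The core arithmetic (prevalence as survivors multiplied by the months spent in the sequela, converted to person-years) in isolation, with invented values:

import pandas as pd

df = pd.DataFrame({'sequela_duration': [2.0, 6.0],   # months in the sequela
                   'deaths_0': [100.0, 50.0]})       # one mortality draw
df['prev_0'] = df['deaths_0'] * df['sequela_duration'] / 12  # person-years
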
Example #9
def combine_uid_entries(df,
                        uid_cols,
                        metric_cols,
                        combined_cols=['NID', 'registry_index', 'dataset_id'],
                        collapse_metrics=True):
    ''' Preserves a list of all entries in the combined_cols before collapsing
            by uid_cols to calculate the sum of the metric_cols
        Returns a dataframe collapsed by uid_cols
        -- Inputs
            df : pandas dataframe
            uid_cols : list of uniquely identifying columns for the dataframe
            metric_cols : list of columns containing metric values for each uid
            combined_cols : list of columns whose values are to be combined into
                one tuple per uid
            collapse_metrics : set to False to prevent collapse after re-setting
                combined cols entries
    '''
    # drop requested combined_cols that are absent before validating for nulls
    combined_cols = [c for c in combined_cols if c in df.columns]
    assert not df[uid_cols + combined_cols].isnull().any().any(), \
        "Cannot combine dataframe with null values in uid or combined columns"
    static_cols = [c for c in df.columns if c not in combined_cols]
    combined_entries = df[static_cols].copy()
    for col in combined_cols:
        new_entries = df[uid_cols + [col]].groupby(
            uid_cols,
            as_index=False)[col].agg(lambda c: tuple_unique_entries(c))
        new_entries.loc[:, col] = new_entries[col].astype(str)
        combined_entries = combined_entries.merge(new_entries,
                                                  on=uid_cols,
                                                  how='left')
        assert not combined_entries[col].isnull().any(), \
            "Error combining uids for column {}".format(col)
    if collapse_metrics:
        output = dft.collapse(combined_entries,
                              by_cols=uid_cols + combined_cols,
                              combine_cols=metric_cols,
                              func='sum')
    else:
        output = combined_entries
    return (output)
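
The combine-then-collapse behavior can be condensed with pandas named aggregation; a rough single-column equivalent with invented data:

import pandas as pd

df = pd.DataFrame({'location_id': [1, 1, 2],
                   'NID': [111, 222, 111],
                   'cases': [5, 3, 4]})
combined = df.groupby(['location_id'], as_index=False).agg(
    NID=('NID', lambda c: str(tuple(sorted(set(c))))),
    cases=('cases', 'sum'))
print(combined)  # location 1 keeps NID '(111, 222)' with 8 cases
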
def main(dataset_id, data_type_id):
    ''' Disaggregates uids that are mapped to multiple gbd causes, including
            garbage codes, Kaposi Sarcoma, and non-melanoma skin cancer
    '''
    # prep_step 5 = cause_disaggregation
    this_dataset = md.MI_Dataset(dataset_id, 5, data_type_id)
    input_data = this_dataset.load_input()
    metric = this_dataset.metric
    uid_cols = md.get_uid_cols(5)
    input_data = input_data.loc[
        ~input_data['age'].isin([26, 3, 4, 5, 6, 91, 92, 93, 94]), :]
    # Format and add observation numbers
    formatted_input = prep_for_disagg(input_data.copy(), uid_cols, metric)
    # Disaggregate
    disaggregated_df = core.disaggregate_acause(formatted_input, this_dataset)
    # update uid columns to account for reshaped acause
    uid_cols = [u for u in uid_cols if 'acause' not in u] + ['acause']
    # redistribute Kaposi Sarcoma data across causes
    kaposi_df = core.redist_kaposi(disaggregated_df, metric, uid_cols)
    if data_type_id == 2:
        adjusted_df = core.redist_nmsc_gc(kaposi_df, metric)
    else:
        adjusted_df = kaposi_df
    final_df = core.map_remaining_garbage(adjusted_df, data_type_id)
    # run test functions and save output
    pt.verify_metric_total(input_data, adjusted_df, metric,
                           "cause disaggregation module")
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(6)
    final_df = md.stdz_col_formats(final_df)
    final_df = dft.collapse(final_df,
                            by_cols=output_uids,
                            func='sum',
                            combine_cols=metric)
    # save
    md.complete_prep_step(final_df, this_dataset)
    print("Acause disaggregated")
Example #11
def manage_split(df, metric_name, uid_cols, this_dataset):
    ''' Converts age and sex categories in the df to those used by the cancer prep process.
        1) Adds obs number
        2) Splits aggregated ages
        3) Combines disaggregated ages
        4) Splits unknown age category
        5) Splits aggregated/unknown sex category
    '''
    is_pop = bool(metric_name == "pop")
    # replace missing metric values with zero before splitting
    df[metric_name].fillna(value=0, inplace=True)
    split_df = df.copy()
    # add observation number by group, without age
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    obs_numbers = split_df[uids_noAge].drop_duplicates()
    obs_numbers['obs'] = obs_numbers.reset_index().index
    split_df = split_df.merge(obs_numbers)
    uid_cols.append('obs')
    # generate cause_weights
    if is_pop:
        cause_wgts = pp.gen_pop_wgts("age_wgt", df['location_id'].unique())
    else:
        # create weights used for splitting
        cause_wgts = pp.create_metric_weights(split_df, uid_cols, this_dataset)
        # collapse to get one weight per observation
        cause_wgts = dft.collapse(cause_wgts, by_cols=['obs', 'age', 'sex'],
                                  func='sum', combine_cols='wgt')
    # split
    if pt.has_nonStdAge(split_df):
        print("      splitting non-standard age...")
        split_df = core.split_age(dataset=split_df,
                                  wgt_df=cause_wgts,
                                  metric=metric_name,
                                  uid_cols=uid_cols)
    # redistribute "unknown age" data according to the current distribution of 
    #   cases/deaths
    if pt.has_age_unk(split_df, metric_name):
        print("      splitting unknown age...")
        # create weights
        split_df = core.split_unknown_age(dataset=split_df,
                                          wgt_df=cause_wgts,
                                          metric=metric_name,
                                          uid_cols=uid_cols)
    # check for errors. 
    at.compare_pre_post_split(split_df, df, metric_name)
    # split sex = 3 and sex = 9 data
    if pt.has_combinedSex(split_df):
        print("      splitting sex...")
        if metric_name == "pop":
            sex_split_prop = pp.gen_pop_wgts(
                "sex_wgt", df['location_id'].unique())
        else:
            sex_split_prop = pp.create_sex_weights(cause_wgts,
                                                   uid_vars=['obs', 'age'],
                                                   metric=metric_name)
        split_df = core.split_sex(split_df,
                                  sex_split_prop,
                                  uid_cols,
                                  metric=metric_name)
    # collapse remaining under-5 and 80+ ages
    final_df = split_df.copy(deep=True)
    final_df = final_df.loc[~final_df['age'].isin([26]), :]
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(5, is_pop)
    final_df = dft.collapse(final_df,
                            by_cols=output_uids,
                            func='sum',
                            combine_cols=metric_name
                            )
    # save and exit
    md.complete_prep_step(final_df, this_dataset, is_pop)
    return(None)
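
core.split_unknown_age is internal; conceptually it redistributes the unknown-age total according to the distribution already observed in the known ages. A toy version of that idea (age code 26 is treated as "unknown age" here, matching the rows excluded elsewhere in these examples):

import pandas as pd

df = pd.DataFrame({'age': [2, 7, 9, 26],
                   'cases': [10.0, 60.0, 30.0, 8.0]})
known = df['age'] != 26
unknown_total = df.loc[~known, 'cases'].sum()
share = df.loc[known, 'cases'] / df.loc[known, 'cases'].sum()
df.loc[known, 'cases'] += share * unknown_total
df = df[known]  # drop the redistributed unknown-age row; total is unchanged
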
def apply_procdedure_proportions(df, proportions, acause, metric_name):
    ''' Multiplies estimates by procedure proportions, adding to the dataframe
            a set of estimates for the number of cancer events that do not
            receive the given procedure
    '''
    print("    adjusting to avoid double-counting procedures...")
    # Return if adjustment is unnecessary (if there is no rate id for the cause)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    draw_cols = nd.get_columns("draw_cols")
    type_cols = nd.get_columns(metric_name)
    mrg_cols = [c for c in uid_cols if c != 'me_tag']
    # Subset estimates to the phase wherein procedures occur
    if metric_name == 'prevalence':
        mrg_df = df.loc[df['me_tag'] == "controlled_phase", :].copy()
        del mrg_df['me_tag']
    elif metric_name == 'incidence':
        mrg_df = df.copy()
    # For data where sequela are a fraction of the number of procedures, multiply
    #       the procedure proportion by those fractions
    if metric_name == 'prevalence' and bool(sequelae_fractions(acause)):
        # Generate a dataframe containing the fractions
        fracs = pd.DataFrame().from_dict(sequelae_fractions(acause),
                                         orient='index')
        fracs['acause'] = acause
        fracs = fracs[~fracs['me_tag'].eq("procedure_sequelae")]
        # Merge dataframe with proportions to expand
        proportions['acause'] = acause
        props = proportions.merge(fracs)
        # Adjust proportions by me
        props[draw_cols] = props[draw_cols].multiply(props['fraction'],
                                                     axis='index')
        del props['acause']
    else:
        # Determine the fraction of the population that does not receive the procedure
        props = proportions.copy()
        props['me_tag'] = "adjusted_controlled_phase_a"
    # Apply proportions to estimates
    #   Note: may drop some data if proportions are only for estimation years
    mrg_df = mrg_df.merge(props, on=mrg_cols, how='inner')
    adj_df = mrg_df[uid_cols].copy()
    evnt_wo_proc = pd.DataFrame(mrg_df[type_cols].values *
                                mrg_df[draw_cols].values).fillna(0)
    evnt_wo_proc.columns = type_cols
    adj_df[type_cols] = evnt_wo_proc
    assert not adj_df.isnull().any().any(), \
        "Error calculating procedure proportions"
    # For prevalence, append the adjusted data to the rest of the estimates
    if metric_name == 'prevalence':
        sq_df = dft.collapse(adj_df, mrg_cols,
                             combine_cols=type_cols).sort_values(mrg_cols)
        cntrl_df = df.loc[df['me_tag'].eq("controlled_phase"), :].merge(
            mrg_df[mrg_cols].drop_duplicates(), on=mrg_cols,
            how='inner').sort_values(mrg_cols)
        nosq_df = cntrl_df[mrg_cols].copy()
        no_proc = pd.DataFrame(cntrl_df[type_cols].values -
                               sq_df[type_cols].values)
        no_proc.columns = type_cols
        nosq_df[type_cols] = no_proc
        nosq_df['me_tag'] = "adjusted_controlled_phase"
        adj_df = adj_df.append(nosq_df)
        output_data = df.append(adj_df)
    # Incidence of cancers with the procedure is estimated elsewhere, so there
    #      is no need to preserve the unadjusted data
    else:
        output_data = adj_df
    return (output_data[uid_cols + type_cols])
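
The adjustment itself is an elementwise multiplication of estimate draws by proportion draws after an inner merge on the shared uids. Stripped to one draw with invented values:

import pandas as pd

estimates = pd.DataFrame({'location_id': [1], 'year_id': [2010],
                          'inc_0': [500.0]})
proportions = pd.DataFrame({'location_id': [1], 'year_id': [2010],
                            'draw_0': [0.8]})  # share not receiving the procedure
merged = estimates.merge(proportions,
                         on=['location_id', 'year_id'], how='inner')
merged['inc_0'] = merged['inc_0'] * merged['draw_0']  # events w/o the procedure
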