Example #1
0
def main(dataset_id, data_type_id):
    ''' Maps data with the following steps:
            1) imports input data
            2) runs mapping function
            3) expands icd codes to fill garbage acause(n) where necessary
            4) applies sex and cause restrictions
    '''
    # prep step 3 = mapping
    this_dataset = md.MI_Dataset(dataset_id, 3, data_type_id)
    input_data = this_dataset.load_input().reset_index(drop=True)
    # Backfill im_frmat_id where missing for rows whose frmat_id is 9 so the
    #   two format columns stay consistent downstream
    input_data.loc[input_data['im_frmat_id'].isnull()
                   & input_data['frmat_id'].eq(9), 'im_frmat_id'] = 9
    df = md.stdz_col_formats(input_data)
    #  Ensure that there is no "all age" data. Remove this line after updating
    #   the preceding steps to stop using the old cancer age formats
    df = df.loc[df['age'] != 1, :]
    # Map data and apply restrictions
    mapped_df = map_data(df, this_dataset)
    restricted_df = cm.restrict_causes(mapped_df,
                                       cause_col='gbd_cause',
                                       data_type_id=data_type_id,
                                       restrict_age=False)
    md.complete_prep_step(restricted_df, this_dataset)
    print("mapping complete.\n")
Example #2
0
def main(dataset_id, data_type_id):
    ''' Runs the age/sex splitting prep step for one dataset:
            loads the dataset (prep step 4), splits population, then
            splits the metric values
    '''
    # load input dataset
    dataset = md.MI_Dataset(dataset_id, 4, data_type_id)
    # Split population first, then the metric itself
    run_split_pop(dataset)
    run_split_metric(dataset)
    print("Age and Sex are Split.")
Example #3
0
def main(dataset_id, data_type_id, split_num):
    ''' Runs redistribution (rdp) for one split of a dataset.

        Loads the rdp_input file for the given split. If redistribution is
        not needed, saves the input as-is. Otherwise redistributes the data
        that has location metadata (setting aside data that cannot be
        redistributed, e.g. hiv), recombines, collapses to the output uids,
        verifies the total, and saves the worker output.
    '''
    # Load input
    this_dataset = md.MI_Dataset(dataset_id, 6, data_type_id)
    metric_name = this_dataset.metric
    rdp_input = manager.get_rdp_file(this_dataset, 'rdp_input')
    input_data = pd.read_hdf(rdp_input, 'split_{}'.format(split_num))

    # rename sex_id until rdp packages names are updated
    input_data.rename(columns={'sex_id': 'sex'}, inplace=True)

    # Redistribute data where possible
    if not manager.needs_rdp(input_data, this_dataset):
        print("    no redistribution needed for ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        save_worker_output(input_data, this_dataset, split_num)
        return (input_data)
    else:
        print("    redistributing ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        # Add maps to enable RDP
        input_data.rename(columns={'uid': 'split_group'}, inplace=True)
        mapped = add_location_hierarchy_info(input_data)
        # RDP cannot run without location metadata, and should not run for hiv
        #   Set aside those data
        skip_rdp_mask = cannot_redistribute(mapped)
        set_aside = mapped.loc[skip_rdp_mask, input_data.columns.tolist()]
        to_redistribute = mapped.loc[~skip_rdp_mask, :]
        # Redistribute remaining data. Use .empty for the presence test:
        #   .any().any() would return False for all-zero rows even when
        #   rows exist
        if not to_redistribute.empty:
            rdp_results = run_rdp_core(to_redistribute, this_dataset,
                                       split_num)
            # Recombine (concat: DataFrame.append was removed in pandas 2.0)
            if not set_aside.empty:
                rdp_results = pd.concat([rdp_results, set_aside],
                                        ignore_index=True)
            to_finalize = rdp_results
        else:
            print("    No data to redistribute. Finalizing.")
            to_finalize = input_data.rename(columns={'cause': 'acause'})
        output_cols = md.get_uid_cols(7)
        # rename sex_id until rdp packages get updated
        output_cols = ['sex' if x == 'sex_id' else x for x in output_cols]

        to_finalize = cm.correct_causes(to_finalize)
        finalized_df = dft.collapse(to_finalize,
                                    by_cols=output_cols,
                                    stub=metric_name)
        # Check totals (note: because of data precision, data before and after
        #   may not be precisely equivalent)
        # NOTE(review): a relative tolerance of 5 allows a 500% difference —
        #   confirm whether 0.05 was intended
        diff = finalized_df[metric_name].sum() - input_data[metric_name].sum()
        assert abs(diff/input_data[metric_name].sum()) < 5, \
                    "Difference from input after rdp is too large"
        save_worker_output(finalized_df, this_dataset, split_num)
        return (finalized_df)
Example #4
0
def main(dataset_id, data_type_id, is_resubmission):
    ''' Tests data for existence of subtotals and recalculates as possible
            and necessary
    '''
    print(utils.display_timestamp())
    this_dataset = md.MI_Dataset(dataset_id, 2, data_type_id)
    df = reformat_input(this_dataset.load_input(), this_dataset)
    cleaned_input = remove_allCancer(df)

    # ICCC code and subcode removals GBD2019
    if 'ICCC3' in cleaned_input['coding_system'].unique():
        iccc_recoded = cleaned_input.copy()
        iccc_codes = get_iccc_code_sets()
        for main_code in iccc_codes.keys():
            iccc_recoded = remove_ICCC_codes(
                iccc_recoded, iccc_codes[main_code], main_code, data_type_id)
        # quick fix for subcodes with 1, 2, 3.. at the end: keep only the
        #   purely-alphabetic ICCC3 causes, which drops those sub-subcodes
        iccc_kept = iccc_recoded[(iccc_recoded['coding_system'].eq("ICCC3"))
                                 & (iccc_recoded['cause'].str.isalpha())]
        orig_code = iccc_recoded[~iccc_recoded['coding_system'].eq("ICCC3")]
        cleaned_input = pd.concat([orig_code, iccc_kept])

    if not has_subtotals(cleaned_input, 'cause'):
        md.complete_prep_step(cleaned_input, this_dataset)
        print("subtotal_recalculation complete")
        return(None)
    else:
        non_icd = cleaned_input[~cleaned_input.coding_system.eq("ICD10")]
        icd_total = cleaned_input[cleaned_input.coding_system.eq("ICD10")]
        # Attach cause components to the cleaned input
        calc_df = icd_total.rename(columns={'cause': 'orig_cause'})
        sub_causes = code_components.run(calc_df)
        calc_df = calc_df.merge(sub_causes)
        assert len(calc_df) > len(icd_total), \
            "Error during merge with subcauses"
        # Test until a uid is discovered that requires recalculation
        #   (any() short-circuits on the first uid with components present)
        print("Verifying whether data can be recalculated...")
        any_components = any(
            components_present(calc_df[calc_df['uniqid'].eq(u)])
            for u in calc_df['uniqid'].unique())
        # Run recalculation only if necessary. Otherwise, output the cleaned data
        if any_components:
            # keep track of datasets with subtotal recalculation needed
            result_df = submit_sr(calc_df, this_dataset)
            subtotals_recalculated = validate_sr(result_df, this_dataset)
            subtotals_recalculated = pd.concat(
                [non_icd, subtotals_recalculated])
        else:
            print("No recalculation necessary.")
            subtotals_recalculated = cleaned_input

        md.complete_prep_step(subtotals_recalculated, this_dataset)
        print(utils.display_timestamp())
        print("Subtotal_recalculation complete.")
        return(None)
Example #5
0
def main(dataset_id, data_type_id):
    ''' Runs cause disaggregation (prep step 5) for one dataset:
            disaggregates acause, redistributes kaposi sarcoma (plus nmsc
            garbage when data_type_id == 2), maps remaining garbage, then
            collapses to the output uids and saves
    '''
    # prep_step 5 = cause_disaggregation
    dataset = md.MI_Dataset(dataset_id, 5, data_type_id)
    raw_input = dataset.load_input()
    metric_name = dataset.metric
    uids = md.get_uid_cols(5)
    # drop aggregate/unknown age groups
    excluded_ages = [26, 3, 4, 5, 6, 91, 92, 93, 94]
    raw_input = raw_input.loc[~raw_input['age'].isin(excluded_ages), :]
    # Format and add observation numbers
    formatted = prep_for_disagg(raw_input.copy(), uids, metric_name)
    # Disaggregate
    disaggregated = core.disaggregate_acause(formatted, dataset)
    # update uid columns to account for reshaped acause
    uids = [col for col in uids if 'acause' not in col] + ['acause']
    # redistribute kaposi sarcoma; nmsc garbage applies only when
    #   data_type_id == 2
    kaposi = core.redist_kaposi(disaggregated, metric_name, uids)
    adjusted = (core.redist_nmsc_gc(kaposi, metric_name)
                if data_type_id == 2 else kaposi)
    final_df = core.map_remaining_garbage(adjusted, data_type_id)
    # run test functions and save output
    pt.verify_metric_total(raw_input, adjusted,
                           metric_name, "cause disaggregation module")
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(6)
    final_df = md.stdz_col_formats(final_df)
    final_df = dft.collapse(final_df,
                            by_cols=output_uids,
                            func='sum',
                            combine_cols=metric_name)
    # save
    md.complete_prep_step(final_df, dataset)
    print("Acause disaggregated")
Example #6
0
def run(dataset_id, data_type_id, uid):
    ''' Preps data for recalculation then recalculates as necessary

        For a single uid ("split group"): builds a code/subcode map from the
        uid's data, iteratively removes subcause totals from their parent
        causes, then writes a csv containing the recalculated values next to
        the original values. Exits early if the output file already exists.
    '''
    this_dataset = md.MI_Dataset(dataset_id, 2, data_type_id)
    dataset_name = this_dataset.name
    metric = this_dataset.metric
    input_file = run_sr.get_sr_file(this_dataset, "sr_input")
    # Exit if output already exists
    output_file = run_sr.get_sr_file(this_dataset, 'split_output', uid)
    print(output_file)
    if os.path.exists(output_file):
        print("     output file found for uid " + str(uid))
        return (None)
    # Paths used to log problem data encountered during recalculation
    # NOTE(review): negative_data_ok is never used in this function —
    #   confirm whether is_exception() is needed here before removing
    negative_data_ok = is_exception(dataset_id, data_type_id)
    error_folder = utils.get_path("mi_input", base_folder='j_temp')
    subcause_issue_file = '{}/subcause_issue/{}_{}_uid_{}.txt'.format(
        error_folder, dataset_name, data_type_id, uid)
    exception_file = '{}/negative_data/{}_{}_uid_{}.csv'.format(
        error_folder, dataset_name, data_type_id, uid)
    for d in [subcause_issue_file, exception_file, error_folder]:
        utils.ensure_dir(d)
    #
    print("    removing subtotals from uid {}...".format(uid))
    # add data for the given uid
    df = pd.read_hdf(input_file, 'split_{}'.format(uid))
    # Create a list of possible codes so that decimal subcauses are only added
    #   if available
    input_cause_list = sorted(df['orig_cause'].unique().tolist())
    # create a dictionary for codes in the selected uid and attach the uid's
    #   data
    uid_subset = {}
    input_data = {}
    # process decimals first and ranges last to ensure that nested causes are
    #   removed
    for c in sorted(df['orig_cause'].unique().tolist()):
        uid_subset[c] = {}
        input_data[c] = {}
        uid_subset[c]['codes'] = []
        uid_subset[c]['subcodes'] = []
        if "-" not in c and "," not in c:
            # single code (not a range "X-Y" or list "X,Y" aggregate)
            uid_subset[c]['codes'].append(c)
            # add subcodes to 'subcode' key
            for subcode in sorted(df['cause'].where(
                    df['orig_cause'] == c).dropna().unique().tolist()):
                if subcode != c:
                    uid_subset[c]['subcodes'].append(subcode)
            # if none of the subcodes appear in the list, set the cause as a
            #   subcode of itself (prevents the addition of unused decimal
            #   causes)
            if not len(uid_subset[c]['subcodes']):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
            elif (not any('{}.'.format(sub[:3]) in check
                          for check in input_cause_list
                          for sub in uid_subset[c]['subcodes'])):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
        else:
            # aggregate cause: each member code is both a code and a subcode
            for code in sorted(df['cause'].where(
                    df['orig_cause'].eq(c)).dropna().unique().tolist()):
                uid_subset[c]['codes'].append(code)
                uid_subset[c]['subcodes'].append(code)
        # create other lists associated with the cause and add the metric data
        uid_subset[c]['subcauses_remaining'] = []
        uid_subset[c]['codes_removed'] = []
        uid_subset[c]['causes_removed'] = []
        uid_subset[c]['data'] = df.loc[df['cause'].eq(c),
                                       ['age', metric]].set_index('age')
        input_data[c]['data'] = uid_subset[c]['data']
        input_data[c]['codes'] = uid_subset[c]['codes']
    # Determine subcauses and highest number of causes remaining (how many
    #   subcauses are contained within each cause)
    uid_set = set_subcauses(uid_subset, subcause_issue_file)
    highest_level = determine_highest_level(uid_set)
    # remove lowest level codes from parent causes
    if highest_level == 0:
        print('     no subcauses present.')
    else:
        subcauses_removed = True
        while subcauses_removed:
            uid_set, subcauses_removed = remove_subcauses(
                uid_set, uid, exception_file)
            # remove duplicates
            uid_set = remove_duplicates(uid_set)
            # re-set subcauses and num_subcause_remaining
            uid_set, highest_level = set_subcauses(
                uid_set,
                subcause_issue_file,
            )
            print("     subcauses removed.")
    # Prepare Output
    print("saving output...")
    output = pd.DataFrame(
        columns=['cause', 'codes_remaining', 'codes_removed', 'age', metric])
    # accumulate per-cause frames and concatenate once at the end
    #   (DataFrame.append was removed in pandas 2.0; repeated append is
    #   also quadratic)
    output_pieces = [output]
    for c in uid_set:
        # format cause information
        cause_data = pd.DataFrame(
            columns=['cause', 'codes_remaining', 'codes_removed'])
        cause_data.loc[0, ['cause']] = c
        # if nothing was removed, or there was only a single cause, or all of
        #   the input codes are still present, set the codes remaining as the
        #   cause
        if (not len(uid_set[c]['codes_removed'])
                or ("-" not in c and "," not in c)
                or set(input_data[c]['codes']) <= set(uid_set[c]['codes'])):
            cause_data.loc[0, ['codes_remaining']] = c
        else:
            cause_data.loc[0, ['codes_remaining']] = ','.join(
                convert_to_range(uid_set[c]['codes']))
        cause_data.loc[0, ['codes_removed']] = ','.join(
            convert_to_range(uid_set[c]['codes_removed']))
        # format output data
        output_data = uid_set[c]['data']
        output_data['age'] = output_data.index
        output_data['cause'] = c
        orig_data = input_data[c]['data']
        orig_data['age'] = orig_data.index
        orig_data = orig_data.rename(columns={metric: 'orig_metric_value'})
        orig_data['cause'] = c
        # combine and add to output
        final = pd.merge(output_data, cause_data, on='cause')
        final = pd.merge(final, orig_data, on=['cause', 'age'])
        output_pieces.append(final)
    # Create output dataset
    output = pd.concat(output_pieces)
    output['uniqid'] = uid
    # Update encoding (bug fix to work around pandas to_stata issue)
    output = md.stdz_col_formats(output, additional_float_stubs='uniqid')
    # Export results
    output.to_csv(output_file, index=False)
    print('\n Done!')
    time.sleep(1)
    return (None)