def run_neonatal_split():
    ''' Splits the aggregate 0-4 age group of the mortality estimates into
        neonatal/young-child age groups, then recombines with the other ages
    '''
    print('running neonatal splitting...')
    # load input dataset
    this_dataset = pd.read_csv("{}/mortality_estimates.csv".format(
        utils.get_path("mortality_model", base_folder="storage")))

    # grabbing cancer age column temporarily
    this_dataset = gct.cancer_age_from_age_group_id(this_dataset)
    this_dataset.drop("age_group_id", axis=1, inplace=True)
    this_dataset.rename(columns={"dataset_id": "dataset_ids"}, inplace=True)
    old_df = this_dataset.copy()
    old_df = old_df[~old_df['age'].isin([2])]
    this_dataset = this_dataset[this_dataset.age.isin([2])]
    # Split 0-4
    death_df = run_split_metric(
        this_dataset[get_uid_columns() + ['deaths', 'pop']], 'deaths')
    pop_df = run_split_pop(this_dataset[get_uid_columns() + ['pop']], 'pop')
    # combine non-split data to death and population dataframe
    df_metric_final = combine_ages(old_df, death_df, 'deaths')
    df_pop_final = combine_ages(old_df, pop_df, 'pop')
    del df_metric_final['pop']  # remove pop from split_metric dataframe
    merge_cols = get_uid_columns()
    merge_cols.remove('age')
    merge_cols = merge_cols + ['age_group_id']
    df_final = pd.merge(df_metric_final,
                        df_pop_final,
                        how='inner',
                        on=merge_cols)
    # save file
    df_final.to_csv("{}/split_mortality_estimates.csv".format(
        utils.get_path("mortality_model", base_folder="storage")))
    print("Neo Ages are Split.")
    return (df_final)
def compile_nonzero_floor(self, cmdf):
    '''
    For GBD2019, new floor values were generated for cancer causes that had
    updated age restrictions or were newly modeled causes. This function takes
    the original nonzero floor values and appends all updated values
    '''
    work_dir = utils.get_path(process='cod_mortality',
                              key='nonzero_floor_workspace')
    orig_nzf = pd.read_csv(
        utils.get_path(process='cod_mortality', key='orig_nonzero_file'))

    # convert age_group_ids to comply with GBD's
    formatted_orig_nzf = self.convert_nonzero_mad(orig_nzf, cmdf)

    # load nonzero floor values for causes with new age restrictions and for
    # causes that are new for this GBD cycle
    new_age_rstrct_df = pd.read_csv(
        '{}/nonzero_floor_new_age_restrictions.csv'.format(work_dir))
    new_causes_df = pd.read_csv(
        '{}/nonzero_new_causes.csv'.format(work_dir))

    # append all nonzero values together
    comp_nzf = formatted_orig_nzf.append(new_age_rstrct_df)
    comp_nzf = comp_nzf.append(new_causes_df)

    return comp_nzf
def run_noise_reduction_create_metrics(): 
    ''' Noise reduces datapoints based on the previously generated prior
    '''
    prior_dir = utils.get_path(process='cod_mortality',key='NR_prior')
    prior_df = pd.read_csv(prior_dir)
    del prior_df['sample_size'] # remove sample_size 
    prior_df_cc = format_CoD_input.refine_by_cc_code(prior_df) 
    prior_df_cc.to_csv(prior_dir)   
    print('Creating noise reduction posterior...')
    script_path = utils.get_path(process='cod_mortality', key='noise_reduction_posterior')
    subprocess.call(['Rscript', script_path], shell=False)
def project_data():
    ''' Runs pipeline to project previously-combined incidence data to IHME
        demographic specifications
        Requires incidence data that are unique by location_id-year-sex-age-acause
    '''
    input_file = utils.get_path("combined_incidence", process="cod_mortality")
    output_file = utils.get_path("projected_incidence",
                                 process="cod_mortality")
    df = pd.read_csv(input_file)
    df = project_to_special_locations(df)
    df = project_ihme_location_estimates(df)
    df.to_csv(output_file, index=False)
    print("incidence data projected")
def run():
    ''' Runs pipeline to combine previously-selected incidence data
        Requires incidence data that are unique by location_id-year-sex-age-acause
    '''
    output_file = utils.get_path("combined_incidence", process="cod_mortality")
    input_file = utils.get_path("prepped_incidence", process="cod_mortality")
    utils.ensure_dir(output_file)
    df = pd.read_csv(input_file)
    print("Combining data to one single entry per uid...")
    df = combine_incidence(df)
    df.to_csv(output_file, index=False)
    return (df)
def run_noise_reduction_prior(): 
    ''' calculates CC_code then runs noise reduction 
    '''
    neonat_dir = utils.get_path(process='cod_mortality', key='neonatal_split')
    df = pd.read_csv(neonat_dir)
    split_cc_df = format_CoD_input.refine_by_cc_code(df) 
    split_cc_dir = utils.get_path(process='cod_mortality',key='mortality_split_cc')
    split_cc_df.to_csv(split_cc_dir)
    print('creating noise reduction prior...')
    script_path = utils.get_path(process='cod_mortality', key='noise_reduction_prior')
    subprocess.call(['Rscript', script_path], shell=False)
    return 
def get_age_frmat_map(frmat_type):
    ''' Loads the age-format map resource for the given format type
        ("im_frmat_id" for infant age formats, "frmat_id" for standard formats)
    '''
    if frmat_type == "im_frmat_id":
        resource = pd.read_csv(utils.get_path(
            'im_frmat_map', process="mi_dataset"))
    elif frmat_type == "frmat_id":
        resource = pd.read_csv(utils.get_path(
            'frmat_map', process="mi_dataset"))
    resource = md.stdz_col_formats(
        resource, additional_float_stubs=['age_specific', 'age_split'])
    return(resource)
def project_incidence():
    ''' For each IHME location_id, projects estimates based on the input cancer
        rates
        Includes generation of national estimates from subnational estimates
        where national estimates are not present 
    '''
    print("   projecting data to ihme demographic specifications...")
    output_file = utils.get_path("projected_incidence",
                                 process="cod_mortality")
    input_file = utils.get_path("combined_incidence", process="cod_mortality")
    pop_uids = [c for c in get_uid_columns() if c != 'acause']
    df = pd.read_csv(input_file)
    # define subset that can be projected to the IHME population
    df = modeled_locations.add_subnational_status(df)
    df = supplement_national_estimates(df)
    # Ensure validity of sdi_quintile
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    # Calculate rate of input
    df.loc[:, 'rate'] = df['cases'] / df['pop']
    df['registry_pop'] = df['pop']
    # Mark data to be projected
    project_to_ihme = (df['sdi_quintile'].eq(5))
    df_sdi5 = df.loc[project_to_ihme, :].copy()
    df_other = df.loc[~project_to_ihme, :].copy()
    # Add IHME population to applicable uids
    del df_sdi5['pop']
    ihme_pop = load_ihme_pop(
        list(df.loc[project_to_ihme, 'location_id'].unique()))
    df_sdi5 = df_sdi5.merge(ihme_pop)
    # Homogenize population by group where not applying IHME populations
    df_other = staging_functions.homogenize_pop(df_other, uid_cols=pop_uids)
    output = df_other.append(df_sdi5)
    # reindex to allow multiplying series
    # create new column index, then set that as the new index
    output['index'] = np.arange(len(output))
    output = output.set_index('index')
    # Broadcast rates to the final population estimate for all locations
    output.loc[output['pop'].notnull() & output['rate'].notnull()
               & ~output['rate'].eq(np.inf),
               'cases'] = output['rate'] * output['pop']
    # Drop registry-specific tags
    output = output.drop([
        'national_registry', 'full_coverage', 'is_subnational', 'registry_pop'
    ],
                         axis=1,
                         errors='ignore')
    assert not output.loc[output.duplicated(get_uid_columns()), :].any().any(), \
        "Duplicates exist after projection"
    assert not output['pop'].isnull().any(), "Missing population data"
    assert len(output) == len(df), "Error during estimate projection"
    output.to_csv(output_file, index=False)
    print("   data projected.")
    return (output)
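# A minimal illustrative sketch of the rate-broadcast step in project_incidence
# above: registry rates are re-scaled to the target population, so projected
# cases = (cases / registry_pop) * pop. The demo column subset and values below
# are assumptions for illustration only, not pipeline data.
import pandas as pd

_demo = pd.DataFrame({
    'cases': [50.0],        # cases observed by the registry
    'registry_pop': [1e5],  # population covered by the registry
    'pop': [4e5],           # population being projected to (e.g. IHME estimate)
})
_demo['rate'] = _demo['cases'] / _demo['registry_pop']
_demo['projected_cases'] = _demo['rate'] * _demo['pop']   # -> 200.0 expected cases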
def save_splits(modelable_entity_id, cnf_model_version_id):
    ''' Launch jobs to upload each of the "split" modelable entities, generated 
            by splitting the parent modelable entity
    '''
    def output_file_func(id):
        return (nd.nonfatalDataset("upload", id).get_output_file('upload'))

    parent_me = modelable_entity_id
    work_dir = get_work_dir(parent_me, cnf_model_version_id)
    this_step = nd.nonfatalDataset("split", parent_me)
    success_file = this_step.get_output_file('upload')
    children_mes, skip_mes, me_tag = get_me_info(modelable_entity_id,
                                                 parent_me)
    measures = get_measures(me_tag)
    save_worker = utils.get_path('save_epi_worker', process='cancer_model')
    # Generate a list of arguments (one for each child me)
    description = "{}_run_{}".format(me_tag, cnf_model_version_id)
    save_args_template = ("--meid {meid} --meas_id {meas} --indir {input_dir}"
                          " --cnf_run_id {cnf_rid} --desc {desc}")
    save_arg_list = []
    for cm in children_mes:
        save_arg_list += [
            save_args_template.format(meid=cm,
                                      meas=" ".join([str(m)
                                                     for m in measures]),
                                      desc=description,
                                      input_dir="{}/{}".format(work_dir, cm),
                                      cnf_rid=cnf_model_version_id)
        ]
    # Start jobs
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header="lvr_save_epi",
                                         memory_request=90,
                                         id_list=children_mes,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()
    # Check for results
    job_description = str(modelable_entity_id) + " split upload"
    success_df = cluster_tools.wait_for_results(
        job_dict,
        jobs_description=job_description,
        noisy_checker=False,
        output_file_function=output_file_func,
        max_minutes=30)
    success = cluster_tools.validate_success(success_df, job_description)
    if success:
        success_df.to_csv(success_file, index=False)
        return (True)
    else:
        print("Error during split")
        return (False)
def run(cod_mortality_version_id):
    ''' Runs drop_and_check, then prepares output for cod_mortality model pipeline
    '''
    drop_and_check = utils.get_path("drop_and_check_cod",
                                    process="cod_mortality")
    selected_incidence = utils.get_path("selected_incidence",
                                        process="cod_mortality")
    prepped_incidence = utils.get_path("prepped_incidence",
                                       process="cod_mortality")
    print("Selecting incidence data...")
    inc_id = gv.get_inc_version(cod_mortality_version_id)
    #pydo.do_stata(drop_and_check, arg_list=[inc_id])
    print("Formatting incidence for pipeline...")
    df = pd.read_stata(selected_incidence)
    df = format_incidence_input(df)
    df = add_required_columns(df)
    df = update_populations(df)
    df = restrict(df)
    df.to_csv(prepped_incidence, index=False)
    return (df)
def generate_estimates(acause,
                       location_id,
                       cnf_model_version_id,
                       faux_correct,
                       is_resubmission=False):
    ''' Runs a subprocess that passes all arguments to the R script that 
            calculates incidence draws
    '''
    print('Beginning incidence estimation...')
    r_script = utils.get_path("calculate_incidence", process="nonfatal_model")
    cmd = "bash {shl} {scr} {ac} {l} {id} {fc} {rs}".format(
        shl=utils.get_path("r_shell"),
        scr=r_script,
        ac=acause,
        l=location_id,
        id=cnf_model_version_id,
        fc=faux_correct,
        rs=is_resubmission)
    subprocess.call(cmd, shell=True)
    return (True)
def generate_code_index(input_codes):
    ''' Returns an index of all possible ICD10 codes with attached number 
            indicating order of appearance and tag for viability. "Viable" tag
            indicates either an official code or an unofficial code that exists 
            in the data
    '''
    if not isinstance(input_codes, tuple):
        input_codes = tuple(input_codes)
    # Import list of ICD10 codes and define code index
    code_list_path = (
        utils.get_path('mi_input') + "/_resources/" +
        "subtotal_recalculation/list_of_official_NUMERIC_ICD10_codes.csv")
    ICD10_code_list = pd.read_csv(code_list_path)
    ICD10_code_list.sort_values(by=['ICD10_code'], inplace=True)
    ICD10_code_list = tuple(ICD10_code_list['ICD10_code'])
    ICD10_code_index = {}
    order_num = 1
    for k in ['C', 'D']:
        under_10_alternate = [
            '00', '01', '02', '03', '04', '05', '06', '07', '08', '09'
        ]

        for o in under_10_alternate + list(range(10, 100)):
            kode = '{}{}'.format(k, o)
            ICD10_code_index[kode] = {}
            ICD10_code_index[kode]['order'] = order_num
            if kode in ICD10_code_list or kode in input_codes:
                ICD10_code_index[kode]['viable'] = True
            else:
                ICD10_code_index[kode]['viable'] = False
            order_num += 1

            for d in range(0, 10):
                kode = '{}{}.{}'.format(k, o, d)
                ICD10_code_index[kode] = {}
                ICD10_code_index[kode]['order'] = order_num
                if kode in ICD10_code_list or kode in input_codes:
                    ICD10_code_index[kode]['viable'] = True
                else:
                    ICD10_code_index[kode]['viable'] = False
                order_num += 1

                for e in range(0, 10):
                    kode = '{}{}.{}{}'.format(k, o, d, e)
                    ICD10_code_index[kode] = {}
                    ICD10_code_index[kode]['order'] = order_num
                    if kode in ICD10_code_list or kode in input_codes:
                        ICD10_code_index[kode]['viable'] = True
                    else:
                        ICD10_code_index[kode]['viable'] = False
                    order_num += 1
    return (ICD10_code_index)
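# An illustrative sketch of the structure returned by generate_code_index: a dict
# keyed by ICD10 code string, each entry recording the code's position in the
# C00-D99 traversal and whether it is viable (official or observed in the data).
# The specific codes, order numbers, and viability flags below are made up.
_example_code_index = {
    'C34':    {'order': 341, 'viable': True},   # e.g. an official three-character code
    'C34.1':  {'order': 342, 'viable': True},   # e.g. a decimal subcode present in the input
    'C34.12': {'order': 343, 'viable': False},  # e.g. a generated code neither official nor observed
}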
def load_package_set(df):
    ''' loads the package set linked to the coding system
    '''
    code_version = df.coding_system.unique()[0]
    params_dir = utils.get_path('mi_dataset_resources', process="mi_dataset")
    package_path = params_dir + '/redistribution/packagesets_{}.dta'.format(
        code_version)
    package = pd.read_stata(package_path)
    assert len(package) == 1, "Incorrect number of source labels in "\
        "packagesets_{}. Expected 1, got {}. Redistribution failed."\
        .format(code_version, len(package))
    return (package.package_set_id.unique()[0])
def run(input_data, PACKAGE_MAP, TEMP_FOLDER):
    ''' Redistributes garbage-coded data in input_data across valid causes
        using the redistribution packages indicated by PACKAGE_MAP
    '''
    if int(input_data['freq'].sum()) == 0:
        print("Data sums to zero.")
        return (input_data)

    else:
        input_data.loc[input_data['cause'] == "neo_other_cancer",
                       'cause'] = "neo_other"

    output_cols = [
        'registry_index', 'year_start', 'year_end', 'year_id', 'sex',
        'coding_system', 'split_group', 'age', 'cause', 'freq'
    ]
    proportion_uids = [
        'dev_status', 'super_region', 'region', 'subnational_level1',
        'subnational_level2', 'country', 'location_id', 'registry_index',
        'year_start', 'year_end', 'year_id', 'sex', 'coding_system',
        'split_group', 'age'
    ]
    residual_cause = 'ZZZ'
    resources_dir = utils.get_path("mi_dataset_resources",
                                   process="mi_dataset")
    package_folder = '{}/redistribution/{}'.format(resources_dir, PACKAGE_MAP)
    cause_map_file = package_folder + '/_package_map.csv'
    cause_map = pd.read_csv(cause_map_file)
    packages = get_packages(input_data, package_folder, cause_map)
    if len(packages) == 0:
        print("No packages available with which to redistribute this data.")
        return (input_data)

    start_time = time.time()
    prepped_data, proportion_metadata = prep_data(
        input_data, proportion_uids, residual_cause=residual_cause)
    evaluated_cause_map = evaluate_cause_restrictions(cause_map,
                                                      proportion_metadata,
                                                      proportion_uids)
    result, diagnostics = apply_packages(prepped_data,
                                         proportion_metadata,
                                         evaluated_cause_map,
                                         packages,
                                         residual_cause=residual_cause)
    output_data = result.merge(proportion_metadata, on='proportion_id')

    output_data.loc[output_data['cause'] == "neo_other",
                    'cause'] = "neo_other_cancer"

    output_data = output_data.loc[:, output_cols]
    output_data['freq'] = output_data['freq'].fillna(0)
    diff = output_data['freq'].sum() - input_data['freq'].sum()
    assert abs(diff) < 1, "Difference from input after rdp is too large"
    return (output_data)
def run():
    ''' Finalizes data for CoD prep, then runs CoD prep's format code
    '''
    finalized_file = utils.get_path("formatted_CoD_data",
                                    process="cod_mortality")
    CoD_format_script = utils.get_path("finalize_CoD_input",
                                       process="cod_mortality")
    input_file = utils.get_path("nonzero_floor", process="cod_mortality")
    df = pd.read_csv(input_file)
    # Ensure that there is a single entry by uid (data will not be collapsed
    #   after this point)
    assert not df[df.duplicated(get_uid_cols())].any().any(), \
        "Duplicate values present at input"
    del df['dataset_id']
    df.rename(columns={'dataset_ids': 'dataset_id'}, inplace=True)
    df = add_subdiv(df)
    df = add_CoD_variables(df)
    df = format_CoD_variables(df)
    df = test_output(df)
    df.to_csv(finalized_file, index=False)
    return (df)
def get_work_dir(parent_meid, cnf_model_version_id):
    ''' Returns the work directory for the current split
        -- Inputs
            parent_meid : modelable_entity_id of the parent modelable_entity 
                that is being split
            cnf_model_version_id : the run_id of the cancer nonfatal model that is
                to be split
    '''
    work_dir = "{root}/cnf_run_{model}/{parent}".format(
        root=utils.get_path('epi_splits', process='cancer_model'),
        model=cnf_model_version_id,
        parent=parent_meid)
    return (work_dir)
def run(cod_mortality_version_id):
    ''' Merges incidence and MIR estimates to generate the mortality estimate
            output
        Arguments:
            cod_mortality_version_id : int
                - table id that refers to cod_mortality_version table in Cancer's
                database. This ID helps select the correct mir_model_version and
                staged_incidence_version
    '''
    output_file = utils.get_path("mortality_estimates",
                                 process='cod_mortality')
    input_file = utils.get_path("projected_incidence", process="cod_mortality")
    df = pd.read_csv(input_file)

    # temporarily drop exception locations. this needs to occur in a better spot!!!
    drop_loc_ids = [332, 338, 339, 363, 370, 387]
    df = df.loc[~df['location_id'].isin(drop_loc_ids), ]
    df = calculate_mortality(df, cod_mortality_version_id)
    df = apply_recode(df)
    # Validate and save
    df.to_csv(output_file, index=False)
    print("    deaths calculated and recoded.")
    return (df)
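# A minimal sketch of the MIR-based calculation assumed to happen inside
# calculate_mortality: a mortality-to-incidence ratio estimate is merged onto
# incidence by uid and deaths = cases * mir. Column names and values here are
# illustrative assumptions, not the pipeline's actual schema.
import pandas as pd

_inc = pd.DataFrame({'location_id': [102], 'acause': ['neo_lung'], 'cases': [120.0]})
_mir = pd.DataFrame({'location_id': [102], 'acause': ['neo_lung'], 'mir': [0.85]})
_est = _inc.merge(_mir, on=['location_id', 'acause'])
_est['deaths'] = _est['cases'] * _est['mir']   # -> 102.0 estimated deaths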
def load_lambda_file(cnf_model_version_id):
    ''' Using the cnf_lambda_version_id, returns the lambda values file
            for that version's datestamp suffix
    '''
    lambda_file_default = utils.get_path(
        "lambda_values", process="nonfatal_model")
    record = nd.get_run_record(cnf_model_version_id)
    lambda_version = record.at[0, 'cnf_lambda_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='cnf_lambda_version', 
                                uniq_col='cnf_lambda_version_id',
                                val=lambda_version)
    suffix = str(this_version.at[0, 'date_generated'])
    lambda_file = lambda_file_default.replace("<date>", suffix)
    return(lambda_file)
def load_surv_folder(cnf_model_version_id):
    ''' Using the rel_survival_version_id, returns the relative survival folder
            for that version's datestamp suffix
    '''
    surv_folder = utils.get_path("relative_survival",
                                 process="nonfatal_model")
    record = nd.get_run_record(cnf_model_version_id)
    rs_version = record.at[0, 'rel_survival_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='rel_survival_version', 
                                uniq_col='rel_survival_version_id',
                                val=rs_version)
    suffix = str(this_version.at[0, 'date_generated'])
    rs_folder = surv_folder.replace("<date>", suffix)
    return(rs_folder)
def get_current_mi_results(cod_mortality_version_id):
    ''' Returns the current/best compiled MI results
    '''
    db_link = cdb.db_api()
    mor_config = db_link.get_table('cod_mortality_version')

    # get mir_model_version_id
    # take the most recent matching entry
    mir_id = mor_config.loc[mor_config['cod_mortality_version_id'] ==
                            cod_mortality_version_id,
                            'mir_model_version_id'].iloc[-1]
    print('using mir model version id...{}'.format(mir_id))
    mi_path = utils.get_path(process='mir_model', key='compiled_mir_results')

    # replace suffix with model_version_id
    mi_path = mi_path.replace('<mir_model_version_id>', str(mir_id))
    compiled_mi_df = pd.read_csv(mi_path)
    return compiled_mi_df
def run_nonzero_floor():   
    ''' enforces cause-, age-, sex-, and year-specific minimums on non-zero 
        cause fractions. It allows cause fractions to be 0, but not to be 
        arbitrarily small.
    '''
    print('running nonzero floor regression...')
    db_link = cdb.db_api()
    nr_df = pd.read_csv(utils.get_path(process='cod_mortality', 
                                    key='noise_reduced'))
    cause_df = db_link.get_table('cod_model_entity')
    cause_df = cause_df.loc[cause_df['is_active'].eq(1),['cause_id','acause']]
    cause_metadata = nr_df.merge(cause_df, how='left', on='acause')
    cause_hierarchy = get_cause_hierarchy()
    nonzero_floorer = ra.NonZeroFloorer(cause_metadata)
    df = nonzero_floorer.get_computed_dataframe(get_pop(), get_env(), cause_hierarchy)
    df.to_csv(<FILE PATH>)
    return 
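# A minimal sketch of the non-zero floor rule described in the docstring above:
# cause fractions of exactly zero are left alone, while positive fractions below
# the cause/age/sex/year-specific floor are raised to that floor. The floor value
# and fractions below are illustrative assumptions.
import pandas as pd

_cf = pd.DataFrame({'cf': [0.0, 1e-9, 0.02]})
_floor = 1e-6
_below_floor = _cf['cf'].gt(0) & _cf['cf'].lt(_floor)
_cf.loc[_below_floor, 'cf'] = _floor   # -> [0.0, 1e-6, 0.02]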
def manage_split(source_cid, target_cids, proportion_meids, work_dir, description):
    ''' Manages the split of the source_cid followed by saving of the targets, 
            then returns boolean indication of success
    '''
    utils.ensure_dir(work_dir)
    # split model
    d_step = utils.get_gbd_parameter('current_decomp_step')
    df = split_cod_model(source_cause_id=source_cid,
                         target_cause_ids=target_cids,
                         target_meids=proportion_meids,
                         output_dir=work_dir,
                         decomp_step=d_step
                         )
    print("Split data saved to " + work_dir + " at " +
          utils.display_timestamp())
    # Generate a list of arguments (one for each child me)
    save_args_template = "--target {targ} --desc {desc} --indir {dir}"
    save_arg_list = []
    for t in target_cids:
        save_arg_list += [save_args_template.format(targ=t,
                                                    desc=description,
                                                    dir=work_dir)
                          ]
    # Start jobs
    header = description.replace(" ", "_")
    save_worker = utils.get_path("save_cod_worker", process="cancer_model")
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header=header,
                                         memory_request=50,
                                         id_list=target_cids,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()

    # Check for results
    job_descrip = description + " upload"
    success_df = cluster_tools.wait_for_results(job_dict,
                                                jobs_description=job_descrip,
                                                noisy_checker=False,
                                                max_minutes=30)
    success = cluster_tools.validate_success(success_df, description)
    return(success)
def update_repness(df):
    ''' Adds the superseding representative status from the override file
    '''
    repres_update_file = utils.get_path("representativeness_override",
                                        process="staging")
    repres_update = pd.read_csv(repres_update_file)
    repres_update = repres_update[[
        'country_id', 'grouping', 'representative']]
    repres_update.rename(
        columns={'representative': 'update_rep'}, inplace=True)
    df = modeled_locations.add_country_id(df)
    df.loc[df['location_id'] == df['country_id'], 'grouping'] = 'national'
    df.loc[df['grouping'] != 'national', 'grouping'] = 'subnational'
    df = df.merge(repres_update, how='left')
    df.loc[df['update_rep'].notnull(), 'representative'] = df['update_rep']
    df.loc[df['representative'].isnull(), 'representative'] = 0
    df = df.drop(['update_rep', 'grouping'], axis=1)
    return(df)
def prep_data(uid_cols, data_version_id, db):
    ''' Gets sample_size (total deaths, including cc_code, by sex, year,
        location, and age) and cause fractions (deaths / sample_size)
    '''
    print('prepping input...')
    df = pd.read_csv("{}/final_output.csv".format(
                            utils.get_path("mortality_model", 
                            base_folder = "storage")))
    df.loc[df['cf_raw'].isnull(), 'cf_raw'] = 0 
    # temporarily fill columns with 0 until post-age-sex
    # and post-rdp mortality values are available
    cf_cols = ['cf_rd', 'cf_corr'] 
    for c in cf_cols: 
        df[c] = 0
    df = df.loc[df['acause'].ne('cc_code'),] # drop cc_code. 
    prepped_df = add_required_columns(df, uid_cols, data_version_id, db)
    return(prepped_df)
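# A minimal sketch of the quantities described in the prep_data docstring:
# sample_size is total deaths (all causes plus cc_code) within a
# location/year/sex/age group, and the cause fraction is deaths / sample_size.
# Columns and values are illustrative assumptions only.
import pandas as pd

_toy = pd.DataFrame({
    'location_id':  [102, 102, 102],
    'year_id':      [2010, 2010, 2010],
    'sex_id':       [1, 1, 1],
    'age_group_id': [15, 15, 15],
    'acause':       ['neo_lung', 'neo_liver', 'cc_code'],
    'deaths':       [30.0, 10.0, 960.0],
})
_uid = ['location_id', 'year_id', 'sex_id', 'age_group_id']
_toy['sample_size'] = _toy.groupby(_uid)['deaths'].transform('sum')  # -> 1000.0
_toy['cf'] = _toy['deaths'] / _toy['sample_size']                    # e.g. 0.03 for neo_lung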
def split_liver():
    ''' Submits the liver-cancer-specific information to the split manager
    '''
    # set source and targets
    source_cid = 417  # parent cause_id
    target_cids = [996, 418, 419, 420, 421]  # cause_ids
    proportion_meids = [18763, 2470, 2471, 2472, 2473]  # proportion me_ids
    years = list(range(1980, int(utils.get_gbd_parameter("max_year")) + 1))
    description = "lvr_cncr_split"
    liver_model_path = utils.get_path(
        'cod_splits', process='cancer_model', base_folder='workspace')
    work_dir = "{}/{}".format(liver_model_path, utils.display_timestamp())
    # Run split
    success = manage_split(source_cid, target_cids, proportion_meids, work_dir,
                           description)
    if success:
        print("All CoD liver splits uploaded. " + utils.display_timestamp())
    else:
        print("Error during CoD splits for liver cancer")
def load_sdi_map():
    ''' Loads a map of sdi quintiles by location_id
    '''
    sdi_map = {
        'Low SDI': 1,
        'Low-middle SDI': 2,
        'Middle SDI': 3,
        'High-middle SDI': 4,
        'High SDI': 5
    }
    sdi_data = pd.read_csv(utils.get_path("sdi_quintiles"))
    sdi_data.rename(columns={'sdi_quintile': 'sdi_quintile_name'},
                    inplace=True)
    sdi_data = sdi_data.loc[sdi_data['sdi_quintile_name'].notnull(), :]
    sdi_data['sdi_quintile'] = sdi_data['sdi_quintile_name'].apply(
        lambda x: sdi_map[x])
    sdi_data = modeled_locations.add_country_id(sdi_data)
    sdi_data.loc[sdi_data['country_id'].eq(62), 'sdi_quintile'] = 4
    sdi_data.loc[sdi_data['country_id'].eq(90), 'sdi_quintile'] = 5
    return (sdi_data[['location_id', 'sdi_quintile']])
def add_location_hierarchy_info(df):
    ''' Returns the dataframe (df) with added location information: region,
            super_region, subnational_status, etc.
        Stops RDP if there is a problem with the location information
    '''
    print("    Adding location information.")
    input_len = len(df)
    # Reformat/convert variables and ages
    loc_info_dir = utils.get_path('mi_dataset_resources', process="mi_dataset")
    loc_info_path = loc_info_dir + '/redistribution/location_hierarchy.dta'
    location_hierarchy = pd.read_stata(loc_info_path)
    location_hierarchy = location_hierarchy[[
        'location_id', 'dev_status', 'super_region', 'region', 'country',
        'subnational_level1', 'subnational_level2'
    ]]
    df = df.merge(location_hierarchy, how='left', on='location_id')
    assert not df.location_id.isnull().any(), \
        "Cannot redistribute. Unmapped location ids present."
    assert len(
        df) == input_len, "ERROR: data lost while adding location metadata"
    return (df)
def _get_upload_path(table):
    """Get the path to write the file to."""
    upload_path = utils.get_path(key='cod_upload', process='cod_mortality')
    return upload_path
def submit_sr(calc_df, this_dataset, is_resubmission=False):
    ''' Splits data based on subtotal-recalculation requirement and submits
            jobs as needed to recalculate subtotals. Then returns a re-combined
            dataset with subtotals recalculated
    '''
    def submission_req(df, uid): 
        ''' Returns boolean indicating whether data are to be submitted, 
                qualified by whether subtotals are present and whether any 
                component codes exist that could enable recalculation
        '''
        uid_test = df[df['uniqid'].eq(uid)]
        meets_requirement = bool( has_subtotals(uid_test, 'orig_cause')
                    and components_present(uid_test) )
        return(meets_requirement)

    def output_file_func(id):
        ''' Returns the output file for a given uid; passed to get_results
                to locate worker results
        '''
        return(get_sr_file(this_dataset, 'split_output', id[0]))
    
    #
    output_uids = md.get_uid_cols(3)
    metric_name = this_dataset.metric
    job_header = "cnSR_{}_{}".format(dataset_id, data_type_id)
    sr_input_file = get_sr_file(this_dataset, "sr_input")
    worker_script = utils.get_path("subtotal_recalculation_worker",
                                                        process="mi_dataset")
    # convert components to string to enable save in hdf file
    uniqid_map = calc_df[output_uids + ['uniqid', 'orig_cause']
                         ].copy().drop_duplicates()
    submitted_data, unsubmitted_data = cup.split_submission_data(calc_df, 
                                        group_id_col='uniqid',
                                        submission_requirement=submission_req, 
                                        hdf_file=sr_input_file,
                                        regenerate_hdf=False)
    if len(submitted_data) == 0:
        final_results = unsubmitted_data
    else:
        uid_list = submitted_data['uniqid'].unique().tolist()
        sr_jobs = cup.generate_prep_workers(worker_script,
                                    list_of_uids=uid_list,
                                    ds_instance=this_dataset,
                                    job_header=job_header,
                                    is_resubmission=is_resubmission)
        output_files = cup.get_results(sr_jobs, 
                                    output_file_func,
                                    parent_process_name="sr",
                                    noisy_checker=True,
                                    add_resubmission_argument=is_resubmission,
                                    wait_time=5)
        # Re-combine compiled results with the set-aside data, before collapsing
        #   and testing
        results = pe.read_files(output_files)
        results.rename(columns={'cause':'orig_cause','codes_remaining':'cause'},
                       inplace=True)
        results = md.stdz_col_formats(results, additional_float_stubs='uniqid')
        results = results.merge(uniqid_map, how='outer', indicator=True)
        assert results['_merge'].isin(["both", "right_only"]).all(), \
            "Error merging with uids"
        del results['_merge']
        # entries with blank "cause" could not be corrected. replace with the 
        #   original aggregate (will be handled by cause recalculation and rdp).
        results.loc[results['cause'].eq(""), 'cause'] = results['orig_cause']
        #  drop causes that were zeroed in subtotal recalculation 
        results['total'] = results.groupby(output_uids)[metric_name].transform(sum)
        results = results.loc[results['total'].ne(0) &
                                results[metric_name].notnull(), :]
        final_results = results.append(unsubmitted_data)
    # Re-combine with data that were not split
    final_results = dft.collapse(final_results, by_cols=output_uids,
                                    combine_cols=this_dataset.metric)
    return(final_results)
def run(dataset_id, data_type_id, uid):
    ''' Preps data for recalculation then recalculates as necessary
    '''
    this_dataset = md.MI_Dataset(dataset_id, 2, data_type_id)
    dataset_name = this_dataset.name
    metric = this_dataset.metric
    input_file = run_sr.get_sr_file(this_dataset, "sr_input")
    # Exit if output already exists
    output_file = run_sr.get_sr_file(this_dataset, 'split_output', uid)
    print(output_file)
    if os.path.exists(output_file):
        print("     output file found for uid " + str(uid))
        return (None)
    #
    negative_data_ok = is_exception(dataset_id, data_type_id)
    error_folder = utils.get_path("mi_input", base_folder='j_temp')
    subcause_issue_file = '{}/subcause_issue/{}_{}_uid_{}.txt'.format(
        error_folder, dataset_name, data_type_id, uid)
    exception_file = '{}/negative_data/{}_{}_uid_{}.csv'.format(
        error_folder, dataset_name, data_type_id, uid)
    for d in [subcause_issue_file, exception_file, error_folder]:
        utils.ensure_dir(d)
    #
    print("    removing subtotals from uid {}...".format(uid))
    # add data for the given uid
    df = pd.read_hdf(input_file, 'split_{}'.format(uid))
    # Create a list of possible codes so that decimal subcauses are only added
    #   if available
    input_cause_list = sorted(df['orig_cause'].unique().tolist())
    # create a dictionary for codes in the selected uid and attach the uid's
    #   data
    uid_subset = {}
    input_data = {}
    # process decimals first and ranges last to ensure that nested causes are
    #   removed
    for c in sorted(df['orig_cause'].unique().tolist()):
        uid_subset[c] = {}
        input_data[c] = {}
        uid_subset[c]['codes'] = []
        uid_subset[c]['subcodes'] = []
        if "-" not in c and "," not in c:
            uid_subset[c]['codes'].append(c)
            # add subcodes to 'subcode' key
            df.loc[df['orig_cause'].eq(c), 'cause'].dropna().unique().tolist()
            for subcode in sorted(df['cause'].where(
                    df['orig_cause'] == c).dropna().unique().tolist()):
                if subcode != c:
                    uid_subset[c]['subcodes'].append(subcode)
            # if none of the subcodes appear in the list, set the cause as a
            #   subcode of itself (prevents the addition of unused decimal
            #   causes)
            if not len(uid_subset[c]['subcodes']):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
            elif (not any('{}.'.format(sub[:3]) in check
                          for check in input_cause_list
                          for sub in uid_subset[c]['subcodes'])):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
        else:
            for code in sorted(df['cause'].where(
                    df['orig_cause'].eq(c)).dropna().unique().tolist()):
                uid_subset[c]['codes'].append(code)
                uid_subset[c]['subcodes'].append(code)
        # create other lists associated with the cause and add the metric data
        uid_subset[c]['subcauses_remaining'] = []
        uid_subset[c]['codes_removed'] = []
        uid_subset[c]['causes_removed'] = []
        uid_subset[c]['data'] = df.loc[df['cause'].eq(c),
                                       ['age', metric]].set_index('age')
        input_data[c]['data'] = uid_subset[c]['data']
        input_data[c]['codes'] = uid_subset[c]['codes']
    # Determine subcauses and highest number of causes remaining (how many
    #   subcauses are contained within each cause)
    uid_set = set_subcauses(uid_subset, subcause_issue_file)
    highest_level = determine_highest_level(uid_set)
    # remove lowest level codes from parent causes
    if highest_level == 0:
        print('     no subcauses present.')
    else:
        subcauses_removed = True
        while subcauses_removed:
            uid_set, subcauses_removed = remove_subcauses(
                uid_set, uid, exception_file)
            # remove duplicates
            uid_set = remove_duplicates(uid_set)
            # re-set subcauses and num_subcause_remaining
            uid_set, highest_level = set_subcauses(
                uid_set,
                subcause_issue_file,
            )
            print("     subcauses removed.")
    # Prepare Output
    print("saving output...")
    output = pd.DataFrame(
        columns=['cause', 'codes_remaining', 'codes_removed', 'age', metric])
    for c in uid_set:
        # format cause information
        cause_data = pd.DataFrame(
            columns=['cause', 'codes_remaining', 'codes_removed'])
        cause_data.loc[0, ['cause']] = c
        # if nothing was removed, or there was only a single cause, or all of
        #   the input codes are still present, set the codes remaining as the
        #   cause
        if (not len(uid_set[c]['codes_removed'])
                or ("-" not in c and "," not in c)
                or set(input_data[c]['codes']) <= set(uid_set[c]['codes'])):
            cause_data.loc[0, ['codes_remaining']] = c
        else:
            cause_data.loc[0, ['codes_remaining']] = ','.join(
                convert_to_range(uid_set[c]['codes']))
        cause_data.loc[0, ['codes_removed']] = ','.join(
            convert_to_range(uid_set[c]['codes_removed']))
        # format output data
        output_data = uid_set[c]['data']
        output_data['age'] = output_data.index
        output_data['cause'] = c
        orig_data = input_data[c]['data']
        orig_data['age'] = orig_data.index
        orig_data = orig_data.rename(columns={metric: 'orig_metric_value'})
        orig_data['cause'] = c
        # combine and add to output
        final = pd.merge(output_data, cause_data, on='cause')
        final = pd.merge(final, orig_data, on=['cause', 'age'])
        output = output.append(final)
    # Create output dataset
    output['uniqid'] = uid
    # Update encoding (bug fix to work around pandas to_stata issue)
    output = md.stdz_col_formats(output, additional_float_stubs='uniqid')
    # Export results
    output.to_csv(output_file, index=False)
    print('\n Done!')
    time.sleep(1)
    return (None)
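# A worked sketch of the subtotal-recalculation idea implemented above: when an
# aggregate code range and some of its component codes are both reported, the
# component counts are removed from the aggregate so only the residual remains.
# The codes and counts below are illustrative assumptions.
#   reported: 'C18-C21' = 100 cases, with components 'C18' = 40 and 'C19' = 25
#   after removal, 'C18-C21' keeps codes_remaining 'C20-C21' with 35 cases
_aggregate_cases = 100.0
_component_cases = {'C18': 40.0, 'C19': 25.0}
_residual_cases = _aggregate_cases - sum(_component_cases.values())   # -> 35.0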