Example #1
def split_estimates(modelable_entity_id, cnf_model_version_id):
    ''' Call shared function to apply proportions that split the parent 
            modelable entity into component modelable_entities
    '''
    def output_file_func(id):
        return (nd.nonfatalDataset("upload", id).get_output_file('upload'))

    parent_me = modelable_entity_id
    work_dir = get_work_dir(parent_me, cnf_model_version_id)
    utils.ensure_dir(work_dir)
    children_mes, proportion_mes, me_tag = get_me_info(modelable_entity_id,
                                                       parent_me)
    measures = get_measures(me_tag)
    # Clear the work directory (required), then split the model
    utils.clean_directory_tree(work_dir)
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')

    if modelable_entity_id == 1678:
        meas_ids = [5, 6]
    else:
        meas_ids = [5]
    split_epi_model(source_meid=parent_me,
                    target_meids=children_mes,
                    prop_meids=proportion_mes,
                    decomp_step=d_step,
                    split_measure_ids=meas_ids,
                    gbd_round_id=gbd_id,
                    output_dir=work_dir)
    print("split data saved to " + work_dir)
Example #2
def save_model_results(df, metric_name, acause, faux_correct):
    ''' Saves a separate output file for each me_tag in the dataframe
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns('{}{}'.format(decomp_str, metric_name))
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    d_step = utils.get_gbd_parameter('current_decomp_step')
    if metric_name == "incidence":
        measure_id = utils.get_gbd_parameter('incidence_measure_id')
        df.loc[:, 'me_tag'] = 'primary_phase'
    elif metric_name == "prevalence":
        measure_id = utils.get_gbd_parameter('prevalence_measure_id')
    else:
        raise ValueError("unsupported metric_name: {}".format(metric_name))
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        if me_id is None:
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        output_data = df.loc[df['me_tag'].eq(this_tag), uid_cols + data_cols]
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        output_data['upper'] = np.nan
        output_data['lower'] = np.nan
        output_data['uncertainty_type_value'] = np.nan
        output_data['is_outlier'] = 0
        output_data['step4_location_year'] = '{} updated estimates'.format(
            d_step)
        nd.save_outputs(
            "final_results",
            output_data,
            acause,
            me_id,
            measure_id,
        )
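A self-contained toy sketch of the column relabeling step above; the column names and values below are made up and stand in for the real nd.get_columns() output.

import numpy as np
import pandas as pd

# 'inc_0'/'inc_1' stand in for the decomp-prefixed data columns and
# 'draw_0'/'draw_1' for the epi draw columns; values are fabricated.
uid_cols = ['location_id', 'year_id']
data_cols = ['inc_0', 'inc_1']
draw_cols = ['draw_0', 'draw_1']
toy = pd.DataFrame({'location_id': [102], 'year_id': [2019],
                    'inc_0': [0.10], 'inc_1': [0.12]})
out = toy[uid_cols + data_cols].copy()
out.columns = uid_cols + draw_cols            # relabel data columns as draw columns
out['modelable_entity_id'] = 1234             # placeholder me_id
for col in ['upper', 'lower', 'uncertainty_type_value']:
    out[col] = np.nan                         # epi upload expects these columns
out['is_outlier'] = 0
print(out)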
Example #3
def get_pop(locset_id=8):
    ''' Returns population estimates
    '''
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    yr_list = list(range(1980, 2030))
    pop_df = get_population(age_group_id=-1, location_id=-1,
                            location_set_id=locset_id,
                            year_id=yr_list,
                            sex_id=-1,
                            decomp_step=d_step,
                            gbd_round_id=gbd_id)
    return pop_df
Example #4
def get_env(): 
    ''' Returns the current GBD all-cause mortality envelope
    '''
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    yr_list = list(range(1980, 2030))
    env_df = get_envelope(age_group_id=-1, location_id=-1, location_set_id=8,
                          year_id=yr_list,
                          sex_id=-1,
                          decomp_step=d_step,
                          gbd_round_id=gbd_id)
    env_df.rename(columns={"mean": "mean_env"}, inplace=True)
    return env_df
Example #5
def save_worker(target_id, description, input_dir):
    print("saving {}...".format(description))
    d_step = utils.get_gbd_parameter('current_decomp_step')
    years = list(range(1980, int(utils.get_gbd_parameter("max_year")) + 1))
    save_results_cod(input_dir=input_dir,
                     input_file_pattern='death_{location_id}.csv',
                     cause_id=target_id,
                     description=description,
                     sex_id=[1, 2],
                     metric_id=1,
                     year_id=years,
                     mark_best=True,
                     decomp_step=d_step)
    print("model saved.")
Example #6
def add_required_columns(df, uid_cols, data_version_id, db):
    ''' adding in required columns for upload
    '''
    print('adding required columns...')
    final_df = df.copy()
    final_df['data_version_id'] = data_version_id

    # rename column names
    final_df.rename(columns={'national': 'representative_id',
                             'sex': 'sex_id',
                             'year': 'year_id'},
                    inplace=True)

    # add temp cf columns
    cf_var_cols = ['cf_final_low_rd', 'cf_final_high_rd',
                   'cf_final_low_ss', 'cf_final_high_ss',
                   'cf_final_low_total', 'cf_final_high_total',
                   'variance_rd_logit_cf', 'variance_rd_log_dr']
    for col in cf_var_cols:
        final_df[col] = 0

    # add cause_id 
    cancer_link = cdb.db_api()
    gbd_id = utils.get_gbd_parameter('current_gbd_round')

    # other columns that are constant 
    final_df['underlying_nid'] = np.nan
    final_df['source_id'] = 68 # cancer default 
    final_df['data_type_id'] = 2 # cancer registry

    # add site labels. upload to site table if new sources are present 
    final_df = map_site_id(final_df, db)
    return(final_df)
Example #7
def load_procedure_proportions(procedure_me_id, location_id):
    ''' Downloads estimates for the proportion of the cancer population that
            receives a given procedure
    '''
    print("    loading procedure proportions...")
    # get decomp_step and gbd_round
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    prop_df = get_draws(gbd_id_type='modelable_entity_id',
                        source='epi',
                        measure_id=18,  # GBD measure_id for 'proportion'
                        gbd_id=procedure_me_id,
                        location_id=location_id,
                        gbd_round_id=gbd_id,
                        decomp_step=d_step)
    return (prop_df)
Example #8
def save_worker(meid, meas_ids, description, input_dir, cnf_run_id):
    print("saving {}...".format(description))
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    estimation_yrs = [1990, 2000, 2017]  # temporary for fauxcorrect
    try:
        success_df = save_results_epi(modelable_entity_id=meid,
                                      description=description,
                                      input_dir=input_dir,
                                      measure_id=meas_ids,
                                      mark_best=True,
                                      n_draws=1000,
                                      decomp_step=d_step,
                                      gbd_round_id=gbd_id,
                                      input_file_pattern="{location_id}.h5")
    except Exception as e:
        print("save_results_epi failed for me_id {}: {}".format(meid, e))
        success_df = pd.DataFrame()
    return (success_df)
Example #9
def refine_by_cc_code(df):
    ''' Generates a 'cc_code' (measure of the remaining difference between cancer
            mortality and all-cause mortality) and drops data that are not
            credible (cancer deaths > 70% of all-cause mortality)
    '''
    uid_cols = ['country_id'] + \
        [c for c in get_uid_cols() if c not in ['acause']]
    # Set max proportion of all-cause mortality that could possibly come from cancer
    max_pct_cancer = 0.70
    print("Entries before cc_code refinement: {}".format(len(df)))
    # Calculate cc_code as the difference between total cancer and all-cause mortality
    loc_list = df['location_id'].unique().tolist()
    loc_list = [l for l in loc_list if str(l) != 'nan']
    env = load_mortality_envelope(loc_list,
                                  df['age_group_id'].unique().tolist(),
                                  df['year_id'].unique().tolist())
    deaths_df = df.loc[
        ~df['acause'].str.contains("neo_leukemia_"), :]  # remove child causes
    deaths_df = deaths_df.groupby(uid_cols, as_index=False).agg({
        'deaths': 'sum',
        'pop': 'mean'
    }).rename(columns={'deaths': 'cancer_deaths'})
    cc_df = deaths_df.merge(
        env,
        how='inner',
        on=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    cc_df['total_deaths'] = cc_df['death_rate'] * cc_df['pop']
    cc_df.loc[:, ['total_deaths', 'cancer_deaths']] = \
        cc_df[['total_deaths', 'cancer_deaths']].fillna(0)
    valid_estimates = (cc_df['cancer_deaths'] <=
                       max_pct_cancer * cc_df['total_deaths'])
    cc_df = cc_df.loc[valid_estimates, :]
    cc_df['deaths'] = cc_df['total_deaths'] - cc_df['cancer_deaths']
    cc_df['acause'] = "cc_code"
    cc_df['registry_index'] = "0.0.1"
    cc_df['NID'] = utils.get_gbd_parameter('generic_cancer_nid')
    cc_df['dataset_id'] = 3
    cc_df = cc_df.drop(['total_deaths', 'cancer_deaths', 'death_rate'], axis=1)
    cc_df.drop_duplicates(inplace=True)
    # Attach cc_code data to main dataset and return. First subset df to only
    #   those uids with valid cc_code values, then append the full cc_code values
    # subset output to only valid cc_code
    output = df.merge(cc_df[uid_cols], how='inner')
    print("Entries after cc_code refinement: {}".format(len(output)))
    output = pd.concat([output, cc_df])  # append the cc_code rows
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    print("Final entries with cc_code attached: {}".format(len(output)))
    assert not output[output.duplicated(get_uid_cols())].any().any(), \
        "Duplicate entries present at end of refine_by_cc_code"
    assert not df['deaths'].isnull().any(), \
        "Mortality estimates lost while calulating cc_code"
    return (output)
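A self-contained toy version of the cc_code arithmetic, using fabricated numbers, to show how the 70% threshold drops implausible rows and how the cc_code deaths are derived.

import pandas as pd

# Two toy location-years with 100 all-cause deaths each; the second exceeds the
# 70% cancer-share threshold and is dropped.
toy = pd.DataFrame({'location_id': [1, 2],
                    'cancer_deaths': [50.0, 90.0],
                    'death_rate': [0.01, 0.01],
                    'pop': [10000.0, 10000.0]})
max_pct_cancer = 0.70
toy['total_deaths'] = toy['death_rate'] * toy['pop']
valid = toy['cancer_deaths'] <= max_pct_cancer * toy['total_deaths']
cc = toy.loc[valid].copy()
cc['deaths'] = cc['total_deaths'] - cc['cancer_deaths']   # cc_code = remainder
cc['acause'] = 'cc_code'
print(cc[['location_id', 'deaths', 'acause']])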
Example #10
def extract_single_nid(nid_entry):
    ''' Resolves an NID entry (possibly a stringified tuple or list) to a single
            integer NID, falling back to the generic cancer NID
    '''
    try:
        nid_entry = literal_eval(nid_entry)
    except ValueError:
        pass
    if not isinstance(nid_entry, (tuple, list)):
        nid_entry = [nid_entry]
    if len(nid_entry) == 1:
        if str(nid_entry[0]).isdigit() and str(nid_entry[0]) != '0':
            return (int(nid_entry[0]))
        else:
            pass
    return (int(utils.get_gbd_parameter('generic_cancer_nid')))
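A few standard-library illustrations of the parsing step: literal_eval converts stringified tuples and integers back into Python objects, while non-literal strings raise ValueError and fall through to the generic cancer NID. The values are placeholders.

from ast import literal_eval

print(literal_eval("(12345,)"))   # single-element tuple -> usable NID
print(literal_eval("12345"))      # bare integer -> usable NID
try:
    literal_eval("not_an_nid")    # non-literal string
except ValueError:
    print("left unparsed; extract_single_nid falls back to the generic NID")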
Example #11
def format_CoD_variables(df):
    ''' Updates data formats to comply with CoD specifications
    '''
    print("updating variable formats...")
    # Ensure presence of single NID column
    df.rename(columns={'NID': 'nid_input'}, inplace=True)
    df['NID'] = utils.get_gbd_parameter(
        'generic_cancer_nid')  # NOTE: long runtime
    # update age categories to reflect CoD categories
    df['age'] = df['age_group_id'] + 1
    df.loc[df['age_group_id'] >= 30, 'age'] = df['age_group_id'] - 8
    df.loc[df['age_group_id'] == 235, 'age'] = 25
    # Test output
    uid_cols = list(set(get_uid_cols()) - {'location_id'}) + ['iso3']
    assert not df[df.duplicated(uid_cols)].any().any(), \
        "Duplicate values present at end of format_CoD_variables"
    return (df)
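A standard-library restatement of the age recoding above, assuming the usual GBD age_group_ids (2-20 for the young and 5-year groups, 30-32 for 80-94, and 235 for 95+).

def to_cod_age(age_group_id):
    # Same arithmetic as format_CoD_variables, applied to one id at a time.
    if age_group_id == 235:       # 95+
        return 25
    if age_group_id >= 30:        # ids 30-32 (80-84, 85-89, 90-94)
        return age_group_id - 8
    return age_group_id + 1       # ids 2-20 shift up by one

# ids 20, 30, 31, 32, 235 map onto the contiguous CoD codes 21-25
print([to_cod_age(a) for a in (2, 20, 30, 31, 32, 235)])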
Example #12
def get_data_version_id_cols(launch_set_id): 
    ''' Returns a dictionary with default values for columns in the data_version table
    '''
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    desc = get_cod_description()
    new_dv_entry = {'gbd_round_id': gbd_id,
                    'nid': 284465,
                    'underlying_nid': np.nan,
                    'data_type_id': 2,
                    'status_start': datetime.now(),
                    'source_id': 68,
                    'launch_set_id': launch_set_id,
                    'description': desc,
                    'status': 2,
                    'tool_type_id': 9
                    }
    return new_dv_entry
Example #13
def manage_split(source_cid, target_cids, proportion_meids, work_dir, description):
    ''' Manages the split of the source_cid followed by saving of the targets, 
            then returns a boolean indication of success
    '''
    utils.ensure_dir(work_dir)
    # split model
    d_step = utils.get_gbd_parameter('current_decomp_step')
    df = split_cod_model(source_cause_id=source_cid,
                         target_cause_ids=target_cids,
                         target_meids=proportion_meids,
                         output_dir=work_dir,
                         decomp_step=d_step
                         )
    print("Split data saved to " + work_dir + " at " +
          utils.display_timestamp())
    # Generate a list of arguments (one for each child me)
    save_args_template = "--target {targ} --desc {desc} --indir {dir}"
    save_arg_list = []
    for t in target_cids:
        save_arg_list += [save_args_template.format(targ=t,
                                                    desc=description,
                                                    dir=work_dir)
                          ]
    # Start jobs
    header = description.replace(" ", "_")
    save_worker = utils.get_path("save_cod_worker", process="cancer_model")
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header=header,
                                         memory_request=50,
                                         id_list=target_cids,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()

    # Check for results
    job_descrip = description + " upload"
    success_df = cluster_tools.wait_for_results(job_dict,
                                                jobs_description=job_descrip,
                                                noisy_checker=False,
                                                max_minutes=30)
    success = cluster_tools.validate_success(success_df, description)
    return(success)
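For reference, a quick sketch of what one element of save_arg_list looks like after formatting; the cause_id and description come from the liver split example below, and the directory is a placeholder.

save_args_template = "--target {targ} --desc {desc} --indir {dir}"
print(save_args_template.format(targ=418, desc="lvr_cncr_split", dir="/path/to/work_dir"))
# -> --target 418 --desc lvr_cncr_split --indir /path/to/work_dir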
Example #14
def calc_procedure_tenplus(inc_df, proportions, acause, location_id,
                           faux_correct):
    ''' Multiplies incidence draws by the procedure proportion and the absolute
            survival proportion at 10 years to estimate the number of cases
            surviving for at least 10 years
    '''
    # Load known values
    print(
        "    calculating the incidence of procedures with surv > ten years...")
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset().uid_cols
    type_cols = nd.get_columns('{}incidence'.format(decomp_str))
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    abs_surv_draw_cols = nd.get_columns(
        '{}absolute_survival'.format(decomp_str))
    max_estimation_year = utils.get_gbd_parameter('max_year')
    max_survival_months = nd.nonfatalDataset().max_survival_months
    # Estimate incidence of procedure
    mrg_df = inc_df.merge(proportions)
    adj_df = mrg_df[uid_cols].copy()
    num_procedures = (mrg_df[type_cols].values * mrg_df[draw_cols].values)
    adj_df[type_cols] = pd.DataFrame(num_procedures).fillna(0)
    # Estimate number of procedures resulting in survival beyond ten years
    surv_df = load_estimates('survival', acause, location_id, faux_correct)
    surv_df = surv_df.loc[surv_df['survival_month'].eq(max_survival_months),
                          uid_cols + abs_surv_draw_cols]
    adj_df = adj_df.merge(surv_df)
    pbt_df = adj_df[uid_cols].copy()
    num_procedures_10ys = adj_df[type_cols].values * \
                                        adj_df[abs_surv_draw_cols].values
    pbt_df[draw_cols] = pd.DataFrame(num_procedures_10ys).fillna(0)
    # Update years and age categories
    pbt_df.loc[:, 'age_group_id'] = pbt_df['age_group_id'].apply(
        add_decade_to_age)
    pbt_df.loc[:, 'year_id'] += 10
    # drop data that are now out of scope
    pbt_df = pbt_df.loc[pbt_df['year_id'] <= max_estimation_year, :]
    # For procedures whose sequelae are fractional, split across sequelae;
    # otherwise attach the single procedure_sequelae modelable entity
    if sequelae_fractions(acause):
        pbt_df = split_sequelae(pbt_df, acause, location_id)
    else:
        pbt_df.loc[:, 'modelable_entity_id'] = \
            nd.get_modelable_entity_id(acause, 'procedure_sequelae')
    return (pbt_df)
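A toy pandas sketch of the ten-year shift and the out-of-scope drop; add_decade_to_age is internal, so a simple stand-in (two 5-year age_group_ids per decade) is used here, and max_estimation_year is a placeholder.

import pandas as pd

def add_decade_to_age(age_group_id):
    # Stand-in only: GBD 5-year groups advance two ids per decade.
    return age_group_id + 2

max_estimation_year = 2019
toy = pd.DataFrame({'age_group_id': [10, 12],
                    'year_id': [2005, 2015],
                    'draw_0': [1.0, 2.0]})
toy['age_group_id'] = toy['age_group_id'].apply(add_decade_to_age)
toy['year_id'] += 10
toy = toy.loc[toy['year_id'] <= max_estimation_year]   # 2015 + 10 falls out of scope
print(toy)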
Example #15
def split_liver():
    ''' Submits the liver-cancer-specific information to the split manager
    '''
    # set source and targets
    source_cid = 417  # parent cause_id
    target_cids = [996, 418, 419, 420, 421]  # cause_ids
    proportion_meids = [18763, 2470, 2471, 2472, 2473]  # proportion me_ids
    years = list(range(1980, int(utils.get_gbd_parameter("max_year")) + 1))
    description = "lvr_cncr_split"
    liver_model_path = utils.get_path(
        'cod_splits', process='cancer_model', base_folder='workspace')
    work_dir = "{}/{}".format(liver_model_path, utils.display_timestamp())
    # Run split
    success = manage_split(source_cid, target_cids, proportion_meids, work_dir,
                           description)
    if success:
        print("All CoD liver splits uploaded. " + utils.display_timestamp())
    else:
        print("Error during CoD splits for liver cancer")
Example #16
def load_mortality_envelope(location_id_list, age_group_list, year_list):
    ''' Returns the current all-cause mortality envelope
    '''
    dstep = utils.get_gbd_parameter('current_decomp_step')
    env = get_envelope(sex_id=[1, 2],
                       location_id=location_id_list,
                       year_id=year_list,
                       age_group_id=age_group_list,
                       decomp_step=dstep)
    env.rename(columns={'mean': 'envelope'}, inplace=True)
    pop = get_population(sex_id=[1, 2],
                         location_id=location_id_list,
                         year_id=year_list,
                         age_group_id=age_group_list,
                         decomp_step=dstep)
    env = env.merge(pop,
                    on=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    env['death_rate'] = env['envelope'] / env['population']
    env = env[[
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'death_rate'
    ]]
    return (env)
Example #17
def clear_prev_data_version_status(database, table):
    """Update the data_version table.
    """
    date = make_db_datestamp()  
    can_nid = utils.get_gbd_parameter('generic_cancer_nid')
    update_query = """
        UPDATE cod.{tbl}
        SET status_end = "{dt}", status='0'
        WHERE nid={nid}
        AND status=1
    """
    #conn_string = cdb.create_connection_string('testcod')
    engine = get_engine(conn_def=database)
    conn = engine.connect()

    res = conn.execute(update_query.format(
        tbl=table,
        dt=date,
        nid=can_nid 
    ))

    conn.close()
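For clarity, a standard-library sketch of the SQL string that the .format call produces; the table name, datestamp, and NID below are placeholders.

update_query = """
    UPDATE cod.{tbl}
    SET status_end = "{dt}", status='0'
    WHERE nid={nid}
    AND status=1
"""
print(update_query.format(tbl="data_version", dt="2020-01-01 00:00:00", nid=284465))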
Example #18
    def _check_all_floors_exist(self, nzf_df):
        ''' Check that all expected cancers, ages, and years are present and have
            nonzero floor values
        '''
        def _remove_ages_less_than(a, b):
            ''' Removes age_group_ids below the start id b from list a, except
                that the under-5 ids (2, 3, 4) are kept when b is 5
            '''
            orig_list = a.copy()
            for val in orig_list:
                if b == 5 and val in [2, 3, 4]:
                    continue
                if val < b:
                    a.remove(val)
            return a

        print("CHECKING FOR ALL CAUSES, AGES, and YEARS...")
        # create cause_list
        db_link = cdb.db_api(db_connection_name='cancer_db')
        gbd_id = utils.get_gbd_parameter('current_gbd_round')
        registry_entity = db_link.get_table('registry_input_entity')
        registry_entity = registry_entity.loc[
            registry_entity['gbd_round_id'].eq(gbd_id)
            & registry_entity['is_active'].eq(1), ]
        cancer_metadata = registry_entity[[
            'acause', 'cause_id', 'yll_age_start', 'yll_age_end'
        ]]
        causes_checklist = registry_entity['acause'].unique().tolist()

        # exceptions for nonzero floors
        causes_checklist.remove('neo_nmsc_bcc')
        causes_checklist.remove('neo_ben_intest')
        causes_checklist.remove('neo_ben_utr')
        causes_checklist.remove('neo_ben_other')
        causes_checklist.remove('neo_ben_brain')
        causes_checklist.remove('_gc')

        # create year_list
        year_start = utils.get_gbd_parameter('min_year_cod')
        year_end = utils.get_gbd_parameter('max_year')  # + 1 for GBD2020
        year_checklist = list(range(year_start, year_end))

        # sex &  age_id checklist
        age_id_checklist = [
            5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 31,
            32, 235, 2, 3, 4
        ]  #age_ids for 0-95 ages
        sex_checklist = [1, 2]

        # print any causes/years/sexes that are expected and missing
        for cancer in causes_checklist:
            print('working on...{}'.format(cancer))
            subset = nzf_df.loc[nzf_df['acause'].eq(cancer), ]
            age_start = int(
                cancer_metadata.loc[cancer_metadata['acause'].eq(cancer),
                                    'yll_age_start'])
            age_start = (age_start /
                         5) + 5  # conversion from age to GBD age_group_id
            if len(subset) == 0:
                print('MISSING CAUSE... {} '.format(cancer))
            missing_ages = set(age_id_checklist) - set(
                subset['age_group_id'].unique().tolist())
            missing_ages = list(missing_ages)
            missing_ages = _remove_ages_less_than(missing_ages, age_start)
            if len(missing_ages) > 0:
                print('missing the following ages for {}: {}'.format(
                    cancer, missing_ages))
            missing_sexes = set(sex_checklist) - set(
                subset['sex_id'].unique().tolist())
            if len(missing_sexes) > 0:
                print('missing the following sexes for {}: {}'.format(
                    cancer, missing_sexes))
            missing_years = set(year_checklist) - set(
                subset['year_id'].unique().tolist())
            if len(missing_years) > 0:
                print('missing the following years for {}: {}'.format(
                    cancer, missing_years))
        return
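A small check of the yll_age_start conversion used above, assuming the standard GBD 5-year age_group_ids (e.g. id 8 covers ages 15-19).

# yll_age_start is in years; dividing by 5 and adding 5 gives the age_group_id
# of the first eligible 5-year group (15 -> 8.0, 40 -> 13.0).
for yll_age_start in (0, 15, 40):
    print(yll_age_start, '->', (yll_age_start / 5) + 5)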