def split_estimates(modelable_entity_id, cnf_model_version_id):
    ''' Call shared function to apply proportions that split the parent
        modelable entity into component modelable_entities
    '''
    def output_file_func(id):
        return (nd.nonfatalDataset("upload", id).get_output_file('upload'))

    parent_me = modelable_entity_id
    work_dir = get_work_dir(parent_me, cnf_model_version_id)
    utils.ensure_dir(work_dir)
    children_mes, proportion_mes, me_tag = get_me_info(modelable_entity_id,
                                                       parent_me)
    measures = get_measures(me_tag)
    # Clear the work directory (required), then split the model
    utils.clean_directory_tree(work_dir)
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    if modelable_entity_id == 1678:
        meas_ids = [5, 6]
    else:
        meas_ids = [5]
    split_epi_model(source_meid=parent_me,
                    target_meids=children_mes,
                    prop_meids=proportion_mes,
                    decomp_step=d_step,
                    split_measure_ids=meas_ids,
                    gbd_round_id=gbd_id,
                    output_dir=work_dir)
    print("split data saved to " + work_dir)
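
# A minimal usage sketch, assuming a valid cnf_model_version_id (the value
# below is hypothetical). Per standard GBD measure ids, every parent here
# splits prevalence (measure_id 5); ME 1678 additionally splits incidence
# (measure_id 6):
#
#   split_estimates(modelable_entity_id=1678, cnf_model_version_id=1)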
def save_model_results(df, metric_name, acause, faux_correct):
    ''' Saves a separate output file for each me_tag in the dataframe
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns('{}{}'.format(decomp_str, metric_name))
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    d_step = utils.get_gbd_parameter('current_decomp_step')
    if metric_name == "incidence":
        measure_id = utils.get_gbd_parameter('incidence_measure_id')
        df.loc[:, 'me_tag'] = 'primary_phase'
    elif metric_name == "prevalence":
        measure_id = utils.get_gbd_parameter('prevalence_measure_id')
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        if me_id is None:
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        output_data = df.loc[df['me_tag'].eq(this_tag), uid_cols + data_cols]
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        output_data['upper'] = np.NaN
        output_data['lower'] = np.NaN
        output_data['uncertainty_type_value'] = np.NaN
        output_data['is_outlier'] = 0
        output_data['step4_location_year'] = '{} updated estimates'.format(
            d_step)
        nd.save_outputs("final_results", output_data, acause, me_id,
                        measure_id)
def get_pop(locset_id=8):
    ''' Returns population estimates
    '''
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    yr_list = list(range(1980, 2030))
    pop_df = get_population(age_group_id=-1,
                            location_id=-1,
                            location_set_id=locset_id,
                            year_id=yr_list,
                            sex_id=-1,
                            decomp_step=d_step,
                            gbd_round_id=gbd_id)
    return (pop_df)
def get_env():
    ''' Returns the current GBD envelope
    '''
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    yr_list = list(range(1980, 2030))
    env_df = get_envelope(age_group_id=-1,
                          location_id=-1,
                          location_set_id=8,
                          year_id=yr_list,
                          sex_id=-1,
                          decomp_step=d_step,
                          gbd_round_id=gbd_id)
    env_df.rename(columns={"mean": "mean_env"}, inplace=True)
    return env_df
def save_worker(target_id, description, input_dir):
    print("saving {}...".format(description))
    d_step = utils.get_gbd_parameter('current_decomp_step')
    years = list(range(1980, int(utils.get_gbd_parameter("max_year")) + 1))
    save_results_cod(input_dir=input_dir,
                     input_file_pattern='death_{location_id}.csv',
                     cause_id=target_id,
                     description=description,
                     sex_id=[1, 2],
                     metric_id=1,
                     year_id=years,
                     mark_best=True,
                     decomp_step=d_step)
    print("model saved.")
def add_required_columns(df, uid_cols, data_version_id, db):
    ''' Adds required columns for upload
    '''
    print('adding required columns...')
    final_df = df.copy()
    final_df['data_version_id'] = data_version_id
    # rename column names
    final_df.rename(columns={"national": "representative_id",
                             'sex': 'sex_id',
                             'year': 'year_id'},
                    inplace=True)
    # add temp cf columns
    cf_var_cols = ['cf_final_low_rd', 'cf_final_high_rd', 'cf_final_low_ss',
                   'cf_final_high_ss', 'cf_final_low_total',
                   'cf_final_high_total', 'variance_rd_logit_cf',
                   'variance_rd_log_dr']
    for col in cf_var_cols:
        final_df[col] = 0
    # add cause_id
    cancer_link = cdb.db_api()
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    # other columns that are constant
    final_df['underlying_nid'] = np.nan
    final_df['source_id'] = 68  # cancer default
    final_df['data_type_id'] = 2  # cancer registry
    # add site labels. upload to site table if new sources are present
    final_df = map_site_id(final_df, db)
    return (final_df)
def load_procedure_proportions(procedure_me_id, location_id):
    ''' Downloads estimates for the proportion of the cancer population that
        receives a given procedure
    '''
    print("    loading procedure proportions...")
    # get decomp_step
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    prop_df = get_draws(gbd_id_type='modelable_entity_id',
                        source='epi',
                        measure_id=18,
                        gbd_id=procedure_me_id,
                        location_id=location_id,
                        gbd_round_id=gbd_id,
                        decomp_step=d_step)
    return (prop_df)
def save_worker(meid, meas_ids, description, input_dir, cnf_run_id):
    print("saving {}...".format(description))
    d_step = utils.get_gbd_parameter('current_decomp_step')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    estimation_yrs = [1990, 2000, 2017]  # temporary for fauxcorrect
    try:
        success_df = save_results_epi(modelable_entity_id=meid,
                                      description=description,
                                      input_dir=input_dir,
                                      measure_id=meas_ids,
                                      mark_best=True,
                                      n_draws=1000,
                                      decomp_step=d_step,
                                      gbd_round_id=gbd_id,
                                      input_file_pattern="{location_id}.h5")
    except Exception:
        # return an empty frame so the caller can detect the failed save
        success_df = pd.DataFrame()
    return (success_df)
def refine_by_cc_code(df):
    ''' Generates a 'cc_code' (measure of the remaining difference between
        cancer mortality and all-cause mortality) and drops data that are not
        credible (cancer deaths > 70% of all-cause mortality)
    '''
    uid_cols = ['country_id'] + \
        [c for c in get_uid_cols() if c not in ['acause']]
    # Set max proportion of all-cause mortality that could possibly come from cancer
    max_pct_cancer = 0.70
    print("Entries before cc_code refinement: {}".format(len(df)))
    # Calculate cc_code as the difference between total cancer and all-cause mortality
    loc_list = df['location_id'].unique().tolist()
    loc_list = [l for l in loc_list if str(l) != 'nan']
    env = load_mortality_envelope(loc_list,
                                  df['age_group_id'].unique().tolist(),
                                  df['year_id'].unique().tolist())
    # remove child causes
    deaths_df = df.loc[~df['acause'].str.contains("neo_leukemia_"), :]
    deaths_df = deaths_df.groupby(uid_cols, as_index=False).agg({
        'deaths': 'sum',
        'pop': 'mean'
    }).rename(columns={'deaths': 'cancer_deaths'})
    cc_df = deaths_df.merge(
        env, how='inner',
        on=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    cc_df['total_deaths'] = cc_df['death_rate'] * cc_df['pop']
    cc_df.loc[:, ['total_deaths', 'cancer_deaths']] = \
        cc_df[['total_deaths', 'cancer_deaths']].fillna(0)
    valid_estimates = (cc_df['cancer_deaths'] <=
                       max_pct_cancer * cc_df['total_deaths'])
    cc_df = cc_df.loc[valid_estimates, :]
    cc_df['deaths'] = cc_df['total_deaths'] - cc_df['cancer_deaths']
    cc_df['acause'] = "cc_code"
    cc_df['registry_index'] = "0.0.1"
    cc_df['NID'] = utils.get_gbd_parameter('generic_cancer_nid')
    cc_df['dataset_id'] = 3
    cc_df = cc_df.drop(['total_deaths', 'cancer_deaths', 'death_rate'],
                       axis=1)
    cc_df.drop_duplicates(inplace=True)
    # Attach cc_code data to the main dataset and return: first subset df to
    # only those uids with valid cc_code values, then append the full
    # cc_code values
    output = df.merge(cc_df[uid_cols], how='inner')
    print("Entries after cc_code refinement: {}".format(len(output)))
    output = output.append(cc_df)  # append cc_code rows
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    print("Final entries with cc_code attached: {}".format(len(output)))
    assert not output[output.duplicated(get_uid_cols())].any().any(), \
        "Duplicate entries present at end of refine_by_cc_code"
    assert not df['deaths'].isnull().any(), \
        "Mortality estimates lost while calculating cc_code"
    return (output)
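
# A worked example of the 70% rule above (illustrative numbers only): a
# stratum with 1000 all-cause deaths keeps its cancer estimates only while
# cancer deaths <= 700; if cancer deaths total 650, the generated cc_code
# entry carries the remaining 1000 - 650 = 350 deaths.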
def extract_single_nid(nid_entry):
    ''' Checks the length of the NID entry and returns its single NID when
        one valid, nonzero value is present; otherwise returns the generic
        cancer NID
    '''
    try:
        nid_entry = literal_eval(nid_entry)
    except ValueError:
        pass
    if not isinstance(nid_entry, (tuple, list)):
        nid_entry = [nid_entry]
    if len(nid_entry) == 1:
        if str(nid_entry[0]).isdigit() and str(nid_entry[0]) != '0':
            return (int(nid_entry[0]))
    return (int(utils.get_gbd_parameter('generic_cancer_nid')))
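
# Illustrative behavior of extract_single_nid (input strings hypothetical;
# the fallback value comes from the generic_cancer_nid parameter):
#
#   extract_single_nid('284465')      -> 284465
#   extract_single_nid('(284465,)')   -> 284465  # single-element tuple string
#   extract_single_nid('(111, 222)')  -> generic_cancer_nid  # ambiguous entry
#   extract_single_nid('0')           -> generic_cancer_nid  # zero is invalid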
def format_CoD_variables(df):
    ''' Updates data formats to comply with CoD specifications
    '''
    print("updating variable formats...")
    # Ensure presence of single NID column
    df.rename(columns={'NID': 'nid_input'}, inplace=True)
    df['NID'] = utils.get_gbd_parameter(
        'generic_cancer_nid')  # NOTE: long runtime
    # update age categories to reflect CoD categories
    df['age'] = df['age_group_id'] + 1
    df.loc[df['age_group_id'] >= 30, 'age'] = df['age_group_id'] - 8
    df.loc[df['age_group_id'] == 235, 'age'] = 25
    # Test output
    uid_cols = list(set(get_uid_cols()) - {'location_id'}) + ['iso3']
    assert not df[df.duplicated(uid_cols)].any().any(), \
        "Duplicate values present at end of format_CoD_variables"
    return (df)
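
# Worked examples of the age conversion above, using standard GBD age groups:
#   age_group_id 7   (10-14 years) -> age 8   (id + 1)
#   age_group_id 30  (80-84 years) -> age 22  (id - 8, for ids >= 30)
#   age_group_id 235 (95+ years)   -> age 25  (explicit override)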
def get_data_version_id_cols(launch_set_id):
    ''' Returns a dictionary with default values for columns from the
        data_version_id table
    '''
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    desc = get_cod_description()
    new_dv_entry = {'gbd_round_id': gbd_id,
                    'nid': 284465,
                    'underlying_nid': np.nan,
                    'data_type_id': 2,
                    'status_start': datetime.now(),
                    'source_id': 68,
                    'launch_set_id': launch_set_id,
                    'description': desc,
                    'status': 2,
                    'tool_type_id': 9}
    return new_dv_entry
def manage_split(source_cid, target_cids, proportion_meids, work_dir,
                 description):
    ''' Manages the split of the source_cid followed by saving of the
        targets, then returns a boolean indication of success
    '''
    utils.ensure_dir(work_dir)
    # split model
    d_step = utils.get_gbd_parameter('current_decomp_step')
    df = split_cod_model(source_cause_id=source_cid,
                         target_cause_ids=target_cids,
                         target_meids=proportion_meids,
                         output_dir=work_dir,
                         decomp_step=d_step)
    print("Split data saved to " + work_dir + " at " +
          utils.display_timestamp())
    # Generate a list of arguments (one for each child me)
    save_args_template = "--target {targ} --desc {desc} --indir {dir}"
    save_arg_list = []
    for t in target_cids:
        save_arg_list += [save_args_template.format(targ=t,
                                                    desc=description,
                                                    dir=work_dir)]
    # Start jobs
    header = description.replace(" ", "_")
    save_worker = utils.get_path("save_cod_worker", process="cancer_model")
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header=header,
                                         memory_request=50,
                                         id_list=target_cids,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()
    # Check for results
    job_descrip = description + " upload"
    success_df = cluster_tools.wait_for_results(job_dict,
                                                jobs_description=job_descrip,
                                                noisy_checker=False,
                                                max_minutes=30)
    success = cluster_tools.validate_success(success_df, description)
    return (success)
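
# For illustration, with target_cids [996, 418] and description
# "lvr_cncr_split" (as passed by split_liver below), save_arg_list expands to:
#   "--target 996 --desc lvr_cncr_split --indir <work_dir>"
#   "--target 418 --desc lvr_cncr_split --indir <work_dir>"
# where <work_dir> stands in for the timestamped output directory.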
def calc_procedure_tenplus(inc_df, proportions, acause, location_id,
                           faux_correct):
    ''' Multiplies incidence draws by the procedure proportion and the
        absolute survival proportion at 10 years to estimate the number of
        cases surviving for at least 10 years
    '''
    # Load known values
    print("    calculating the incidence of procedures with "
          "surv > ten years...")
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset().uid_cols
    type_cols = nd.get_columns('{}incidence'.format(decomp_str))
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    abs_surv_draw_cols = nd.get_columns(
        '{}absolute_survival'.format(decomp_str))
    max_estimation_year = utils.get_gbd_parameter('max_year')
    max_survival_months = nd.nonfatalDataset().max_survival_months
    # Estimate incidence of procedure
    mrg_df = inc_df.merge(proportions)
    adj_df = mrg_df[uid_cols].copy()
    num_procedures = (mrg_df[type_cols].values * mrg_df[draw_cols].values)
    adj_df[type_cols] = pd.DataFrame(num_procedures).fillna(0)
    # Estimate number of procedures resulting in survival beyond ten years
    surv_df = load_estimates('survival', acause, location_id, faux_correct)
    surv_df = surv_df.loc[surv_df['survival_month'].eq(max_survival_months),
                          uid_cols + abs_surv_draw_cols]
    adj_df = adj_df.merge(surv_df)
    pbt_df = adj_df[uid_cols].copy()
    num_procedures_10ys = adj_df[type_cols].values * \
        adj_df[abs_surv_draw_cols].values
    pbt_df[draw_cols] = pd.DataFrame(num_procedures_10ys).fillna(0)
    # Update years and age categories
    pbt_df.loc[:, 'age_group_id'] = pbt_df['age_group_id'].apply(
        add_decade_to_age)
    pbt_df.loc[:, 'year_id'] += 10
    # drop data that are now out of scope
    pbt_df = pbt_df.loc[pbt_df['year_id'] <= max_estimation_year, :]
    # For procedures whose sequelae are fractional, split; otherwise attach
    # the procedure_sequelae modelable entity
    if sequelae_fractions(acause):
        pbt_df = split_sequelae(pbt_df, acause, location_id)
    else:
        pbt_df.loc[:, 'modelable_entity_id'] = \
            nd.get_modelable_entity_id(acause, 'procedure_sequelae')
    return (pbt_df)
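
# Illustrative sketch of the ten-year shift above, assuming add_decade_to_age
# advances an age group by two five-year steps: survivors from year_id 2005,
# age_group_id 10 (25-29 years) are counted in year_id 2015, age_group_id 12
# (35-39 years); rows pushed past max_estimation_year are dropped.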
def split_liver():
    ''' Submits the liver-cancer-specific information to the split manager
    '''
    # set source and targets
    source_cid = 417  # parent cause_id
    target_cids = [996, 418, 419, 420, 421]  # cause_ids
    proportion_meids = [18763, 2470, 2471, 2472, 2473]  # proportion me_ids
    years = list(range(1980, int(utils.get_gbd_parameter("max_year")) + 1))
    description = "lvr_cncr_split"
    liver_model_path = utils.get_path('cod_splits',
                                      process='cancer_model',
                                      base_folder='workspace')
    work_dir = "{}/{}".format(liver_model_path, utils.display_timestamp())
    # Run split
    success = manage_split(source_cid, target_cids, proportion_meids,
                           work_dir, description)
    if success:
        print("All CoD liver splits uploaded. " + utils.display_timestamp())
    else:
        print("Error during CoD splits for liver cancer")
def load_mortality_envelope(location_id_list, age_group_list, year_list):
    ''' Returns the current all-cause mortality envelope
    '''
    dstep = utils.get_gbd_parameter('current_decomp_step')
    env = get_envelope(sex_id=[1, 2],
                       location_id=location_id_list,
                       year_id=year_list,
                       age_group_id=age_group_list,
                       decomp_step=dstep)
    env.rename(columns={'mean': 'envelope'}, inplace=True)
    pop = get_population(sex_id=[1, 2],
                         location_id=location_id_list,
                         year_id=year_list,
                         age_group_id=age_group_list,
                         decomp_step=dstep)
    env = env.merge(pop,
                    on=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    env['death_rate'] = env['envelope'] / env['population']
    env = env[['location_id', 'year_id', 'sex_id', 'age_group_id',
               'death_rate']]
    return (env)
def clear_prev_data_version_status(database, table):
    """Update the data_version table, closing out the previous active entry.
    """
    date = make_db_datestamp()
    can_nid = utils.get_gbd_parameter('generic_cancer_nid')
    update_query = """
        UPDATE cod.{tbl}
        SET status_end = "{dt}", status='0'
        WHERE nid={nid} AND status=1
    """
    # conn_string = cdb.create_connection_string('testcod')
    engine = get_engine(conn_def=database)
    conn = engine.connect()
    res = conn.execute(update_query.format(tbl=table, dt=date, nid=can_nid))
    conn.close()
def _check_all_floors_exist(self, nzf_df):
    ''' Check that all expected cancers, ages, and years are present and
        have nonzero floor values
    '''
    def _remove_ages_less_than(a, b):
        ''' Removes from list a any age_group_id below the start id b,
            keeping the under-5 groups (2, 3, 4) when b is 5
        '''
        orig_list = a.copy()
        for val in orig_list:
            if b == 5 and val in [2, 3, 4]:
                continue
            if val < b:
                a.remove(val)
        return a

    print("CHECKING FOR ALL CAUSES, AGES, and YEARS...")
    # create cause_list
    db_link = cdb.db_api(db_connection_name='cancer_db')
    gbd_id = utils.get_gbd_parameter('current_gbd_round')
    registry_entity = db_link.get_table('registry_input_entity')
    registry_entity = registry_entity.loc[
        registry_entity['gbd_round_id'].eq(gbd_id) &
        registry_entity['is_active'].eq(1), ]
    cancer_metadata = registry_entity[[
        'acause', 'cause_id', 'yll_age_start', 'yll_age_end'
    ]]
    causes_checklist = registry_entity['acause'].unique().tolist()
    # exceptions for nonzero floors
    for exempt_cause in ['neo_nmsc_bcc', 'neo_ben_intest', 'neo_ben_utr',
                         'neo_ben_other', 'neo_ben_brain', '_gc']:
        causes_checklist.remove(exempt_cause)
    # create year_list
    year_start = utils.get_gbd_parameter('min_year_cod')
    year_end = utils.get_gbd_parameter('max_year')  # + 1 for GBD2020
    year_checklist = list(range(year_start, year_end))
    # sex & age_id checklist
    age_id_checklist = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                        19, 20, 30, 31, 32, 235, 2, 3, 4]  # age_ids for 0-95+
    sex_checklist = [1, 2]
    # print any causes/years/sexes that are expected and missing
    for cancer in causes_checklist:
        print('working on...{}'.format(cancer))
        subset = nzf_df.loc[nzf_df['acause'].eq(cancer), ]
        age_start = int(
            cancer_metadata.loc[cancer_metadata['acause'].eq(cancer),
                                'yll_age_start'])
        # conversion from age to GBD age_group_id
        age_start = (age_start / 5) + 5
        if len(subset) == 0:
            print('MISSING CAUSE... {} '.format(cancer))
        missing_ages = list(set(age_id_checklist) -
                            set(subset['age_group_id'].unique().tolist()))
        missing_ages = _remove_ages_less_than(missing_ages, age_start)
        if len(missing_ages) > 0:
            print('missing the following ages for {}: {}'.format(
                cancer, missing_ages))
        missing_sexes = set(sex_checklist) - set(
            subset['sex_id'].unique().tolist())
        if len(missing_sexes) > 0:
            print('missing the following sexes for {}: {}'.format(
                cancer, missing_sexes))
        missing_years = set(year_checklist) - set(
            subset['year_id'].unique().tolist())
        if len(missing_years) > 0:
            print('missing the following years for {}: {}'.format(
                cancer, missing_years))
    return
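
# Illustrative conversion used above: yll_age_start is an age in years, and
# (age / 5) + 5 maps it to the matching GBD five-year age_group_id, e.g.
# yll_age_start 15 -> age_group_id 8 (15-19 years), 40 -> 13 (40-44 years).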