def add_coverage_metadata(df):
    ''' Returns the dataframe with a boolean indication of whether each
        registry covers its associated location_id

        Adds two columns, merged on 'registry_index':
            full_coverage     - renamed from registry.coverage_of_location_id
            national_registry - 1 where the registry's location is its own
                                country and full_coverage == 1, else 0
        If both columns already exist and contain no nulls, df is returned
        unchanged.
    '''
    merge_col = ['registry_index']
    metadata_cols = ['full_coverage', 'national_registry']
    assert all(c in df.columns for c in merge_col), \
        "add_coverage_metadata requires {} column(s)".format(merge_col)
    # Skip the lookup entirely when the metadata is already complete
    if (all(c in df.columns for c in metadata_cols) and
            df.loc[:, metadata_cols].notnull().all().all()):
        return (df)
    else:
        # Nulls in the merge key would silently fail to match on merge
        assert df.loc[:, merge_col].notnull().all().all(), \
            "add_coverage_metadata cannot accept nulls for {} column(s)".format(
                merge_col)
        input_len = len(df)
        # Drop any partial/stale metadata before re-attaching it fresh
        df = df.drop(labels=metadata_cols, axis=1, errors='ignore')
        reg_df = cdb.db_api().get_table("registry")
        reg_df.rename(columns={'coverage_of_location_id': 'full_coverage'},
                      inplace=True)
        # A 'national' registry is attached to a country-level location
        # (location_id == country_id) and fully covers that location
        reg_df.loc[(reg_df['location_id'] == reg_df['country_id']) &
                   reg_df['full_coverage'].isin([1]), 'national_registry'] = 1
        reg_df.loc[reg_df['national_registry'].isnull(),
                   'national_registry'] = 0
        df = df.merge(reg_df[merge_col + metadata_cols],
                      how='left', on=merge_col)
        # A left merge can only change the row count if reg_df has duplicate
        # registry_index keys — treat that as corrupt reference data
        assert len(df) == input_len, \
            "Data dropped while adding coverage metadata"
        return (df)
def is_exception(dataset_id, data_type_id):
    ''' Determines if dataset is flagged such that negative values are accepted

        Args:
            dataset_id: dataset to look up in the prep_exception table
            data_type_id: data type to look up

        Returns:
            bool, True iff an exception of prep_exception_type_id 1 with
            processing_status_id 2 exists for this dataset/data-type pair
    '''
    db_link = cdb.db_api()
    tbl = db_link.get_table("prep_exception")
    matches = tbl.loc[tbl['dataset_id'].eq(dataset_id)
                      & tbl['data_type_id'].eq(data_type_id)
                      & tbl['prep_exception_type_id'].eq(1)
                      & tbl['processing_status_id'].eq(2), :]
    # Existence check. (Previously `.any().any()`, which only worked because
    # every matched row necessarily holds truthy values in the filter
    # columns; `.empty` states the intent directly.)
    is_exception = not matches.empty
    return(is_exception)
def procedure_me_id(acause):
    ''' If the cause is associated with a tumorectomy procedure, returns the
        modelable_entity_id of that procedure. Otherwise returns None.
    '''
    entity_table = cdb.db_api('cancer_db').get_table('cnf_model_entity')
    is_procedure = (entity_table['is_active'].eq(1)
                    & entity_table['acause'].eq(acause)
                    & entity_table['me_tag'].eq('procedure_proportion'))
    matches = entity_table.loc[is_procedure, 'modelable_entity_id']
    if len(matches) == 0:
        return (None)
    # .item() enforces that exactly one active procedure entry exists
    return (matches.item())
def load_surv_folder(cnf_model_run_id):
    ''' Using the rel_survival_version_id attached to the run, returns the
        relative-survival folder path with that version's datestamp
        substituted for the "<date>" placeholder

        Args:
            cnf_model_run_id: run id used to look up the run record

        Returns:
            str, the resolved relative-survival folder path
    '''
    # Fixed: was `surv_folder = surv_folder = ...` (duplicated assignment),
    # and the docstring wrongly referenced cnf_lambda_version_id
    surv_folder = utils.get_path("relative_survival",
                                 process="nonfatal_model")
    record = nd.get_run_record(cnf_model_run_id)
    rs_version = record.at[0, 'rel_survival_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='rel_survival_version',
                                     uniq_col='rel_survival_version_id',
                                     val=rs_version)
    # date_updated provides the datestamp suffix embedded in the path
    suffix = str(this_version.at[0, 'date_updated'])
    rs_folder = surv_folder.replace("<date>", suffix)
    return (rs_folder)
def load_lambda_file(cnf_model_run_id):
    ''' Using the cnf_lambda_version_id, returns the datestamp suffix of
        that version
    '''
    path_template = utils.get_path("lambda_values", process="nonfatal_model")
    run_record = nd.get_run_record(cnf_model_run_id)
    lambda_version = run_record.at[0, 'cnf_lambda_version_id']
    version_entry = cdb.db_api().get_entry(
        table_name='cnf_lambda_version',
        uniq_col='cnf_lambda_version_id',
        val=lambda_version)
    # The version's date_updated value fills the "<date>" placeholder
    datestamp = str(version_entry.at[0, 'date_updated'])
    return (path_template.replace("<date>", datestamp))
def _add_ihme_pop_marker(df):
    ''' Returns the dataframe with an added 'ihme_pop_ok' column indicating
        whether ihme population estimates may be merged with the uid

        A uid is marked ok (ihme_pop_ok = 1) when either:
            - its dataset's can_use_ihme_pop flag is 1, or
            - it is in the top SDI quintile (5) and its registry fully
              covers its location (full_coverage == 1)
    '''
    # Ensure the columns this marker depends on are present
    # (fixed idiom: was `if not 'x' in df.columns`)
    if 'sdi_quintile' not in df.columns:
        df = modeled_locations.add_sdi_quintile(df)
    if 'full_coverage' not in df.columns:
        df = add_coverage_metadata(df)
    ds_df = cdb.db_api().get_table("dataset")
    df.loc[:, 'ihme_pop_ok'] = 0
    # Propagate each dataset's can_use_ihme_pop flag onto its rows
    for dsid in df['dataset_id'].unique():
        pop_ok = ds_df.loc[ds_df['dataset_id'] == dsid,
                           'can_use_ihme_pop'].values[0]
        if pop_ok == 1:
            df.loc[df['dataset_id'] == dsid, 'ihme_pop_ok'] = pop_ok
    # High-SDI, fully-covered locations may use ihme population regardless
    # of the dataset-level flag
    ihme_pop_ok = (df['sdi_quintile'].isin([5]) &
                   df['full_coverage'].isin([1]))
    df.loc[ihme_pop_ok, 'ihme_pop_ok'] = 1
    return (df)
def load_durations(acause):
    ''' Returns sequela durations for acause, relabeled with acause itself

        Causes without their own entries in the sequela_durations table are
        mapped to a proxy cause before lookup.

        Returns:
            DataFrame with columns ['acause', 'me_tag', 'sequela_duration']
    '''
    db_link = cdb.db_api('cancer_db')
    # Map causes lacking their own durations to a proxy cause.
    # Fixed: was `acause[:8] == "neo_liver_"`, comparing an 8-character
    # slice against a 10-character literal — the branch never matched.
    if acause.startswith("neo_liver_"):
        sequelae_cause = "neo_liver"
    elif acause == "neo_leukemia_other":
        sequelae_cause = "neo_leukemia_ll_chronic"
    elif acause == "neo_nmsc":
        sequelae_cause = "neo_nmsc_scc"
    elif acause == "neo_other_cancer":
        sequelae_cause = "neo_other"
    else:
        sequelae_cause = acause
    sq_df = db_link.get_table('sequela_durations')
    # .copy() avoids chained-assignment on a view of sq_df below
    this_sq = sq_df.loc[sq_df['acause'] == sequelae_cause, :].copy()
    this_sq.loc[:, 'acause'] = acause
    # Check length first: notnull().all() is vacuously True on an empty frame
    assert len(this_sq) > 0, "Error loading sequela durations"
    assert this_sq['sequela_duration'].notnull().all(), \
        "error loading sequela durations"
    return(this_sq[['acause', 'me_tag', 'sequela_duration']])
def sequelae_fractions(acause):
    ''' Defines fractions from lit review to be used when splitting sequela
    '''
    # Fractions of the treated population who primarily develop each
    # disabling outcome
    incontinence_pct = 0.18  # pct. who primarily develop incontinence
    impotence_pct = 0.55     # pct. who primarily develop impotence
    fractions = {
        'neo_prostate': {
            # Fractions used to calculate the controlled phase
            18781: {'fraction': impotence_pct},     # with impotence
            18782: {'fraction': incontinence_pct},  # with incontinence
            # Fractions used to calculate the metrics of sequela beyond
            # ten years
            18784: {'fraction': impotence_pct},
            18785: {'fraction': incontinence_pct},
        }
    }
    # Attach me_tags (enables later linking of data to modelable_entity_id);
    # inactive entities are dropped. Iterate over a copied key list so the
    # dict may be mutated safely.
    me_tbl = cdb.db_api().get_table("cnf_model_entity")
    for me_id in list(fractions['neo_prostate'].keys()):
        entity_row = me_tbl.loc[me_tbl['modelable_entity_id'].eq(me_id), :]
        if entity_row['is_active'].item() == 0:
            del fractions['neo_prostate'][me_id]
        else:
            fractions['neo_prostate'][me_id]['me_tag'] = \
                entity_row['me_tag'].item()
    if acause in fractions.keys():
        return (fractions[acause])
    return (False)
def add_representativeness(df):
    ''' Returns the dataframe with Added 'representative' tag column
        indicating whether data are representative of their attached
        location_id
    '''
    def _avg_repness(regs, rep_table):
        ''' Iff all registries are representative, returns 1. Else returns 0 '''
        try:
            # Normalize regs to a list of registry_index values; it may
            # arrive as a tuple/list or as a string representation of one
            if not isinstance(regs, (tuple, list)):
                try:
                    regs = list(literal_eval(regs))
                except Exception:
                    # Not a parseable literal; assume it is already iterable
                    regs = list(regs)
            rep = rep_table.loc[rep_table['registry_index'].isin(regs),
                                'representative_of_location_id']
            if len(rep) == 0:
                return (0)
            else:
                # min() == 1 only if every matched registry is representative
                return (rep.min())
        except Exception:
            # Best-effort: anything unresolvable counts as not representative.
            # (Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt.)
            return (0)

    print("adding representative status...")
    db_link = cdb.db_api("cancer_db")
    # Add representative status based on the input registries
    rep_status = db_link.get_table("registry")[[
        'registry_index', 'representative_of_location_id'
    ]]
    # Compute repness once per unique registry_index, then merge back
    rep_df = pd.DataFrame(
        {'registry_index': df['registry_index'].unique().tolist()})
    get_repness = partial(_avg_repness, rep_table=rep_status)
    rep_df.loc[:, 'representative'] = \
        rep_df['registry_index'].apply(get_repness)
    output = df.merge(rep_df, on='registry_index')
    output = update_repness(output)
    assert len(output) == len(
        df), "add_representativeness is adding or deleting data"
    return (output)