def im_draw(df, draw_num, surv_uids):
    ''' Returns the dataframe with estimate of absolute survival for the
        requested draw_num

        Args:
            df - dataframe with uid columns plus the absolute-survival column
            draw_num - integer index of the draw being processed (used only
                for the error message)
            surv_uids - uid columns of the survival dataset (sort order)
    '''
    # Subset to only the necessary data
    max_surv = nd.nonfatalDataset().max_survival_months
    draw_uids = nd.nonfatalDataset().uid_cols
    abs_surv_col = nd.get_columns("absolute_survival")
    increm_mort_col = nd.get_columns("incremental_mortality")
    # Calculate incremental mortality, the number of people who have lived
    #   with the disease for each period (those who die in year one
    #   had the disease for only a year)
    df[increm_mort_col] = df.sort_values(surv_uids).groupby(
        draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
    # Calculate the number of people surviving with the disease at and
    #   beyond the maximum year
    mort_total = df[df['survival_month'] != max_surv
                    ].groupby(draw_uids, as_index=False
                              )[increm_mort_col].agg(np.sum
                                                     ).rename(columns={increm_mort_col: 'total_mort'})
    df = df.merge(mort_total)
    # Recompute the mask on the merged frame: merge resets the index, so a
    #   mask built on the pre-merge frame could misalign
    at_max_surv_months = (df['survival_month'] == max_surv)
    df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
    # test and return
    # Bug fix: the original assert message referenced an undefined name `i`,
    #   which raised NameError instead of the intended AssertionError
    assert not df.isnull().any().any(), \
        "Error in im_draw {}".format(draw_num)
    return(df.loc[:, surv_uids + [increm_mort_col]])
def generate_estimates(acause, location_id):
    ''' Applies procedure adjustments where necessary, then saves separate
        outputs by measure and cancer phase

        Args:
            acause - GBD cause name being processed
            location_id - GBD location for which estimates are generated

        Returns: True on successful completion (also drops a "finalized"
            marker file so downstream steps can detect success)
    '''
    print("Begin final adjustments...")
    inc_df = load_estimates('incidence', acause, location_id)
    prev_input = load_estimates('prevalence', acause, location_id)
    prev_df = calc_total_prevalence(prev_input,
                                    uid_cols=nd.nonfatalDataset(
                                        'prevalence', acause).uid_cols)
    pr_id = procedure_me_id(acause)
    # Adjust for procedures only when the cause has a procedure ME id
    if pr_id is not None:
        prop_df = load_procedure_proportions(pr_id, location_id)
        prev_df = apply_procdedure_proportions(prev_df, prop_df,
                                               acause, 'prevalence')
        proc_data = calc_procedure_tenplus(inc_df, prop_df,
                                           acause, location_id)
        save_procedure_inputs(proc_data, acause, location_id)
    # Save results; previously these two calls were duplicated verbatim in
    #   both branches of the if/else
    save_model_results(inc_df, 'incidence', acause)
    save_model_results(prev_df, 'prevalence', acause)
    success_file = nd.nonfatalDataset(
        'final_results', acause).get_output_file("finalized_" + str(location_id))
    # Touch the marker file to record completion
    open(success_file, 'a').close()
    print(str(success_file) + " saved.")
    return (True)
def load_incidence(acause, location_id):
    ''' Returns incidence estimation subset required for prevalence estimation

        Args:
            acause - GBD cause name
            location_id - GBD location id

        Returns: dataframe of uid columns plus incidence draw columns
    '''
    uid_cols = nd.nonfatalDataset().uid_cols
    inc_cols = nd.get_columns("incidence")
    input_file = nd.nonfatalDataset(
        "incidence", acause).get_output_file(location_id)
    # Subset once; the original subset the frame twice with the same list
    inc_data = pd.read_csv(input_file)
    return(inc_data[uid_cols + inc_cols])
def load_survival(acause, location_id):
    ''' Returns survival estimation subset required for prevalence estimation

        Args:
            acause - GBD cause name
            location_id - GBD location id

        Returns: dataframe of uid columns plus the absolute-survival column
    '''
    # Construct the dataset descriptor once and reuse it; the original built
    #   nd.nonfatalDataset("survival", acause) twice
    this_dataset = nd.nonfatalDataset("survival", acause)
    uid_cols = this_dataset.uid_cols
    abs_surv_col = nd.get_columns("absolute_survival")
    input_file = this_dataset.get_output_file(location_id)
    surv_data = pd.read_csv(input_file)
    return(surv_data[uid_cols + [abs_surv_col]])
def save_procedure_inputs(df, acause, location_id):
    ''' Formats and saves procedure data for upload into the epi database

        Args:
            df - procedure estimates containing uid columns, draw columns,
                and a 'modelable_entity_id' column
            acause - GBD cause name
            location_id - GBD location id

        Side effects: writes one "dismod_inputs" output per modelable entity
            via nd.save_outputs
    '''
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    epi_estimate_cols = ['mean', 'lower', 'upper']
    data = df.loc[:, uid_cols + draw_cols].copy()
    # apply formatting
    # Collapse detailed oldest-age groups (33, 44, 301) into 95+ (235)
    #   before aggregating draws
    data.loc[df['age_group_id'].isin([33, 44, 301]), 'age_group_id'] = 235
    data = dft.collapse(data, by_cols=uid_cols, stub='draw')
    # Convert draws to epi-upload format (mean/lower/upper) and to rate space
    epi_df = epi_upload.format_draws_data(data)
    epi_df = epi_upload.convert_to_rate(epi_df, epi_estimate_cols, location_id)
    # Add metadata
    epi_df['measure'] = 'incidence'
    epi_df['unit_type'] = "Person*year"
    epi_df['extractor'] = getuser()
    epi_df['location_id'] = location_id
    # Finalize and export, one upload per modelable entity
    for me_id in epi_df['modelable_entity_id'].unique():
        print("me_id " + str(me_id) + " sequela split")
        # NOTE(review): me_table is reloaded on every loop iteration;
        #   presumably cheap, but could be hoisted — confirm
        me_table = nd.load_me_table()
        bundle_id = int(me_table.loc[me_table['modelable_entity_id'].eq(me_id),
                                     'bundle_id'].item())
        this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
        this_output = epi_upload.EpiUploadDataframe(this_output).data
        # Save output without testing (epi formatter has already tested data per
        # epi specs)
        # add location_id to enable save_outputs
        this_output['location_id'] = location_id
        nd.save_outputs("dismod_inputs", this_output, acause,
                        bundle_id, skip_testing=True)
def save_model_results(df, metric_name, acause):
    ''' Saves a separate output file for each me_tag in the dataframe

        Args:
            df - estimates with uid columns, metric data columns, and (for
                prevalence) an 'me_tag' column
            metric_name - "incidence" or "prevalence"
            acause - GBD cause name

        Raises: ValueError for an unsupported metric_name
    '''
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns(metric_name)
    draw_cols = nd.get_columns("draw_cols")
    if metric_name == "incidence":
        measure_id = utils.get_gbd_parameter('incidence_measure_id')
        # Incidence has a single phase; tag it so the loop below is uniform
        df.loc[:, 'me_tag'] = 'primary_phase'
    elif metric_name == "prevalence":
        measure_id = utils.get_gbd_parameter('prevalence_measure_id')
    else:
        # Bug fix: an unrecognized metric previously fell through and raised
        #   NameError on `measure_id` inside the loop below
        raise ValueError("Unsupported metric_name: {}".format(metric_name))
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        # Skip tags that have no modelable entity for this cause
        if me_id is None:
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        output_data = df.loc[df['me_tag'].eq(this_tag), uid_cols + data_cols]
        # Rename the metric data columns to the generic draw column names
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        nd.save_outputs(
            "final_results",
            output_data,
            acause,
            me_id,
            measure_id,
        )
def generate_estimates(acause, location_id):
    ''' Runs the prevalence estimation pipeline

        Args:
            acause - GBD cause name
            location_id - GBD location id

        Side effects: saves prevalence outputs via nd.save_outputs
    '''
    # Removed unused local: the original computed an `output_file` path from
    #   nd.nonfatalDataset("prevalence", acause) but never used it
    print("Begin prevalence estimation...")
    surv_df = load_survival(acause, location_id)
    mort_df = calc_mortality(surv_df, acause, location_id)
    sequela_framework = load_sequela_framework(surv_df, acause)
    prev_df = calc_prevalence(sequela_framework, mort_df, acause)
    nd.save_outputs("prevalence", prev_df, acause)
def calc_prevalence(sequela_framework, mort_df, acause):
    ''' Returns prevalence estimates calculated as
        sequela_duration * mortality, collapsed to the prevalence uids and
        converted from person-months to person-years

        Args:
            sequela_framework - dataframe of sequela durations by uid/me_tag
            mort_df - mortality estimates by uid
            acause - GBD cause name
    '''
    print(" calculating prevalence...")
    prev_cols = nd.get_columns('prevalence')
    mort_cols = nd.get_columns('mortality')
    surv_uids = nd.nonfatalDataset("survival", acause).uid_cols
    prev_uids = nd.nonfatalDataset("prevalence", acause).uid_cols
    # Create the prevalence estimation frame from the survival and mortality
    #   frames (.copy() avoids pandas SettingWithCopy on the slice)
    mrg_df = pd.merge(sequela_framework, mort_df)
    df = mrg_df[surv_uids + ['me_tag']].copy()
    # Calculate prevalence of each sequela by multiplying sequela duration
    #   by the number of people surviving for only that duration
    df[prev_cols] = mrg_df[mort_cols].mul(mrg_df['sequela_duration'], axis=0)
    df = dft.collapse(df, combine_cols=prev_cols,
                      by_cols=prev_uids, func='sum')
    df.loc[:, prev_cols] = df[prev_cols] / 12  # convert to years
    # Bug fix: the original assert message was copy-pasted from im_draw and
    #   referenced an undefined name `i`, raising NameError on failure
    assert not df.isnull().any().any(), \
        "Error calculating prevalence for {}".format(acause)
    return(df)
def create_estimation_frame(acause, location_id, cnf_model_run_id):
    ''' Returns a dataframe containing the ages and covariates used to estimate
        survival and incremental mortality
    '''
    print(" creating estimation frame...")
    ds = nd.nonfatalDataset()
    max_surv = ds.max_survival_months
    uid_cols = ds.uid_cols
    keep_ages = [*range(1, 21), *range(30, 34), 235]
    # Load the relative survival curve, convert years to months, and trim
    #   to the estimation window
    rel_surv = load_rel_surv_values(acause, location_id, cnf_model_run_id)
    rel_surv['survival_month'] = rel_surv['survival_year'] * 12
    rel_surv = rel_surv[rel_surv['survival_month'] <= max_surv]
    # Attach lambda covariate values, then restrict to the modeled age groups
    lambda_vals = load_lambda_values(location_id, cnf_model_run_id)
    frame = rel_surv.merge(lambda_vals[uid_cols + ['lambda']])
    frame = frame[frame['age_group_id'].isin(keep_ages)]
    frame['lambda_years'] = frame['lambda'] * frame['survival_year']
    return (frame)
def calc_procedure_tenplus(inc_df, proportions, acause, location_id): ''' Multiplies incidence draws by the procedure proportion and the absolute survival proportion at 10 years to estimate the number of cases surviving for at least 10 years ''' # Load known values print( " calculating the incidence of procedures with surv > ten years...") uid_cols = nd.nonfatalDataset().uid_cols type_cols = nd.get_columns('incidence') draw_cols = nd.get_columns("draw_cols") abs_surv = [nd.get_columns("absolute_survival")] max_estimation_year = utils.get_gbd_parameter('max_year') max_survival_months = nd.nonfatalDataset().max_survival_months # Estimate incidence of procedure mrg_df = inc_df.merge(proportions) adj_df = mrg_df[uid_cols] num_procedures = (mrg_df[type_cols].values * mrg_df[draw_cols].values) adj_df[type_cols] = pd.DataFrame(num_procedures).fillna(0) # Estimate number of procedures resulting in survival beyond ten years surv_df = load_estimates('survival', acause, location_id) surv_df = surv_df.loc[surv_df['survival_month'].eq(max_survival_months), uid_cols + abs_surv] adj_df = adj_df.merge(surv_df) pbt_df = adj_df[uid_cols] num_procedures_10ys = adj_df[type_cols].values * \ adj_df[abs_surv].values pbt_df[draw_cols] = pd.DataFrame(num_procedures_10ys).fillna(0) # Update years and age categories pbt_df.loc[:, 'age_group_id'] = pbt_df['age_group_id'].apply( add_decade_to_age) pbt_df.loc[:, 'year_id'] += 10 # drop data that are now out of scope pbt_df = pbt_df.loc[pbt_df['year_id'] <= max_estimation_year, :] # For procedures whose sequelae are fractional, if sequelae_fractions(acause): pbt_df = split_sequelae(pbt_df, acause, location_id) else: pbt_df.loc[:, 'modelable_entity_id'] = \ nd.get_modelable_entity_id(acause, 'procedure_sequelae') return (pbt_df)
def load_estimates(metric_name, acause, location_id):
    ''' Loads previously-generated estimates per the metric_name '''
    this_step = nd.nonfatalDataset(metric_name, acause)
    uid_cols = this_step.uid_cols
    # Survival stores a single absolute-survival column; other metrics
    #   declare their own column list
    if metric_name == "survival":
        type_cols = [nd.get_columns("absolute_survival")]
    else:
        type_cols = nd.get_columns(metric_name)
    input_data = pd.read_csv(this_step.get_output_file(location_id))
    return (input_data[uid_cols + type_cols])
def load_rel_surv_values(acause, location_id, cnf_model_run_id):
    ''' Loads and returns survival best-case/worst-case estimations for the
        given acause

        Args:
            acause - GBD cause name
            location_id - GBD location id
            cnf_model_run_id - run id used to locate the survival input folder

        Returns: long-format dataframe of relative survival by uid and
            survival_year (0..N), validated to contain valid proportions
    '''
    print(" loading survival...")
    uid_cols = nd.nonfatalDataset().uid_cols
    rel_surv_col = nd.get_columns("relative_survival")
    # Causes restricted to a single sex (1 = male, 2 = female)
    sex_restrictions = {
        'neo_prostate': 1,
        'neo_testicular': 1,
        'neo_cervical': 2,
        'neo_ovarian': 2,
        'neo_uterine': 2
    }
    # Load specific input based on run_id
    surv_folder = load_surv_folder(cnf_model_run_id)
    input_file = "{}/{}/{}.csv".format(surv_folder, acause, location_id)
    # import and update names
    this_surv = pd.read_csv(input_file)
    this_surv.rename(columns={
        'year': 'year_id',
        'sex': 'sex_id'
    }, inplace=True)
    # Add 'year 0' survival equal to 1 (no time has passed through which to survive)
    this_surv['scaled_0year'] = 1
    # Subset by sex
    if acause in sex_restrictions.keys():
        this_surv = this_surv.loc[this_surv['sex_id'] ==
                                  sex_restrictions[acause], :]
    # Reshape and rename columns: wide 'scaled_<N>year' columns become a
    #   long 'survival_year' index
    this_surv = dft.wide_to_long(this_surv,
                                 stubnames='scaled_',
                                 i=uid_cols,
                                 j=['survival_year'],
                                 drop_others=True)
    # Drop the non-numeric restriction flag column produced by the reshape
    this_surv = this_surv.loc[
        this_surv['survival_year'] != '10year_restrict', :]
    this_surv.loc[:, 'survival_year'] = this_surv['survival_year'].str.replace(
        'year', '').astype(int)
    this_surv.rename(columns={'scaled_': rel_surv_col}, inplace=True)
    # extend age groups if not present
    this_surv = _fix_survival_ages(this_surv)
    # Test and return
    assert not this_surv.isnull().any().any(), \
        "Null values found in relative survival input after formatting"
    pe.validate_proportions(this_surv[rel_surv_col])
    return (this_surv)
def calc_increm_mort(surv_df, acause, location_id):
    ''' Returns a dataframe of incremental survival estimates by uid

        Args:
            surv_df - absolute survival estimates by uid
            acause - GBD cause name
            location_id - GBD location id (unused here but kept for a
                signature consistent with the other calc_* steps)
    '''
    def im_draw(df, draw_num, surv_uids):
        ''' Returns the dataframe with estimate of absolute survival for the
            requested draw_num
        '''
        # Subset to only the necessary data
        max_surv = nd.nonfatalDataset().max_survival_months
        draw_uids = nd.nonfatalDataset().uid_cols
        abs_surv_col = nd.get_columns("absolute_survival")
        increm_mort_col = nd.get_columns("incremental_mortality")
        # Calculate incremental mortality, the number of people who have lived
        #   with the disease for each period (those who die in year one
        #   had the disease for only a year)
        df[increm_mort_col] = df.sort_values(surv_uids).groupby(
            draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
        # Calculate the number of people surviving with the disease at and
        #   beyond the maximum year
        at_max_surv_months = (df['survival_month'] == max_surv)
        mort_total = df[~at_max_surv_months
                        ].groupby(draw_uids, as_index=False
                                  )[increm_mort_col].agg(np.sum
                                                         ).rename(columns={increm_mort_col: 'total_mort'})
        df = df.merge(mort_total)
        df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
        # test and return
        # Bug fix: the assert message previously referenced the enclosing
        #   loop variable `i` via closure — fragile and a NameError if the
        #   helper is ever called outside the loop; use draw_num instead
        assert not df.isnull().any().any(), \
            "Error in im_draw {}".format(draw_num)
        return(df.loc[:, surv_uids + [increm_mort_col]])

    # Generate incremental mortality draws
    output_uids = nd.nonfatalDataset("survival", acause).uid_cols
    abs_surv_cols = [nd.get_columns("absolute_survival")]
    incr_mort_cols = [nd.get_columns("incremental_mortality")]
    output_df = surv_df.loc[:, output_uids]
    print(" estimating incremental mortality proportion...")
    # Note: this section remains written with a loop to facilitate future
    #   processing of absolute survival draws
    for i, as_col in enumerate(abs_surv_cols):
        this_draw = im_draw(df=surv_df.loc[:, output_uids + [as_col]],
                            draw_num=i,
                            surv_uids=output_uids)
        output_df = output_df.merge(this_draw, on=output_uids)
    return(output_df[output_uids + incr_mort_cols])
def calc_mortality(surv_df, acause, location_id):
    ''' Calculate mortality, the number of people who die of the cause during
        the interval (year), where
        mort = incremental_mortality_proportion * incidence.
        Returns a dataframe of mortality by uid

        Args:
            surv_df - absolute survival estimates by uid
            acause - GBD cause name
            location_id - GBD location id
    '''
    print(" estimating absolute mortality...")
    uid_cols = nd.nonfatalDataset("survival", acause).uid_cols
    inc_cols = nd.get_columns("incidence")
    incr_mort_cols = [nd.get_columns('incremental_mortality')]
    mort_cols = nd.get_columns('mortality')
    incr_mort_df = calc_increm_mort(surv_df, acause, location_id)
    inc_df = load_incidence(acause, location_id)
    mrg_df = incr_mort_df.merge(inc_df)
    df = mrg_df[uid_cols]
    # Elementwise product of incidence draws and incremental-mortality
    #   proportions (via .values, so pairing is positional on mrg_df rows)
    df[mort_cols] = \
        pd.DataFrame(mrg_df[inc_cols].values * mrg_df[incr_mort_cols].values)
    # Merge the incremental-mortality columns back so the output carries
    #   both mortality and incremental-mortality values
    df = df.merge(incr_mort_df)
    return(df)
def split_sequelae(df, acause, location_id):
    ''' Splits estimates into sequela based on proportions from literature

        Args:
            df - draw estimates to be split
            acause - GBD cause name
            location_id - unused; kept for call-signature compatibility

        Returns: df expanded by modelable_entity_id with draws scaled by
            each sequela fraction
    '''
    print(" splitting sequelae...")
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    # Generate dataframe containing the procedure_sequelae fractions
    fracs = pd.DataFrame().from_dict(
        sequelae_fractions(acause), orient='index').reset_index().rename(
            columns={'index': 'modelable_entity_id'})
    fracs = fracs[fracs['me_tag'].eq("procedure_sequelae")]
    fracs['acause'] = acause
    # Merge dataframe with data
    # Bug fix: work on a copy so the caller's frame does not silently gain
    #   an 'acause' column as a side effect
    df = df.copy()
    df['acause'] = acause
    split_df = df.merge(fracs)
    split_df[draw_cols] = split_df[draw_cols].multiply(split_df['fraction'],
                                                       axis='index')
    assert split_df[draw_cols].notnull().all().all(), "Nulls in split sequelae"
    return (split_df)
def main(meid, desc, indir, run_id, meas_id):
    ''' Loads meid information from the cancer database and uses it to run the
        save_results function

        Args:
            meid - modelable entity id being uploaded
            desc - model description string
            indir - directory containing the draws to upload
            run_id - cnf model run id
            meas_id - measure id(s) passed to save_worker

        Returns: True on a successful, recorded upload; False otherwise
    '''
    this_step = nd.nonfatalDataset("split", meid)
    success_file = this_step.get_output_file('upload')
    # Bug fix: the original format string had three placeholders for four
    #   arguments, silently dropping run_id from the message
    print("Working on {} ({}) in {} (run_id {})".format(
        meid, desc, indir, run_id))
    success_df = save_worker(meid=meid,
                             meas_ids=meas_id,
                             description=desc,
                             input_dir=indir,
                             cnf_run_id=run_id)
    # Validate save and preserve record if successful.
    # Bug fix: check the type before len() so a non-sized return (e.g. None)
    #   does not raise TypeError
    if isinstance(success_df, pd.DataFrame) and (len(success_df) > 0):
        if 'model_version_id' in success_df.columns:
            model_id = success_df.at[0, 'model_version_id']
            epi_upload.update_upload_record(
                meid, run_id, model_id,
                cancer_model_type="split_custom_epi")
        success_df.to_csv(success_file, index=False)
        return(True)
    else:
        print("Error during split")
        return(False)
def load_sequela_framework(surv_df, acause):
    ''' Adjust sequela duration based on survival

        First adjust incremental sequela duration (including controlled) to
        equal total months from diagnosis (at midyear) to death (at end year).
        This is the total amount of time someone may experience any of the
        sequela (from diagnosis to death, separated out by the amount of time
        they are living with the cancer)
        Then iteratively adjust duration of each sequela so time lived with
        cancer is equal to the sum of all sequela durations
            First zero-out metastatic_phase and terminal_phase for events that
                occur at the maximum survival duration.
            - The terminal phase is set and not adjusted
            - The most flexible phase is controlled: Adjust controlled time to
                equal the difference between incremental_duration sequela
                duration and the duration of each of the other sequela
            - The next most flexible time is primary diagnosis and treatment
            - Finally we can adjust the metastatic time if the totals still do
                not add up
    '''
    def adjust_duration(df, stage, uid_cols):
        ''' Sets the duration of `stage` to the remainder of
            incremental_duration after subtracting the other stages'
            durations, floored at zero
        '''
        sd_col = 'sequela_duration'
        input_cols = df.columns.tolist()
        this_phase = (df['me_tag'] == stage)
        # Total duration of every stage other than the one being adjusted
        df = df.merge(df[~this_phase].groupby(uid_cols, as_index=False)[
            sd_col].sum().rename(columns={sd_col: 'tot_dur'}))
        df.loc[this_phase, sd_col] = df['incremental_duration'] - df['tot_dur']
        # Durations cannot be negative
        df.loc[this_phase & (df[sd_col] <= 0), sd_col] = 0
        assert not df.duplicated(uid_cols+['me_tag']).any(), \
            "ERROR: error when calculating sequela_durations for {} stage".format(
                stage)
        return(df[input_cols])
    #
    print(" creating sequela framework...")
    nf_ds = nd.nonfatalDataset("survival", acause)
    uid_cols = nf_ds.uid_cols
    max_survival_months = nf_ds.max_survival_months
    seq_dur = load_durations(acause)
    # Add sequela durations
    # NOTE(review): this adds an 'acause' column to the caller's surv_df
    #   in place — presumably intentional; confirm
    surv_df.loc[:, 'acause'] = acause
    df = pd.merge(surv_df[uid_cols + ['acause']], seq_dur, on='acause')
    df.loc[:, 'raw_sequela_duration'] = df['sequela_duration']
    # Diagnosis at midyear (+6 months) through death at end of year
    df.loc[:, 'incremental_duration'] = df['survival_month'] + 6
    # Set the 'beyond maximum' survival years to the duration of survival
    #   for the final period
    end_of_period = (df['survival_month'].eq(max_survival_months))
    df.loc[end_of_period, 'incremental_duration'] = max_survival_months - 6
    # Set late-phase duration to 0 at the end of the survival period
    #   (if someone survives beyond the maximum duration, they are treated
    #   as 'survivors', so there are no terminal or metastatic phases)
    late_phase = (df['me_tag'].isin(["terminal_phase", "metastatic_phase"]))
    end_of_period = df['survival_month'].eq(max_survival_months)
    df.loc[late_phase & end_of_period, 'sequela_duration'] = 0
    # Iteratively adjust sequela duration (see docstring for explanation)
    for stage in ['controlled_phase', "primary_phase", "metastatic_phase"]:
        df = adjust_duration(df, stage, uid_cols)
    assert df['incremental_duration'].notnull().all(), \
        "error calculating sequela durations"
    return(df)
def apply_procdedure_proportions(df, proportions, acause, metric_name):
    ''' Multiplies estimates by procedure proportions, adding to the dataframe
        a set of estimates for the number of cancer events that do not recieve
        the given procedure

        Args:
            df - estimates by uid (with 'me_tag' for prevalence)
            proportions - procedure-proportion draws by uid
            acause - GBD cause name
            metric_name - "prevalence" or "incidence"

        Returns: adjusted estimates; for prevalence the unadjusted phases are
            retained alongside the adjusted ones
    '''
    print(" adjusting to avoid double-counting procedures...")
    # Return if adjustment is unnecessary (if there is no rate id for the cause)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    draw_cols = nd.get_columns("draw_cols")
    type_cols = nd.get_columns(metric_name)
    mrg_cols = [c for c in uid_cols if c != 'me_tag']
    # Subset estimates to the phase wherein procedures occur
    if metric_name == 'prevalence':
        mrg_df = df.loc[df['me_tag'] == "controlled_phase", :].copy()
        del mrg_df['me_tag']
    elif metric_name == 'incidence':
        mrg_df = df.copy()
    # For data where sequela are a fraction of the number of procedures,
    #   multiply the procedure proportion by those fractions
    if metric_name == 'prevalence' and bool(sequelae_fractions(acause)):
        # Generate dataframe containing the fractions
        fracs = pd.DataFrame().from_dict(sequelae_fractions(acause),
                                         orient='index')
        fracs['acause'] = acause
        fracs = fracs[~fracs['me_tag'].eq("procedure_sequelae")]
        # Merge dataframe with proportions to expand.
        # Bug fix: copy first so the caller's `proportions` frame is not
        #   mutated by the 'acause' column addition
        proportions = proportions.copy()
        proportions['acause'] = acause
        props = proportions.merge(fracs)
        # Adjust proportions by me
        props[draw_cols] = props[draw_cols].multiply(props['fraction'],
                                                     axis='index')
        del props['acause']
    else:
        # Determine fraction of population that does not recieve the procedure
        props = proportions.copy()
        props['me_tag'] = "adjusted_controlled_phase_a"
    # Apply proportions to estimates
    # Note: may drop some data if proportions are only for estimation years
    mrg_df = mrg_df.merge(props, on=mrg_cols, how='inner')
    adj_df = mrg_df[uid_cols].copy()  # .copy() avoids SettingWithCopy
    evnt_wo_proc = pd.DataFrame(mrg_df[type_cols].values *
                                mrg_df[draw_cols].values).fillna(0)
    evnt_wo_proc.columns = type_cols
    adj_df[type_cols] = evnt_wo_proc
    assert not adj_df.isnull().any().any(
    ), "Error calculating procedure proportions"
    # For prevalence, append the adjusted data to the rest of the estimates
    if metric_name == 'prevalence':
        sq_df = dft.collapse(adj_df, mrg_cols,
                             combine_cols=type_cols).sort_values(mrg_cols)
        cntrl_df = df.loc[df['me_tag'].eq("controlled_phase"), :].merge(
            mrg_df[mrg_cols].drop_duplicates(),
            on=mrg_cols, how='inner').sort_values(mrg_cols)
        nosq_df = cntrl_df[mrg_cols].copy()
        # NOTE(review): this subtraction pairs rows positionally via .values
        #   while the later column assignment aligns by index — looks fragile
        #   if sort_values leaves a non-monotonic index; confirm upstream
        #   indices are clean
        no_proc = pd.DataFrame(cntrl_df[type_cols].values -
                               sq_df[type_cols].values)
        no_proc.columns = type_cols
        nosq_df[type_cols] = no_proc
        nosq_df['me_tag'] = "adjusted_controlled_phase"
        # pd.concat replaces DataFrame.append (deprecated in pandas 1.4,
        #   removed in 2.0); row-stacking behavior is identical
        adj_df = pd.concat([adj_df, nosq_df])
        output_data = pd.concat([df, adj_df])
    # Incidence of cancers with the procedure is estimated elsewhere, so there
    #   is no need to preserve the unadjusted data
    else:
        output_data = adj_df
    return (output_data[uid_cols + type_cols])