def im_draw(df, draw_num, surv_uids):
    ''' Returns the incremental mortality proportion calculated from one set
            of absolute survival estimates

        Args:
            df: dataframe containing the columns in surv_uids, the
                'survival_month' column and the absolute survival column
            draw_num: identifier of the draw being processed. Used only to
                label the error message
            surv_uids: list of columns used to order the data before
                differencing and to subset the output
                (presumably includes 'survival_month'; verify against caller)

        Returns: a new dataframe with the surv_uids columns plus the
            incremental mortality column. The input dataframe is not modified

        Raises: AssertionError if the result contains any null value
    '''
    max_surv = nd.nonfatalDataset().max_survival_months
    draw_uids = nd.nonfatalDataset().uid_cols
    abs_surv_col = nd.get_columns("absolute_survival")
    increm_mort_col = nd.get_columns("incremental_mortality")
    # Work on a copy so that adding columns never writes into (or warns about)
    #   a slice of the caller's dataframe
    df = df.copy()
    # Calculate incremental mortality, the number of people who have lived
    #   with the disease for each period (those who die in year one
    #   had the disease for only a year). Within each draw_uids group this is
    #   the row's absolute survival minus that of the following row; the last
    #   row of a group has no successor and is set to 0, and negative
    #   differences are clipped to 0
    df[increm_mort_col] = df.sort_values(surv_uids).groupby(
        draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
    # Calculate the number of people surviving with the disease at and
    #   beyond the maximum year
    at_max_surv_months = (df['survival_month'] == max_surv)
    mort_total = df[~at_max_surv_months].groupby(
        draw_uids, as_index=False)[increm_mort_col].sum().rename(
        columns={increm_mort_col: 'total_mort'})
    df = df.merge(mort_total)
    # merge() returns a freshly indexed frame, so the mask has to be rebuilt
    #   rather than reused from before the merge
    at_max_surv_months = (df['survival_month'] == max_surv)
    df.loc[at_max_surv_months, increm_mort_col] = \
        1 - df.loc[at_max_surv_months, 'total_mort']
    # test and return
    assert not df.isnull().any().any(), \
        "Error in im_draw {}".format(draw_num)
    return df.loc[:, surv_uids + [increm_mort_col]]
def save_model_results(df, metric_name, acause):
    ''' Saves a separate output file for each me_tag in the dataframe

        Args:
            df: dataframe of estimates. For metric_name "incidence" the
                'me_tag' column of this dataframe is overwritten in place
                with 'primary_phase'
            metric_name: "incidence" or "prevalence"
            acause: cause identifier passed through to the nd helpers

        Raises: ValueError if metric_name is neither "incidence" nor
            "prevalence"

        Tags for which nd.get_modelable_entity_id returns None are skipped
    '''
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns(metric_name)
    draw_cols = nd.get_columns("draw_cols")
    if metric_name == "incidence":
        measure_id = utils.get_gbd_parameter('incidence_measure_id')
        df.loc[:, 'me_tag'] = 'primary_phase'
    elif metric_name == "prevalence":
        measure_id = utils.get_gbd_parameter('prevalence_measure_id')
    else:
        # Without this guard measure_id would be unbound at save time
        raise ValueError(
            "Unsupported metric_name: {}".format(metric_name))
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        if me_id is None:
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        # Copy the subset so that renaming and adding columns cannot touch df
        output_data = df.loc[df['me_tag'].eq(this_tag),
                             uid_cols + data_cols].copy()
        # Outputs are saved under the generic draw column names
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        nd.save_outputs(
            "final_results",
            output_data,
            acause,
            me_id,
            measure_id,
        )
def calc_abs_surv(df, acause):
    ''' Adds the absolute survival column to the dataframe and returns it

        Absolute survival is the relative survival column multiplied by
        exp(lambda_years), bounded to the interval [0, 1]. The column is
        written into df itself, and the result is checked with
        pe.validate_proportions before df is returned.
        Note: acause is accepted but not used here
    '''
    print(" estimating absolute survival")
    target_col = nd.get_columns("absolute_survival")
    source_col = nd.get_columns("relative_survival")
    # Scale relative survival first, then bound it to a valid proportion
    scaled_survival = df[source_col] * np.exp(df['lambda_years'])
    df.loc[:, target_col] = scaled_survival.clip(lower=0, upper=1)
    pe.validate_proportions(df[target_col])
    return df
def load_estimates(metric_name, acause, location_id):
    ''' Reads the saved output file for the given metric, cause and location,
            and returns only the uid columns and the metric's value columns

        For metric_name "survival" the value column is the single absolute
        survival column; for any other metric the value columns are those
        returned by nd.get_columns(metric_name)
    '''
    dataset = nd.nonfatalDataset(metric_name, acause)
    id_cols = dataset.uid_cols
    value_cols = ([nd.get_columns("absolute_survival")]
                  if metric_name == "survival"
                  else nd.get_columns(metric_name))
    estimates = pd.read_csv(dataset.get_output_file(location_id))
    return estimates[id_cols + value_cols]
def save_procedure_inputs(df, acause, location_id):
    ''' Formats and saves procedure data for upload into the epi database

        Args:
            df: dataframe containing the default uid columns,
                'modelable_entity_id' and the draw columns
            acause: cause identifier passed through to nd.save_outputs
            location_id: location used for the rate conversion and stamped
                onto the output

        One output is saved per modelable_entity_id, under the bundle_id that
        the me table associates with that id

        Raises: ValueError (from .item()) if the me table does not hold
            exactly one row for a modelable_entity_id
    '''
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    epi_estimate_cols = ['mean', 'lower', 'upper']
    data = df.loc[:, uid_cols + draw_cols].copy()
    # apply formatting: age groups 33, 44 and 301 are recoded to 235 and the
    #   draws are then collapsed over the uid columns
    data.loc[data['age_group_id'].isin([33, 44, 301]), 'age_group_id'] = 235
    data = dft.collapse(data, by_cols=uid_cols, stub='draw')
    epi_df = epi_upload.format_draws_data(data)
    epi_df = epi_upload.convert_to_rate(epi_df, epi_estimate_cols, location_id)
    # Add metadata
    epi_df['measure'] = 'incidence'
    epi_df['unit_type'] = "Person*year"
    epi_df['extractor'] = getuser()
    epi_df['location_id'] = location_id
    # Finalize and export. The me table does not depend on the me_id, so it
    #   is loaded once rather than on every iteration
    me_table = nd.load_me_table()
    for me_id in epi_df['modelable_entity_id'].unique():
        print("me_id " + str(me_id) + " sequela split")
        bundle_id = int(me_table.loc[me_table['modelable_entity_id'].eq(me_id),
                                     'bundle_id'].item())
        this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
        this_output = epi_upload.EpiUploadDataframe(this_output).data
        # Save output without testing (epi formatter has already tested data
        #   per epi specs)
        # add location_id to enable save_outputs
        this_output['location_id'] = location_id
        nd.save_outputs("dismod_inputs", this_output, acause, bundle_id,
                        skip_testing=True)
def load_incidence(acause, location_id):
    ''' Returns incidence estimation subset required for prevalence estimation

        Reads the incidence output file for the cause and location and keeps
        the default uid columns plus the incidence columns, in that order

        Raises: KeyError if any of those columns is missing from the file
    '''
    uid_cols = nd.nonfatalDataset().uid_cols
    inc_cols = nd.get_columns("incidence")
    input_file = nd.nonfatalDataset(
        "incidence", acause).get_output_file(location_id)
    # A single column selection is sufficient; repeating it only makes an
    #   extra copy of the same data
    return pd.read_csv(input_file)[uid_cols + inc_cols]
def load_survival(acause, location_id):
    ''' Returns survival estimation subset required for prevalence estimation

        Reads the survival output file for the cause and location and keeps
        the survival uid columns plus the absolute survival column

        Raises: KeyError if any of those columns is missing from the file
    '''
    # Build the dataset once and use it for both the uid columns and the
    #   file path
    this_dataset = nd.nonfatalDataset("survival", acause)
    uid_cols = this_dataset.uid_cols
    abs_surv_col = nd.get_columns("absolute_survival")
    input_file = this_dataset.get_output_file(location_id)
    surv_data = pd.read_csv(input_file)
    return surv_data[uid_cols + [abs_surv_col]]
def calc_increm_mort(surv_df, acause, location_id):
    ''' Returns a dataframe of incremental mortality estimates by uid

        Args:
            surv_df: dataframe containing the survival uid columns and the
                absolute survival column
            acause: cause identifier used to look up the survival uid columns
            location_id: accepted for a uniform signature; not used here

        Returns: dataframe with the survival uid columns plus the incremental
            mortality column

        Raises: AssertionError if the calculation produces null values
    '''
    def im_draw(df, draw_num, surv_uids):
        ''' Returns the incremental mortality proportion calculated from one
                absolute survival column. draw_num only labels the error
                message
        '''
        max_surv = nd.nonfatalDataset().max_survival_months
        draw_uids = nd.nonfatalDataset().uid_cols
        abs_surv_col = nd.get_columns("absolute_survival")
        increm_mort_col = nd.get_columns("incremental_mortality")
        # Work on a copy so that adding columns never writes into (or warns
        #   about) a slice of the caller's dataframe
        df = df.copy()
        # Calculate incremental mortality, the number of people who have
        #   lived with the disease for each period (those who die in year one
        #   had the disease for only a year). Within each draw_uids group
        #   this is the row's absolute survival minus that of the following
        #   row; the last row of a group is set to 0 and negative differences
        #   are clipped to 0
        df[increm_mort_col] = df.sort_values(surv_uids).groupby(
            draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
        # Calculate the number of people surviving with the disease at and
        #   beyond the maximum year
        at_max_surv_months = (df['survival_month'] == max_surv)
        mort_total = df[~at_max_surv_months].groupby(
            draw_uids, as_index=False)[increm_mort_col].sum().rename(
            columns={increm_mort_col: 'total_mort'})
        df = df.merge(mort_total)
        # merge() returns a freshly indexed frame, so the mask has to be
        #   rebuilt rather than reused from before the merge
        at_max_surv_months = (df['survival_month'] == max_surv)
        df.loc[at_max_surv_months, increm_mort_col] = \
            1 - df.loc[at_max_surv_months, 'total_mort']
        # test and return
        assert not df.isnull().any().any(), \
            "Error in im_draw {}".format(draw_num)
        return df.loc[:, surv_uids + [increm_mort_col]]

    # Generate incremental mortality draws
    output_uids = nd.nonfatalDataset("survival", acause).uid_cols
    abs_surv_cols = [nd.get_columns("absolute_survival")]
    incr_mort_cols = [nd.get_columns("incremental_mortality")]
    output_df = surv_df.loc[:, output_uids]
    print(" estimating incremental mortality proportion...")
    # Note: this section remains written with a loop to facilitate future
    #   processing of absolute survival draws
    for i, as_col in enumerate(abs_surv_cols):
        this_draw = im_draw(df=surv_df.loc[:, output_uids + [as_col]],
                            draw_num=i,
                            surv_uids=output_uids)
        output_df = output_df.merge(this_draw, on=output_uids)
    return output_df[output_uids + incr_mort_cols]
def calc_mortality(surv_df, acause, location_id):
    ''' Calculate mortality, the number of people who die of the cause during
            the interval (year), where
            mort = incremental_mortality_proportion * incidence

        Args:
            surv_df: survival dataframe passed to calc_increm_mort
            acause: cause identifier
            location_id: location whose incidence estimates are loaded

        Returns: a dataframe of mortality by uid, containing the survival uid
            columns, the mortality columns and the incremental mortality
            column
    '''
    print(" estimating absolute mortality...")
    uid_cols = nd.nonfatalDataset("survival", acause).uid_cols
    inc_cols = nd.get_columns("incidence")
    incr_mort_cols = [nd.get_columns('incremental_mortality')]
    mort_cols = nd.get_columns('mortality')
    incr_mort_df = calc_increm_mort(surv_df, acause, location_id)
    inc_df = load_incidence(acause, location_id)
    mrg_df = incr_mort_df.merge(inc_df)
    # Copy the uid subset so that the new columns are added to an independent
    #   frame rather than to a slice of mrg_df
    df = mrg_df[uid_cols].copy()
    # The single incremental mortality column broadcasts across every
    #   incidence column
    mortality = mrg_df[inc_cols].values * mrg_df[incr_mort_cols].values
    # Carry df's index explicitly so rows line up by construction
    df[mort_cols] = pd.DataFrame(mortality, index=df.index)
    df = df.merge(incr_mort_df)
    return df
def calc_prevalence(sequela_framework, mort_df, acause):
    ''' Returns prevalence by me_tag, calculated from mortality and sequela
            duration

        Args:
            sequela_framework: dataframe that merges with mort_df and
                provides the 'me_tag' and 'sequela_duration' columns
            mort_df: dataframe containing the survival uid columns and the
                mortality columns
            acause: cause identifier used to look up uid columns

        Returns: dataframe collapsed (summed) to the prevalence uid columns,
            with the prevalence columns divided by 12
            (sequela_duration is presumably in months; verify against the
            sequela framework)

        Raises: AssertionError if the result contains any null value
    '''
    print(" calculating prevalence...")
    prev_cols = nd.get_columns('prevalence')
    mort_cols = nd.get_columns('mortality')
    surv_uids = nd.nonfatalDataset("survival", acause).uid_cols
    prev_uids = nd.nonfatalDataset("prevalence", acause).uid_cols
    # Create the prevalence estimation frame from the survival and mortality
    #   frames. The uid subset is copied so that new columns are added to an
    #   independent frame rather than to a slice of mrg_df
    mrg_df = pd.merge(sequela_framework, mort_df)
    df = mrg_df[surv_uids + ['me_tag']].copy()
    # Calculate prevalence of each sequela by multiplying sequela duration
    #   by the number of people surviving for only that duration
    df[prev_cols] = mrg_df[mort_cols].mul(mrg_df['sequela_duration'], axis=0)
    df = dft.collapse(df, combine_cols=prev_cols,
                      by_cols=prev_uids, func='sum')
    df.loc[:, prev_cols] = df[prev_cols] / 12  # convert to years
    assert not df.isnull().any().any(), \
        "Null values found in calc_prevalence output"
    return df
def calc_procedure_tenplus(inc_df, proportions, acause, location_id):
    ''' Multiplies incidence draws by the procedure proportion and the
            absolute survival proportion at 10 years to estimate the number
            of cases surviving for at least 10 years

        Args:
            inc_df: dataframe of incidence estimates
            proportions: dataframe of procedure proportions that merges with
                inc_df and holds the proportions in the draw columns
            acause: cause identifier
            location_id: location whose survival estimates are loaded

        Returns: dataframe with the default uid columns, the draw columns and
            'modelable_entity_id', where year_id has been advanced by 10,
            age_group_id has been mapped through add_decade_to_age, and rows
            beyond the 'max_year' parameter have been dropped. When
            sequelae_fractions(acause) is non-empty the output is the result
            of split_sequelae
    '''
    # Load known values
    print(
        " calculating the incidence of procedures with surv > ten years...")
    uid_cols = nd.nonfatalDataset().uid_cols
    type_cols = nd.get_columns('incidence')
    draw_cols = nd.get_columns("draw_cols")
    abs_surv = [nd.get_columns("absolute_survival")]
    max_estimation_year = utils.get_gbd_parameter('max_year')
    max_survival_months = nd.nonfatalDataset().max_survival_months
    # Estimate incidence of procedure. Subsets are copied before columns are
    #   added so that no assignment targets a slice of another frame
    mrg_df = inc_df.merge(proportions)
    adj_df = mrg_df[uid_cols].copy()
    num_procedures = (mrg_df[type_cols].values * mrg_df[draw_cols].values)
    adj_df[type_cols] = pd.DataFrame(
        num_procedures, index=adj_df.index).fillna(0)
    # Estimate number of procedures resulting in survival beyond ten years,
    #   using absolute survival at the maximum survival month
    surv_df = load_estimates('survival', acause, location_id)
    surv_df = surv_df.loc[surv_df['survival_month'].eq(max_survival_months),
                          uid_cols + abs_surv]
    adj_df = adj_df.merge(surv_df)
    pbt_df = adj_df[uid_cols].copy()
    num_procedures_10ys = adj_df[type_cols].values * \
        adj_df[abs_surv].values
    pbt_df[draw_cols] = pd.DataFrame(
        num_procedures_10ys, index=pbt_df.index).fillna(0)
    # Update years and age categories
    pbt_df.loc[:, 'age_group_id'] = pbt_df['age_group_id'].apply(
        add_decade_to_age)
    pbt_df.loc[:, 'year_id'] += 10
    # drop data that are now out of scope
    pbt_df = pbt_df.loc[pbt_df['year_id'] <= max_estimation_year, :].copy()
    # For procedures whose sequelae are fractional, split the estimates
    #   across sequelae. Otherwise assign the single procedure_sequelae me_id
    if sequelae_fractions(acause):
        pbt_df = split_sequelae(pbt_df, acause, location_id)
    else:
        pbt_df.loc[:, 'modelable_entity_id'] = \
            nd.get_modelable_entity_id(acause, 'procedure_sequelae')
    return pbt_df
def load_rel_surv_values(acause, location_id, cnf_model_run_id):
    ''' Loads the relative survival input for the given acause and location
            from the folder associated with cnf_model_run_id, and returns it
            reshaped to one row per uid and survival year

        The 'scaled_*year' input columns become rows of the relative survival
        column, a survival year of 0 with survival equal to 1 is added, and
        the '10year_restrict' entry is dropped. Causes listed as
        sex-restricted keep only the rows for that sex_id.

        Raises: AssertionError if the formatted data contain null values
    '''
    print(" loading survival...")
    id_cols = nd.nonfatalDataset().uid_cols
    rel_surv_col = nd.get_columns("relative_survival")
    single_sex_causes = {
        'neo_prostate': 1,
        'neo_testicular': 1,
        'neo_cervical': 2,
        'neo_ovarian': 2,
        'neo_uterine': 2
    }
    # Load specific input based on run_id
    surv_folder = load_surv_folder(cnf_model_run_id)
    input_file = "{}/{}/{}.csv".format(surv_folder, acause, location_id)
    # import and update names
    this_surv = pd.read_csv(input_file).rename(
        columns={'year': 'year_id', 'sex': 'sex_id'})
    # Add 'year 0' survival equal to 1 (no time has passed through which to
    #   survive)
    this_surv['scaled_0year'] = 1
    # Subset by sex
    required_sex = single_sex_causes.get(acause)
    if required_sex is not None:
        this_surv = this_surv.loc[this_surv['sex_id'] == required_sex, :]
    # Reshape and rename columns
    this_surv = dft.wide_to_long(this_surv,
                                 stubnames='scaled_',
                                 i=id_cols,
                                 j=['survival_year'],
                                 drop_others=True)
    is_numbered_year = this_surv['survival_year'] != '10year_restrict'
    this_surv = this_surv.loc[is_numbered_year, :]
    year_numbers = this_surv['survival_year'].str.replace(
        'year', '').astype(int)
    this_surv.loc[:, 'survival_year'] = year_numbers
    this_surv = this_surv.rename(columns={'scaled_': rel_surv_col})
    # extend age groups if not present
    this_surv = _fix_survival_ages(this_surv)
    # Test and return
    assert not this_surv.isnull().any().any(), \
        "Null values found in relative survival input after formatting"
    pe.validate_proportions(this_surv[rel_surv_col])
    return this_surv
def split_sequelae(df, acause, location_id):
    ''' Splits estimates into sequela based on proportions from literature

        Args:
            df: dataframe of estimates holding the draw columns. It is not
                modified
            acause: cause identifier used to look up the sequela fractions
            location_id: accepted for a uniform signature; not used here

        Returns: a new dataframe with one row per input row and
            "procedure_sequelae" fraction entry, where the draw columns have
            been multiplied by the entry's 'fraction'. The columns of the
            fraction table (including 'modelable_entity_id', 'me_tag' and
            'fraction') and 'acause' are retained

        Raises: AssertionError if any draw value is null after the split
    '''
    print(" splitting sequelae...")
    draw_cols = nd.get_columns("draw_cols")
    # Generate dataframe containing the procedure_sequelae fractions. The
    #   keys of the fractions dictionary become 'modelable_entity_id'
    fracs = pd.DataFrame.from_dict(
        sequelae_fractions(acause), orient='index'
    ).reset_index().rename(columns={'index': 'modelable_entity_id'})
    fracs = fracs[fracs['me_tag'].eq("procedure_sequelae")].copy()
    fracs['acause'] = acause
    # Merge dataframe with data. assign() adds the merge key to a copy, so
    #   the caller's dataframe does not gain an 'acause' column
    split_df = df.assign(acause=acause).merge(fracs)
    split_df[draw_cols] = split_df[draw_cols].multiply(split_df['fraction'],
                                                       axis='index')
    assert split_df[draw_cols].notnull().all().all(), "Nulls in split sequelae"
    return split_df
def apply_procdedure_proportions(df, proportions, acause, metric_name):
    ''' Multiplies estimates by procedure proportions, and for prevalence
            also adds the controlled-phase remainder left after removing the
            proportion-adjusted estimates

        Args:
            df: dataframe of estimates for metric_name. It is not modified
            proportions: dataframe of procedure proportions held in the draw
                columns. It is not modified
            acause: cause identifier
            metric_name: "prevalence" or "incidence"

        Returns: dataframe with the metric's uid columns and value columns.
            For incidence, only the proportion-adjusted estimates are
            returned. For prevalence, the original estimates are returned
            together with the adjusted rows and an
            "adjusted_controlled_phase" row set equal to the
            controlled_phase values minus the summed adjusted values

        Raises:
            ValueError if metric_name is neither "prevalence" nor "incidence"
            AssertionError if the adjusted estimates contain null values
    '''
    print(" adjusting to avoid double-counting procedures...")
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    draw_cols = nd.get_columns("draw_cols")
    type_cols = nd.get_columns(metric_name)
    mrg_cols = [c for c in uid_cols if c != 'me_tag']
    # Subset estimates to the phase wherein procedures occur
    if metric_name == 'prevalence':
        mrg_df = df.loc[df['me_tag'] == "controlled_phase", :].copy()
        del mrg_df['me_tag']
    elif metric_name == 'incidence':
        mrg_df = df.copy()
    else:
        # Without this guard mrg_df would be unbound further down
        raise ValueError(
            "Unsupported metric_name: {}".format(metric_name))
    # For data where sequela are a fraction of the number of procedures,
    #   multiply the procedure proportion by those fractions
    fractions = (sequelae_fractions(acause)
                 if metric_name == 'prevalence' else None)
    if fractions:
        # Generate dataframe containing the fractions
        fracs = pd.DataFrame.from_dict(fractions, orient='index')
        fracs['acause'] = acause
        fracs = fracs[~fracs['me_tag'].eq("procedure_sequelae")]
        # Merge dataframe with proportions to expand. assign() adds the merge
        #   key to a copy so the caller's proportions stay untouched
        props = proportions.assign(acause=acause).merge(fracs)
        # Adjust proportions by me
        props[draw_cols] = props[draw_cols].multiply(props['fraction'],
                                                     axis='index')
        del props['acause']
    else:
        # No fractions apply, so every proportion row gets a single me_tag
        props = proportions.copy()
        props['me_tag'] = "adjusted_controlled_phase_a"
    # Apply proportions to estimates
    # Note: may drop some data if proportions are only for estimation years
    mrg_df = mrg_df.merge(props, on=mrg_cols, how='inner')
    adj_df = mrg_df[uid_cols].copy()
    adj_df[type_cols] = pd.DataFrame(
        mrg_df[type_cols].values * mrg_df[draw_cols].values,
        index=adj_df.index, columns=type_cols).fillna(0)
    assert not adj_df.isnull().any().any(), \
        "Error calculating procedure proportions"
    # For prevalence, append the adjusted data to the rest of the estimates
    if metric_name == 'prevalence':
        # Both frames are sorted by the merge columns so that their rows
        #   correspond positionally in the subtraction below
        sq_df = dft.collapse(adj_df, mrg_cols,
                             combine_cols=type_cols).sort_values(mrg_cols)
        cntrl_df = df.loc[df['me_tag'].eq("controlled_phase"), :].merge(
            mrg_df[mrg_cols].drop_duplicates(),
            on=mrg_cols, how='inner').sort_values(mrg_cols)
        nosq_df = cntrl_df[mrg_cols].copy()
        # sort_values keeps the pre-sort index labels, so the positional
        #   difference must carry nosq_df's index; otherwise the assignment
        #   re-aligns values to the wrong rows
        nosq_df[type_cols] = pd.DataFrame(
            cntrl_df[type_cols].values - sq_df[type_cols].values,
            index=nosq_df.index, columns=type_cols)
        nosq_df['me_tag'] = "adjusted_controlled_phase"
        adj_df = pd.concat([adj_df, nosq_df])
        output_data = pd.concat([df, adj_df])
    # Incidence of cancers with the procedure is estimated elsewhere, so there
    #   is no need to preserve the unadjusted data
    else:
        output_data = adj_df
    return output_data[uid_cols + type_cols]