def generate_estimates(acause, location_id, faux_correct):
    ''' Applies procedure adjustments where necessary, then saves separate
        outputs by measure and cancer phase

        Args:
            acause (str): cancer cause being processed
            location_id (int): location being processed
            faux_correct (bool): overridden to False below

        Returns:
            True on success (also touches a "finalized" marker file)
    '''
    print("Begin final adjustments...")
    # NOTE(review): the caller-supplied faux_correct is intentionally
    # discarded here — confirm this override is still wanted
    faux_correct = False
    inc_df = load_estimates('incidence', acause, location_id, faux_correct)
    prev_input = load_estimates('prevalence', acause, location_id,
                                faux_correct)
    prev_df = calc_total_prevalence(
        prev_input,
        uid_cols=nd.nonfatalDataset('prevalence', acause).uid_cols)
    pr_id = procedure_me_id(acause)
    if pr_id is not None:
        # Adjust prevalence for procedures and save the procedure inputs
        prop_df = load_procedure_proportions(pr_id, location_id)
        prev_df = apply_procdedure_proportions(prev_df, prop_df, acause,
                                               'prevalence', faux_correct)
        proc_data = calc_procedure_tenplus(inc_df, prop_df, acause,
                                           location_id, faux_correct)
        save_procedure_inputs(proc_data, acause, location_id)
    # Both branches saved identical outputs; save once here (prev_df is the
    # procedure-adjusted frame when a procedure me_id exists)
    save_model_results(inc_df, 'incidence', acause, faux_correct)
    save_model_results(prev_df, 'prevalence', acause, faux_correct)
    success_file = nd.nonfatalDataset(
        'final_results',
        acause).get_output_file("finalized_" + str(location_id))
    # Touch the success marker; `with` guarantees the handle is closed
    with open(success_file, 'a'):
        pass
    print(str(success_file) + " saved.")
    return (True)
def im_draw(df, draw_num, surv_uids, faux_correct):
    ''' Returns the dataframe with the incremental-mortality estimate for
        the requested draw_num

        Args:
            df (DataFrame): survival data containing the absolute-survival
                column for this draw
            draw_num (int): draw index used to build the column names
            surv_uids (list): uid columns identifying each survival row
            faux_correct (bool): toggles the decomp draw-column prefix

        Returns:
            DataFrame of surv_uids plus the incremental-mortality column
    '''
    # Subset to only the necessary data
    decomp_str = decomp_prefix_cols(faux_correct)
    max_surv = nd.nonfatalDataset().max_survival_months
    draw_uids = nd.nonfatalDataset().uid_cols
    # Note: to run with draws, pass draw number to the two following get_columns calls
    abs_surv_col = '{}surv_abs_{}'.format(
        decomp_str, draw_num)  # nd.get_columns("absolute_survival")
    increm_mort_col = '{}incr_mort_{}'.format(
        decomp_str, draw_num
    )  # nd.get_columns("incremental_mortality") + '_{}'.format(draw_num)
    # Calculate incremental mortality, the number of people who have lived
    # with the disease for each period (those who die in year one
    # had the disease for only a year)
    df[increm_mort_col] = df.sort_values(surv_uids).groupby(
        draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
    # Calculate the number of people surviving with the disease at and
    # beyond the maximum year
    at_max_surv_months = (df['survival_month'] == max_surv)
    mort_total = df[~at_max_surv_months].groupby(
        draw_uids, as_index=False)[increm_mort_col].agg(
            np.sum).rename(columns={increm_mort_col: 'total_mort'})
    df = df.merge(mort_total)
    df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
    # Fixed: message previously referenced undefined name `i`, which would
    # raise NameError instead of the intended AssertionError
    assert not df.isnull().any().any(), \
        "Error in im_draw {}".format(draw_num)
    return (df.loc[:, surv_uids + [increm_mort_col]])
def calc_prevalence(adjusted_sequelae_durations, mort_df, acause,
                    faux_correct):
    ''' Calculates sequela-level prevalence as deaths (by draw) multiplied
        by sequela duration, collapsed to prevalence uids and converted from
        months to years

        Args:
            adjusted_sequelae_durations (DataFrame): output of
                load_sequela_framework
            mort_df (DataFrame): mortality draws from calc_mortality
            acause (str): cancer cause being processed
            faux_correct (bool): toggles the decomp draw-column prefix and
                reduced draw count

        Returns:
            DataFrame of prevalence draws by prevalence uid
    '''
    print("    calculating prevalence...")
    decomp_str = decomp_prefix_cols(faux_correct)
    # faux_correct runs use a reduced draw count
    if len(decomp_str) > 0:
        max_draws = 100
    else:
        max_draws = 1000
    prev_cols = nd.get_columns('prevalence')
    mort_cols = nd.get_columns('mortality')
    surv_uids = nd.nonfatalDataset("survival", acause).uid_cols
    prev_uids = nd.nonfatalDataset("prevalence", acause).uid_cols
    # Create the prevalence estimation frame from the survival and mortality frames
    mrg_df = pd.merge(adjusted_sequelae_durations, mort_df)
    df = mrg_df[surv_uids + ['me_tag']]
    # Calculate prevalence of each sequela by multiplying sequela duration
    # by the number of people surviving for only that duration
    for i in list(range(0, max_draws)):
        df['prev_{}'.format(i)] = mrg_df['deaths_{}'.format(i)].mul(
            mrg_df['sequela_duration'], axis=0)
    df = dft.collapse(df, combine_cols=prev_cols, by_cols=prev_uids,
                      func='sum')
    df.loc[:, prev_cols] = df[prev_cols] / 12  # convert to years
    # Fixed: error text previously said "im_draw" (copy-paste from im_draw)
    assert not df.isnull().any().any(), \
        "Error in calc_prevalence draw {}".format(i)
    return (df)
def load_survival(acause, location_id, faux_correct):
    ''' Loads the survival estimates for this cause-location and returns
        only the uid columns plus the absolute-survival draw columns
        required for prevalence estimation
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    surv_step = nd.nonfatalDataset("survival", acause)
    id_cols = surv_step.uid_cols
    surv_draw_cols = nd.get_columns(
        '{}absolute_survival'.format(decomp_str))
    surv_file = surv_step.get_output_file(location_id)
    loaded = pd.read_csv(surv_file)
    return loaded[id_cols + surv_draw_cols]
def load_incidence(acause, location_id, faux_correct):
    ''' Returns the incidence estimation subset required for prevalence
        estimation

        Args:
            acause (str): cancer cause being processed
            location_id (int): location being processed
            faux_correct (bool): toggles the decomp draw-column prefix

        Returns:
            DataFrame of uid columns plus incidence draw columns
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset().uid_cols
    # Fixed: the template had no placeholder, so decomp_str was silently
    # dropped ("incidence".format(decomp_str) == "incidence"); siblings use
    # "{}incidence".format(decomp_str)
    inc_cols = nd.get_columns("{}incidence".format(decomp_str))
    inc_cols = inc_cols[0:1000]  # cap at 1000 draw columns
    input_file = nd.nonfatalDataset("incidence",
                                    acause).get_output_file(location_id)
    # single subset on read (the second, identical subset was redundant)
    inc_data = pd.read_csv(input_file)[uid_cols + inc_cols]
    return (inc_data)
def save_model_results(df, metric_name, acause, faux_correct):
    ''' Saves a separate output file for each me_tag in the dataframe

        Args:
            df (DataFrame): estimates tagged with 'me_tag'
            metric_name (str): "incidence" or "prevalence"
            acause (str): cancer cause being processed
            faux_correct (bool): toggles the decomp draw-column prefix

        Raises:
            ValueError: if metric_name is not "incidence" or "prevalence"
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns('{}{}'.format(decomp_str, metric_name))
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    d_step = utils.get_gbd_parameter('current_decomp_step')
    if metric_name == "incidence":
        measure_id = utils.get_gbd_parameter('incidence_measure_id')
        df.loc[:, 'me_tag'] = 'primary_phase'
    elif metric_name == "prevalence":
        measure_id = utils.get_gbd_parameter('prevalence_measure_id')
    else:
        # Fixed: previously fell through and hit a NameError on measure_id
        raise ValueError("Unsupported metric_name: {}".format(metric_name))
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        if me_id is None:
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        output_data = df.loc[df['me_tag'].eq(this_tag), uid_cols + data_cols]
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        # Uncertainty is derived downstream from the draws, so leave blank.
        # np.nan replaces np.NaN (alias removed in numpy 2.0)
        output_data['upper'] = np.nan
        output_data['lower'] = np.nan
        output_data['uncertainty_type_value'] = np.nan
        output_data['is_outlier'] = 0
        output_data['step4_location_year'] = '{} updated estimates'.format(
            d_step)
        nd.save_outputs(
            "final_results",
            output_data,
            acause,
            me_id,
            measure_id,
        )
def save_procedure_inputs(df, acause, location_id):
    ''' Formats and saves procedure data for upload into the epi database

        Args:
            df (DataFrame): procedure estimates with a modelable_entity_id
                column
            acause (str): cancer cause being processed
            location_id (int): location being processed
    '''
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    epi_estimate_cols = ['mean', 'lower', 'upper']
    data = df.loc[:, uid_cols + draw_cols].copy()
    # apply formatting: collapse detailed oldest-age groups into the
    # aggregate group 235
    data.loc[df['age_group_id'].isin([33, 44, 301]), 'age_group_id'] = 235
    data = dft.collapse(data, by_cols=uid_cols, stub='draw')
    epi_df = epi_upload.format_draws_data(data)
    epi_df = epi_upload.convert_to_rate(epi_df, epi_estimate_cols,
                                        location_id)
    # Add metadata
    epi_df['measure'] = 'incidence'
    epi_df['unit_type'] = "Person*year"
    epi_df['extractor'] = getuser()
    epi_df['location_id'] = location_id
    # Finalize and export.
    # Hoisted: the me table is loop-invariant (was reloaded once per me_id)
    me_table = nd.load_me_table()
    for me_id in epi_df['modelable_entity_id'].unique():
        print("me_id " + str(me_id) + " sequela split")
        bundle_id = int(me_table.loc[
            me_table['modelable_entity_id'].eq(me_id), 'bundle_id'].item())
        this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
        this_output = epi_upload.EpiUploadDataframe(this_output).data
        # Save output without testing (epi formatter has already tested data
        # per epi specs)
        # add location_id to enable save_outputs
        this_output['location_id'] = location_id
        nd.save_outputs("dismod_inputs", this_output, acause, bundle_id,
                        skip_testing=True)
def load_rel_surv_values(acause, location_id, cnf_model_version_id,
                         faux_correct):
    ''' Loads and returns survival best-case/worst-case estimations for the
        given acause

        Args:
            acause (str): cancer cause being processed
            location_id (int): location being processed
            cnf_model_version_id (int): selects the survival input folder
            faux_correct (bool): toggles the decomp draw-column prefix

        Returns:
            DataFrame of scaled-survival values, restricted to the first
            10 survival years plus a synthetic "year 0" (survival == 1)
    '''
    print(" loading survival...")
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset().uid_cols
    uid_cols = uid_cols + ['surv_year']
    scaled_survival = nd.get_columns('{}scaled_survival'.format(decomp_str))
    # Single-sex causes: 1 == male, 2 == female
    sex_restrictions = {'neo_prostate': 1, 'neo_testicular': 1,
                        'neo_cervical': 2, 'neo_ovarian': 2,
                        'neo_uterine': 2}
    # Load specific input based on version_id
    surv_folder = load_surv_folder(cnf_model_version_id)
    input_file = "{}/{}/{}.csv".format(surv_folder, acause, location_id)
    # import and update names
    this_surv = pd.read_csv(input_file)
    this_surv = this_surv.loc[this_surv['surv_year'] <= 10, ]
    this_surv.rename(columns={'year': 'year_id', 'sex': 'sex_id'},
                     inplace=True)
    # Add 'year 0' survival equal to 1 (no time has passed through which to
    # survive). Copy the slice before mutating to avoid chained-assignment
    tmp = this_surv.loc[this_surv['surv_year'].eq(1), ].copy()
    tmp['surv_year'] = 0
    tmp[scaled_survival] = 1
    # pd.concat replaces DataFrame.append (removed in pandas 2.0)
    this_surv = pd.concat([this_surv, tmp])
    # Subset by sex
    if acause in sex_restrictions.keys():
        this_surv = this_surv.loc[
            this_surv['sex_id'] == sex_restrictions[acause], :]
    # Test and return
    assert not this_surv.isnull().any().any(), \
        "Null values found in relative survival input after formatting"
    validate_proportions(this_surv[scaled_survival])
    return (this_surv)
def calc_mortality(surv_df, acause, location_id, faux_correct):
    ''' Calculate mortality, the number of people who die of the cause
        during the interval (year), where
        mort = incremental_mortality * incidence.

        Args:
            surv_df (DataFrame): absolute survival draws
            acause (str): cancer cause being processed
            location_id (int): location being processed
            faux_correct (bool): toggles the decomp draw-column prefix and
                reduced draw count

        Returns:
            DataFrame of mortality ("deaths_N") draws by uid, merged with
            the incremental-mortality draws
    '''
    print(" estimating absolute mortality...")
    decomp_str = decomp_prefix_cols(faux_correct)
    # faux_correct runs use a reduced draw count
    if len(decomp_str) > 0:
        max_draws = 100
    else:
        max_draws = 1000
    uid_cols = nd.nonfatalDataset("survival", acause).uid_cols
    # (removed unused locals: inc_cols, incr_mort_draw_cols, mort_cols)
    incr_mort_df = calc_increm_mort(surv_df, acause, location_id,
                                    faux_correct)
    inc_df = load_incidence(acause, location_id, faux_correct)
    mrg_df = incr_mort_df.merge(inc_df)
    # copy so the per-draw column assignments below don't warn on a view
    df = mrg_df[uid_cols].copy()
    # deaths_i = incidence_i * incremental_mortality_i
    for i in range(max_draws):
        df['deaths_{}'.format(i)] = (
            mrg_df['inc_{}'.format(i)] *
            mrg_df['{}incr_mort_{}'.format(decomp_str, i)])
    df = df.merge(incr_mort_df)
    return (df)
def calc_procedure_tenplus(inc_df, proportions, acause, location_id, faux_correct): ''' Multiplies incidence draws by the procedure proportion and the absolute survival proportion at 10 years to estimate the number of cases surviving for at least 10 years ''' # Load known values print( " calculating the incidence of procedures with surv > ten years...") decomp_str = decomp_prefix_cols(faux_correct) uid_cols = nd.nonfatalDataset().uid_cols type_cols = nd.get_columns('{}incidence'.format(decomp_str)) draw_cols = nd.get_columns("{}draw_cols".format(decomp_str)) abs_surv_draw_cols = nd.get_columns( '{}absolute_survival'.format(decomp_str)) max_estimation_year = utils.get_gbd_parameter('max_year') max_survival_months = nd.nonfatalDataset().max_survival_months # Estimate incidence of procedure mrg_df = inc_df.merge(proportions) adj_df = mrg_df[uid_cols] num_procedures = (mrg_df[type_cols].values * mrg_df[draw_cols].values) adj_df[type_cols] = pd.DataFrame(num_procedures).fillna(0) # Estimate number of procedures resulting in survival beyond ten years surv_df = load_estimates('survival', acause, location_id, faux_correct) surv_df = surv_df.loc[surv_df['survival_month'].eq(max_survival_months), uid_cols + abs_surv_draw_cols] adj_df = adj_df.merge(surv_df) pbt_df = adj_df[uid_cols] num_procedures_10ys = adj_df[type_cols].values * \ adj_df[abs_surv_draw_cols].values pbt_df[draw_cols] = pd.DataFrame(num_procedures_10ys).fillna(0) # Update years and age categories pbt_df.loc[:, 'age_group_id'] = pbt_df['age_group_id'].apply( add_decade_to_age) pbt_df.loc[:, 'year_id'] += 10 # drop data that are now out of scope pbt_df = pbt_df.loc[pbt_df['year_id'] <= max_estimation_year, :] # For procedures whose sequelae are fractional, if sequelae_fractions(acause): pbt_df = split_sequelae(pbt_df, acause, location_id) else: pbt_df.loc[:, 'modelable_entity_id'] = \ nd.get_modelable_entity_id(acause, 'procedure_sequelae') return (pbt_df)
def save_splits(modelable_entity_id, cnf_model_version_id):
    ''' Launch jobs to upload each of the "split" modelable entities,
        generated by splitting the parent modelable entity

        Args:
            modelable_entity_id (int): parent me_id whose children are
                uploaded
            cnf_model_version_id (int): run id used in job descriptions and
                work-dir paths

        Returns:
            True if every child upload succeeded (and writes the success
            file), otherwise False
    '''

    def output_file_func(id):
        # Per-job success-file locator used by the results poller below
        return (nd.nonfatalDataset("upload",
                                   id).get_output_file('upload'))

    parent_me = modelable_entity_id
    work_dir = get_work_dir(parent_me, cnf_model_version_id)
    this_step = nd.nonfatalDataset("split", parent_me)
    success_file = this_step.get_output_file('upload')
    children_mes, skip_mes, me_tag = get_me_info(modelable_entity_id,
                                                 parent_me)
    measures = get_measures(me_tag)
    save_worker = utils.get_path('save_epi_worker', process='cancer_model')
    # Generate a list of arguments (one for each child me)
    description = "{}_run_{}".format(me_tag, cnf_model_version_id)
    save_args_template = ("--meid {meid} --meas_id {meas} --indir {input_dir}"
                          " --cnf_run_id {cnf_rid} --desc {desc}")
    save_arg_list = []
    for cm in children_mes:
        save_arg_list += [
            save_args_template.format(meid=cm,
                                      meas=" ".join(
                                          [str(m) for m in measures]),
                                      desc=description,
                                      input_dir="{}/{}".format(work_dir, cm),
                                      cnf_rid=cnf_model_version_id)
        ]
    # Start jobs: one cluster job per child me
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header="lvr_save_epi",
                                         memory_request=90,
                                         id_list=children_mes,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()
    # Check for results: poll each job's success file for up to 30 minutes
    job_description = str(modelable_entity_id) + " split upload"
    success_df = cluster_tools.wait_for_results(
        job_dict,
        jobs_description=job_description,
        noisy_checker=False,
        output_file_function=output_file_func,
        max_minutes=30)
    success = cluster_tools.validate_success(success_df, job_description)
    if success:
        # Persist the aggregate record so re-runs can skip this step
        success_df.to_csv(success_file, index=False)
        return (True)
    else:
        print("Error during split")
        return (False)
def create_estimation_frame(acause, location_id, cnf_model_version_id,
                            faux_correct):
    ''' Builds the frame of ages and covariates used to estimate survival
        and incremental mortality: relative-survival values merged with
        lambda values, restricted to the modeled age groups
    '''
    print(" creating estimation frame...")
    max_surv = nd.nonfatalDataset().max_survival_months
    uid_cols = nd.nonfatalDataset().uid_cols
    # GBD estimation age groups: 1-20, 30-33, plus 95+ (235)
    keep_ages = list(range(1, 21)) + list(range(30, 34)) + [235]
    # load and subset the survival curve to the estimation window
    surv_data = load_rel_surv_values(acause, location_id,
                                     cnf_model_version_id, faux_correct)
    surv_data['survival_month'] = surv_data['surv_year'] * 12
    surv_data = surv_data.loc[surv_data['survival_month'] <= max_surv]
    # attach lambda values to form the survival estimation frame
    lambda_input = load_lambda_values(location_id, cnf_model_version_id)
    estim_frame = surv_data.merge(lambda_input[uid_cols + ['lambda']])
    in_scope = estim_frame['age_group_id'].isin(keep_ages)
    estim_frame = estim_frame.loc[in_scope, :]
    estim_frame['lambda_years'] = (
        estim_frame['lambda'] * estim_frame['surv_year'])
    return estim_frame
def calc_increm_mort(surv_df, acause, location_id, faux_correct):
    ''' Returns a dataframe of mortality proportions equal to the mortality
        delta from the previous survival_year, by uid

        Args:
            surv_df (DataFrame): absolute-survival draws by survival uid
            acause (str): cancer cause being processed
            location_id (int): location being processed (unused here but
                kept for signature parity with siblings)
            faux_correct (bool): toggles the decomp draw-column prefix

        Returns:
            DataFrame of survival uids plus incremental-mortality draw
            columns
    '''

    def im_draw(df, draw_num, surv_uids, faux_correct):
        ''' Returns the dataframe with the incremental-mortality estimate
            for the requested draw_num
        '''
        # Subset to only the necessary data
        decomp_str = decomp_prefix_cols(faux_correct)
        max_surv = nd.nonfatalDataset().max_survival_months
        draw_uids = nd.nonfatalDataset().uid_cols
        abs_surv_col = '{}surv_abs_{}'.format(decomp_str, draw_num)
        increm_mort_col = '{}incr_mort_{}'.format(decomp_str, draw_num)
        # Incremental mortality: the number of people who have lived with
        # the disease for each period (those who die in year one had the
        # disease for only a year)
        df[increm_mort_col] = df.sort_values(surv_uids).groupby(
            draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
        # People surviving with the disease at and beyond the maximum year
        at_max_surv_months = (df['survival_month'] == max_surv)
        mort_total = df[~at_max_surv_months].groupby(
            draw_uids, as_index=False)[increm_mort_col].agg(
                np.sum).rename(columns={increm_mort_col: 'total_mort'})
        df = df.merge(mort_total)
        df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
        # Fixed: message previously used the enclosing loop's `i` via
        # closure; use this draw's own draw_num instead
        assert not df.isnull().any().any(), \
            "Error in im_draw {}".format(draw_num)
        return (df.loc[:, surv_uids + [increm_mort_col]])

    # Generate incremental mortality draws
    decomp_str = decomp_prefix_cols(faux_correct)
    output_uids = nd.nonfatalDataset("survival", acause).uid_cols
    abs_surv_draw_cols = nd.get_columns(
        '{}absolute_survival'.format(decomp_str))
    incr_mort_draw_cols = nd.get_columns(
        '{}incremental_mortality'.format(decomp_str))
    output_df = surv_df.loc[:, output_uids]
    print(" estimating incremental mortality proportion...")
    # Note: this section remains written with a loop to facilitate future
    # processing of absolute survival draws
    for i, as_col in enumerate(abs_surv_draw_cols):
        this_draw = im_draw(df=surv_df.loc[:, output_uids + [as_col]],
                            draw_num=i,
                            surv_uids=output_uids,
                            faux_correct=faux_correct)
        output_df = output_df.merge(this_draw, on=output_uids)
    return (output_df[output_uids + incr_mort_draw_cols])
def generate_estimates(acause, location_id, faux_correct=False):
    ''' Runs the prevalence estimation pipeline: survival -> mortality ->
        sequela durations -> prevalence, then saves the result
    '''
    faux_correct = False  # caller value is always overridden here
    # NOTE(review): return value is unused; kept in case get_output_file
    # has path-setup side effects — confirm and remove if not
    output_file = nd.nonfatalDataset(
        "prevalence", acause).get_output_file(location_id)
    print("Begin prevalence estimation...")
    survival = load_survival(acause, location_id, faux_correct)
    mortality = calc_mortality(survival, acause, location_id, faux_correct)
    durations = load_sequela_framework(survival, acause)
    prevalence = calc_prevalence(durations, mortality, acause, faux_correct)
    nd.save_outputs("prevalence", prevalence, acause)
def load_estimates(metric_name, acause, location_id, faux_correct):
    ''' Loads previously-generated estimates per the metric_name and
        returns the uid columns plus that metric's value columns
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    this_step = nd.nonfatalDataset(metric_name, acause)
    uid_cols = this_step.uid_cols
    # survival estimates are stored under the "absolute_survival" columns
    if metric_name == "survival":
        col_key = '{}absolute_survival'.format(decomp_str)
    else:
        col_key = '{}{}'.format(decomp_str, metric_name)
    type_cols = nd.get_columns(col_key)
    input_file = this_step.get_output_file(location_id)
    input_data = pd.read_csv(input_file)
    return input_data[uid_cols + type_cols]
def split_sequelae(df, acause, location_id):
    ''' Splits estimates into sequela based on proportions from literature
    '''
    print(" splitting sequelae...")
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    # Build the procedure_sequelae fraction lookup, keyed by me_id
    frac_map = sequelae_fractions(acause)
    fracs = pd.DataFrame().from_dict(frac_map, orient='index')
    fracs = fracs.reset_index().rename(
        columns={'index': 'modelable_entity_id'})
    fracs = fracs[fracs['me_tag'].eq("procedure_sequelae")]
    fracs['acause'] = acause
    # Attach the fractions to the data and scale each draw by its fraction
    df['acause'] = acause
    split_df = df.merge(fracs)
    scaled = split_df[draw_cols].multiply(split_df['fraction'],
                                          axis='index')
    split_df[draw_cols] = scaled
    assert split_df[draw_cols].notnull().all().all(), \
        "Nulls in split sequelae"
    return split_df
def main(meid, desc, indir, run_id, meas_id):
    ''' Uploads split results for one modelable entity and records a
        success file if the upload completed

        Args:
            meid (int): modelable entity to upload
            desc (str): upload description
            indir (str): directory containing the draws to upload
            run_id (int): cnf model run id
            meas_id: measure id(s) passed through to save_worker

        Returns:
            True on a validated upload, False otherwise
    '''
    this_step = nd.nonfatalDataset("split", meid)
    success_file = this_step.get_output_file('upload')
    # Fixed: the format string had three placeholders for four arguments,
    # so run_id was silently dropped from the message
    print("Working on {} ({}) in {} (run {})".format(meid, desc, indir,
                                                     run_id))
    success_df = save_worker(meid=meid,
                             meas_ids=meas_id,
                             description=desc,
                             input_dir=indir,
                             cnf_run_id=run_id)
    # Validate save and preserve record if successful.
    # Fixed: isinstance is now checked before len() so a non-DataFrame
    # result (e.g. None) no longer raises TypeError
    if isinstance(success_df, pd.DataFrame) and (len(success_df) > 0):
        if 'model_version_id' in success_df.columns:
            model_id = success_df.at[0, 'model_version_id']
            epi_upload.update_upload_record(
                meid,
                run_id,
                model_id,
                cancer_model_type="split_custom_epi",
                success=1)
        success_df.to_csv(success_file, index=False)
        return (True)
    else:
        print("Error during split")
        return (False)
def output_file_func(id):
    ''' Returns the upload output file path for the first entry of the
        given id list
    '''
    upload_step = nd.nonfatalDataset("upload", id[0])
    return upload_step.get_output_file('upload')
def load_sequela_framework(surv_df, acause):
    ''' Adjust sequela duration based on survival

        First adjust incremental sequela duration (including controlled) to
        equal total months from diagnosis (at midyear) to death (at end
        year). This is the total amount of time someone may experience any
        of the sequela (from diagnosis to death, separated out by the amount
        of time they are living with cancer).

        Then iteratively adjust the duration of each sequela so time lived
        with cancer is equal to the sum of all sequela durations:
        1) zero-out metastatic_phase and terminal_phase for events that
           occur at the maximum survival duration.
        2) Adjust durations to fit the number of survival years
            i) The terminal phase is set and not adjusted
            ii) The most flexible phase is controlled: adjust controlled
                time to equal the difference between incremental_duration
                sequela duration and the duration of each of the other
                sequelae
            iii) The next most flexible time is primary diagnosis and
                 treatment
            iv) Finally we can adjust the metastatic time if the totals
                still do not add up

        Args:
            surv_df (DataFrame): survival frame by uid (mutated: an
                'acause' column is added)
            acause (str): cancer cause being processed

        Returns:
            DataFrame of per-phase sequela durations by survival uid
    '''

    def adjust_duration(df, stage, uid_cols):
        ''' Sets the given stage's duration to the remainder of
            incremental_duration after summing all other stages, floored
            at zero
        '''
        sd_col = 'sequela_duration'
        input_cols = df.columns.tolist()
        this_phase = (df['me_tag'] == stage)
        # total duration of every phase other than `stage`, per uid
        df = df.merge(df[~this_phase].groupby(
            uid_cols,
            as_index=False)[sd_col].sum().rename(
                columns={sd_col: 'tot_dur'}))
        df.loc[this_phase, sd_col] = \
            df['incremental_duration'] - df['tot_dur']
        # negative remainders are clipped to zero
        df.loc[this_phase & (df[sd_col] <= 0), sd_col] = 0
        assert not df.duplicated(uid_cols + ['me_tag']).any(), \
            "ERROR: error when calculating sequelae_durations for {} stage".format(
                stage)
        return (df[input_cols])

    # print(" creating sequela framework...")
    nf_ds = nd.nonfatalDataset("survival", acause)
    uid_cols = nf_ds.uid_cols
    max_survival_months = nf_ds.max_survival_months
    seq_dur = load_durations(acause)
    # Add sequela durations.
    # incremental_duration = survival_month + 6: diagnosis at midyear,
    # death at end of year
    surv_df.loc[:, 'acause'] = acause
    df = pd.merge(surv_df[uid_cols + ['acause']], seq_dur, on='acause')
    df.loc[:, 'raw_sequela_duration'] = df['sequela_duration']
    df.loc[:, 'incremental_duration'] = df['survival_month'] + 6
    # Set the 'beyond maximum' survival years to the duration of survival
    # for the final period
    end_of_period = (df['survival_month'].eq(max_survival_months))
    df.loc[end_of_period, 'incremental_duration'] = max_survival_months - 6
    # Set late-phase duration to 0 at the end of the survival period
    # (if someone survives beyond the maximum duration, they are treated
    # as 'survivors', so there are no terminal or metastatic phases)
    late_phase = (df['me_tag'].isin(["terminal_phase", "metastatic_phase"]))
    end_of_period = df['survival_month'].eq(max_survival_months)
    df.loc[late_phase & end_of_period, 'sequela_duration'] = 0
    # Iteratively adjust sequela duration (see docstring for explanation);
    # order matters: controlled is adjusted first, metastatic last
    for stage in ['controlled_phase', "primary_phase", "metastatic_phase"]:
        df = adjust_duration(df, stage, uid_cols)
    assert df['incremental_duration'].notnull().all(), \
        "error calculating sequela durations"
    return (df)
def apply_procdedure_proportions(df, proportions, acause, metric_name,
                                 faux_correct):
    ''' Multiplies estimates by procedure proportions, adding to the
        dataframe a set of estimates for the number of cancer events that
        do not receive the given procedure
        -- Note: As of 2018-07-10, incidence data are adjusted after
           modeling and are not processed through this function, although
           the ability to do so remains

        Args:
            df (DataFrame): estimates tagged with 'me_tag'
            proportions (DataFrame): procedure-proportion draws
            acause (str): cancer cause being processed
            metric_name (str): 'prevalence' or 'incidence'
            faux_correct (bool): toggles the decomp draw-column prefix

        Returns:
            DataFrame of uid columns plus metric value columns; for
            prevalence this includes the original phases plus the adjusted
            controlled-phase rows
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    print(" adjusting to avoid double-counting procedures for {}...".format(
        metric_name))
    # Return if adjustment is unnecessary (if there is no rate id for the cause)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    type_cols = nd.get_columns('{}{}'.format(decomp_str, metric_name))
    mrg_cols = [c for c in uid_cols if c != 'me_tag']
    # Subset estimates to the phase wherein procedures occur
    if metric_name == 'prevalence':
        mrg_df = df.loc[df['me_tag'] == "controlled_phase", :].copy()
        del mrg_df['me_tag']
    elif metric_name == 'incidence':
        mrg_df = df.copy()
    # For data where sequela are a fraction of the number of procedures,
    # multiply the procedure proportion by those fractions
    if metric_name == 'prevalence' and bool(sequelae_fractions(acause)):
        # Generate dataframe to containing the fractions
        fracs = pd.DataFrame().from_dict(sequelae_fractions(acause),
                                         orient='index')
        fracs['acause'] = acause
        fracs = fracs[~fracs['me_tag'].eq("procedure_sequelae")]
        # Merge dataframe with proportions to expand
        proportions['acause'] = acause
        props = proportions.merge(fracs)
        # Adjust proportions by me
        props[draw_cols] = props[draw_cols].multiply(props['fraction'],
                                                     axis='index')
        del props['acause']
    else:
        # Determine fraction of population that does not recieve the procedure
        props = proportions.copy()
        props['me_tag'] = "adjusted_controlled_phase_a"
    # Apply proportions to estimates
    # Note: may drop some data if proportions are only for estimation years
    mrg_df = mrg_df.merge(props, on=mrg_cols, how='inner')
    adj_df = mrg_df[uid_cols]
    # NOTE(review): positional .values multiplication assumes type_cols and
    # draw_cols rows stay aligned after the merge — confirm this invariant
    evnt_wo_proc = pd.DataFrame(mrg_df[type_cols].values *
                                mrg_df[draw_cols].values).fillna(0)
    evnt_wo_proc.columns = type_cols
    adj_df[type_cols] = evnt_wo_proc
    assert not adj_df.isnull().any().any(
    ), "Error calculating procedure proportions"
    # For prevalence, append the adjusted data to the rest of the estimates
    if metric_name == 'prevalence':
        # sq_df: sequela-adjusted totals; cntrl_df: unadjusted controlled
        # phase restricted to the same uids; their positional difference is
        # the "no procedure" remainder
        sq_df = dft.collapse(adj_df, mrg_cols,
                             combine_cols=type_cols).sort_values(mrg_cols)
        cntrl_df = df.loc[df['me_tag'].eq("controlled_phase"), :].merge(
            mrg_df[mrg_cols].drop_duplicates(), on=mrg_cols,
            how='inner').sort_values(mrg_cols)
        nosq_df = cntrl_df[mrg_cols]
        no_proc = pd.DataFrame(cntrl_df[type_cols].values -
                               sq_df[type_cols].values)
        no_proc.columns = type_cols
        nosq_df[type_cols] = no_proc
        nosq_df['me_tag'] = "adjusted_controlled_phase"
        adj_df = adj_df.append(nosq_df)
        output_data = df.append(adj_df)
    # Incidence of cancers with the procedure is estimated elsewhere, so there
    # is no need to preserve the unadjusted data
    else:
        output_data = adj_df
    return (output_data[uid_cols + type_cols])