def save_procedure_inputs(df, acause, location_id):
    ''' Formats and saves procedure data for upload into the epi database
    '''
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    epi_estimate_cols = ['mean', 'lower', 'upper']
    data = df.loc[:, uid_cols + draw_cols].copy()
    # apply formatting
    data.loc[df['age_group_id'].isin([33, 44, 301]), 'age_group_id'] = 235
    data = dft.collapse(data, by_cols=uid_cols, stub='draw')
    epi_df = epi_upload.format_draws_data(data)
    epi_df = epi_upload.convert_to_rate(epi_df, epi_estimate_cols,
                                        location_id)
    # Add metadata
    epi_df['measure'] = 'incidence'
    epi_df['unit_type'] = "Person*year"
    epi_df['extractor'] = getuser()
    epi_df['location_id'] = location_id
    # Finalize and export
    for me_id in epi_df['modelable_entity_id'].unique():
        print("me_id " + str(me_id) + " sequela split")
        me_table = nd.load_me_table()
        bundle_id = int(me_table.loc[me_table['modelable_entity_id'].eq(me_id),
                                     'bundle_id'].item())
        this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
        this_output = epi_upload.EpiUploadDataframe(this_output).data
        # Save output without testing (epi formatter has already tested data
        # per epi specs)
        # add location_id to enable save_outputs
        this_output['location_id'] = location_id
        nd.save_outputs("dismod_inputs", this_output, acause, bundle_id,
                        skip_testing=True)
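# Illustration (not part of the pipeline): the `epi_estimate_cols` above are
# conventionally filled by summarizing draws as the mean with 2.5th/97.5th
# percentile bounds. A minimal, self-contained sketch of that summary with
# toy data; the real logic lives in epi_upload.format_draws_data and may
# differ in detail.
def _example_summarize_draws():
    import numpy as np
    import pandas as pd
    rng = np.random.default_rng(0)
    draw_cols = ['draw_{}'.format(i) for i in range(1000)]
    # two uids, 1000 draws each (hypothetical values)
    df = pd.DataFrame(rng.poisson(40, size=(2, 1000)), columns=draw_cols)
    df['mean'] = df[draw_cols].mean(axis=1)
    df['lower'] = df[draw_cols].quantile(0.025, axis=1)
    df['upper'] = df[draw_cols].quantile(0.975, axis=1)
    return df[['mean', 'lower', 'upper']]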
def prep_for_disagg(df, uid_cols, metric):
    ''' returns dataset with updated coding systems and re-combines data in
            preparation for cause disaggregation
    '''
    df.loc[df.coding_system != "ICD9_detail", 'coding_system'] = 'ICD10'
    df = dft.collapse(df, by_cols=uid_cols, func='sum', stub=metric)
    return (df)
def calc_total_prevalence(df, uid_cols):
    ''' Calculates a prevalence "total" value to be uploaded for
            troubleshooting
    '''
    # copy to avoid SettingWithCopy issues when re-tagging the subset
    sum_df = df.loc[df['me_tag'].isin([
        'primary_phase', 'controlled_phase', 'metastatic_phase',
        'terminal_phase'
    ])].copy()
    sum_df.loc[:, 'me_tag'] = "computational_total"
    sum_df = dft.collapse(sum_df, by_cols=uid_cols, stub='prev')
    return (df.append(sum_df))
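# Illustration (not part of the pipeline): the "computational_total" row is
# simply the sum of the four phase-specific prevalences per uid. A toy
# sketch with a plain groupby-sum standing in for dft.collapse (assumed
# equivalent for this case).
def _example_computational_total():
    import pandas as pd
    df = pd.DataFrame({
        'location_id': [33] * 4,
        'me_tag': ['primary_phase', 'controlled_phase',
                   'metastatic_phase', 'terminal_phase'],
        'prev_0': [0.010, 0.200, 0.030, 0.005],
    })
    total = df.copy()
    total['me_tag'] = "computational_total"
    total = total.groupby(['location_id', 'me_tag'],
                          as_index=False)['prev_0'].sum()
    # total['prev_0'].item() == 0.245, the sum of the four phase values
    return total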
def update_redistributed_acause(df, ds_instance, split_num):
    ''' Returns dataframe (df) after merging with maps to update cause
            information
        -- Maps:
            decimal cause map : used to revert cause names to decimal form
            cause map : used to validate output causes
    '''
    metric_name = ds_instance.metric
    output_uids = md.get_uid_cols(7)

    def manage_rdp_remnants(df, temp_folder, split_num, metric):
        ''' Verifies whether any garbage remains after redistribution
        '''
        # Get any codes that didn't merge and save them
        rdp_error = ((df['acause'].isnull() | (df['_merge'] == 'left_only'))
                     & df[ds_instance.metric].isin([np.nan, 0]))
        rdp_error_list = sorted(df.loc[rdp_error, 'cause'].unique().tolist())
        if len(rdp_error_list):
            print("The following causes are not in the cause map:")
            print(rdp_error_list)
        return (None)

    # Convert acause back to cancer cause code
    code_format_updates = {
        'C0': 'C00', 'C1': 'C01', 'C2': 'C02', 'C3': 'C03', 'C4': 'C04',
        'C4A': 'C04', 'C5': 'C05', 'C6': 'C06', 'C7': 'C07', 'C8': 'C08',
        'C9': 'C09', 'neo_other': 'neo_other_cancer'
    }
    for key, value in code_format_updates.items():
        df.loc[df['acause'] == key, 'acause'] = value
    # Merge with cause map
    df.rename(columns={'acause': 'cause'}, inplace=True)
    cause_map = cm.load_rdp_cause_map(ds_instance.data_type_id)
    df = df.merge(cause_map, how='left', on=['coding_system', 'cause'],
                  indicator=True)
    # Check that all data were mapped to cause
    manage_rdp_remnants(df, ds_instance.temp_folder, split_num, metric_name)
    # Reformat to merge data with original source
    df = df.loc[:, output_uids + [metric_name]]
    final_df = dft.collapse(df, by_cols=output_uids, stub=metric_name)
    return (final_df)
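# Illustration (not part of the pipeline): the how='left', indicator=True
# merge above is what lets manage_rdp_remnants catch unmapped codes, via the
# generated '_merge' column. A minimal sketch with toy data:
def _example_catch_unmapped_causes():
    import pandas as pd
    data = pd.DataFrame({'coding_system': ['ICD10', 'ICD10'],
                         'cause': ['C00', 'ZZZ'],
                         'deaths': [3.0, 1.0]})
    cause_map = pd.DataFrame({'coding_system': ['ICD10'],
                              'cause': ['C00'],
                              'acause': ['neo_lip_oral']})
    merged = data.merge(cause_map, how='left',
                        on=['coding_system', 'cause'], indicator=True)
    # 'ZZZ' gets _merge == 'left_only' and a null acause, flagging it as
    # missing from the cause map
    return merged.loc[merged['_merge'] == 'left_only', 'cause'].tolist()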
def submit_rdp(input_data, this_dataset, is_resubmission):
    ''' Returns full dataset after redistribution. Separates data by
            submission requirement before submitting rdp for only those data
            that require it
    '''
    def submission_requirement(df, uid):
        return needs_rdp(df[df['uid'] == uid], this_dataset)

    def output_file_function(id):
        return get_rdp_file(this_dataset, which_file='split_output',
                            splitNum=id[2])

    # create a list of the uids that require redistribution and set aside a
    # dataframe of the uids that do not require redistribution
    rdp_code_location = utils.get_path("redistribution", base="code_repo",
                                       process="mi_dataset")
    worker_script = rdp_code_location + "/rdp_worker.py"
    output_uids = md.get_uid_cols(7)
    header = "cncRDP_{}_{}".format(this_dataset.dataset_id,
                                   this_dataset.data_type_id)
    rdp_input_file = get_rdp_file(this_dataset, which_file='rdp_input')
    prepped_df = prep_input(input_data, this_dataset)
    submitted_data, unsubmitted_data = cup.split_submission_data(
        prepped_df, 'uid', submission_requirement, rdp_input_file)
    uid_list = submitted_data['uid'].unique().tolist()
    rdp_job_dict = cup.generate_prep_workers(worker_script,
                                             list_of_uids=uid_list,
                                             ds_instance=this_dataset,
                                             job_header=header,
                                             is_resubmission=is_resubmission,
                                             pace_interval=0.05)
    output_files = cup.get_results(rdp_job_dict, output_file_function,
                                   parent_process_name="rdp",
                                   noisy_checker=is_resubmission,
                                   add_resubmission_argument=is_resubmission,
                                   wait_time=5)
    # Re-combine compiled results with the set-aside data, before collapsing
    # and testing
    final_results = pe.append_files(output_files)
    final_results = final_results.append(unsubmitted_data)
    # Re-set all 'under 5' data, then collapse to combine it with any
    # existing 'under 5' data
    final_results.loc[final_results['age'].lt(7) |
                      (final_results['age'].gt(90) &
                       final_results['age'].lt(95)), 'age'] = 2
    final_results = dft.collapse(final_results, by_cols=output_uids,
                                 combine_cols=this_dataset.metric)
    return (final_results)
def main(dataset_id, data_type_id, split_num):
    ''' Loads a single rdp split, redistributes the data where possible,
            then finalizes and saves the result
    '''
    # Load input
    metric_dict = {'2': 'cases', '3': 'deaths'}
    this_dataset = md.MI_Dataset(dataset_id, 6, data_type_id)
    metric_name = this_dataset.metric
    rdp_input = manager.get_rdp_file(this_dataset, 'rdp_input')
    input_data = pd.read_hdf(rdp_input, 'split_{}'.format(split_num))
    # Redistribute data where possible
    if not manager.needs_rdp(input_data, this_dataset):
        print(" no redistribution needed for ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        save_worker_output(input_data, this_dataset, split_num)
        return (input_data)
    else:
        print(" redistributing ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        # Add maps to enable RDP
        input_data.rename(columns={'uid': 'split_group'}, inplace=True)
        mapped = add_location_hierarchy_info(input_data)
        # RDP cannot run without location metadata, and should not run for
        # hiv. Set aside those data
        skip_rdp_mask = cannot_redistribute(mapped)
        set_aside = mapped.loc[skip_rdp_mask, input_data.columns.tolist()]
        to_redistribute = mapped.loc[~skip_rdp_mask, :]
        # Redistribute remaining data
        if to_redistribute.any().any():
            rdp_results = run_rdp_core(to_redistribute, this_dataset,
                                       split_num)
            # Recombine
            if set_aside.any().any():
                rdp_results = rdp_results.append(set_aside,
                                                 ignore_index=True)
            to_finalize = rdp_results
        else:
            print(" No data to redistribute. Finalizing.")
            to_finalize = input_data.rename(columns={'cause': 'acause'})
        output_cols = md.get_uid_cols(7)
        to_finalize = cm.correct_causes(to_finalize)
        finalized_df = dft.collapse(to_finalize, by_cols=output_cols,
                                    stub=metric_name)
        # Check totals (note: because of data precision, data before and
        # after may not be precisely equivalent)
        diff = finalized_df[metric_name].sum() - input_data[metric_name].sum()
        assert abs(diff / input_data[metric_name].sum()) < 0.01, \
            "Difference from input after rdp is too large"
        save_worker_output(finalized_df, this_dataset, split_num)
        return (finalized_df)
def apply_age_split_proportions(input_df, frmat_type, wgt_df, uid_cols,
                                metric):
    ''' combines weights with population to calculate proportions by which
            combined age groups are to be split, then splits data by those
            proportions
    '''
    # remove dataset_id if present in dataframe
    split_input = input_df.copy()
    if 'dataset_id' in split_input.columns:
        del split_input['dataset_id']
    # merge with the age format map and get an expanded dataframe with the
    # ages to be split
    uids_noAge = [u for u in uid_cols if 'age' != u]
    uid_cols = uids_noAge + ['age']
    marked_df = mark_ages_to_be_split(split_input, frmat_type, uid_cols,
                                      metric)
    to_expand = marked_df.loc[marked_df['to_expand'].eq(1), :].copy()
    if len(to_expand) == 0:
        return (split_input)
    # merge with expected values ("weights")
    to_expand.rename(columns={'age': 'split_age', 'gbd_age': 'age'},
                     inplace=True)
    weighted_df = to_expand.merge(wgt_df)
    astest.test_weights(to_expand, weighted_df)
    # calculate proportions
    to_split = pp.add_proportions(weighted_df, uids_noAge + ['split_age'])
    # adjust by proportions
    to_split.loc[:, 'split_value'] = to_split[metric]
    to_split.loc[:, metric] = to_split['proportion'] * to_split['split_value']
    # collapse, then update format types of split data
    recombined_df = to_split.append(
        marked_df.loc[marked_df['to_expand'] == 0, :])
    adjusted_df = dft.collapse(recombined_df, by_cols=uid_cols, func='sum',
                               combine_cols=metric)
    astest.compare_pre_post_split(split_input, adjusted_df, metric)
    adjusted_df.loc[:, 'need_split'] = 1
    pt.verify_metric_total(split_input, adjusted_df, metric,
                           "apply age proportions")
    return (adjusted_df[split_input.columns.values])
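# Illustration (not part of the pipeline): the proportional split above. One
# aggregate age bin is expanded to its component GBD ages, and each expanded
# row receives weight / sum(weights) of the original value. A toy sketch
# with hypothetical weights standing in for pp.add_proportions:
def _example_proportional_age_split():
    import pandas as pd
    # one observation: 100 cases in an aggregate 80+ bin, split in two
    to_split = pd.DataFrame({'obs': [1, 1],
                             'split_age': ['80+', '80+'],
                             'age': ['80-84', '85+'],
                             'cases': [100.0, 100.0],
                             'wgt': [3.0, 1.0]})
    group_total = to_split.groupby(
        ['obs', 'split_age'])['wgt'].transform('sum')
    to_split['proportion'] = to_split['wgt'] / group_total
    to_split['cases'] = to_split['cases'] * to_split['proportion']
    # yields 75 and 25 cases; the group total is preserved by construction
    return to_split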
def calc_prevalence(sequela_framework, mort_df, acause):
    ''' Calculates prevalence for each sequela from the survival framework
            and mortality estimates
    '''
    print(" calculating prevalence...")
    prev_cols = nd.get_columns('prevalence')
    mort_cols = nd.get_columns('mortality')
    surv_uids = nd.nonfatalDataset("survival", acause).uid_cols
    prev_uids = nd.nonfatalDataset("prevalence", acause).uid_cols
    # Create the prevalence estimation frame from the survival and mortality
    # frames
    mrg_df = pd.merge(sequela_framework, mort_df)
    df = mrg_df[surv_uids + ['me_tag']].copy()
    # Calculate prevalence of each sequela by multiplying sequela duration
    # by the number of people surviving for only that duration
    df[prev_cols] = mrg_df[mort_cols].mul(mrg_df['sequela_duration'], axis=0)
    df = dft.collapse(df, combine_cols=prev_cols, by_cols=prev_uids,
                      func='sum')
    df.loc[:, prev_cols] = df[prev_cols] / 12  # convert to years
    assert not df.isnull().any().any(), \
        "Null values in prevalence calculation for {}".format(acause)
    return (df)
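# Illustration (not part of the pipeline): the prevalence arithmetic above.
# Each sequela contributes (people in that survival stratum) x (sequela
# duration in months) / 12 prevalent person-years. A toy sketch mirroring
# the row-wise .mul(..., axis=0) used in calc_prevalence:
def _example_prevalence_arithmetic():
    import pandas as pd
    # two strata, two draws of mortality-derived counts (hypothetical)
    mort = pd.DataFrame({'mort_0': [200.0, 50.0], 'mort_1': [210.0, 40.0]})
    duration = pd.Series([6.0, 1.0])  # sequela duration in months, per row
    prev = mort.mul(duration, axis=0) / 12  # person-years per draw
    # row 0: 100.0 and 105.0; row 1: ~4.17 and ~3.33
    return prev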
def combine_uid_entries(df, uid_cols, metric_cols,
                        combined_cols=['NID', 'registry_index', 'dataset_id'],
                        collapse_metrics=True):
    ''' Preserves a list of all entries in the combined_cols before
            collapsing by uid_cols to calculate the sum of the metric_cols
        Returns a dataframe collapsed by uid_cols
        -- Inputs
            df : pandas dataframe
            uid_cols : list of uniquely identifying columns for the dataframe
            metric_cols : list of columns containing metric values for each
                uid
            combined_cols : list of columns whose values are to be combined
                into one tuple per uid
            collapse_metrics : set to False to prevent collapse after
                re-setting combined cols entries
    '''
    # subset to the combined_cols that exist before validating for nulls
    combined_cols = [c for c in combined_cols if c in df.columns]
    assert not df[uid_cols + combined_cols].isnull().any().any(), \
        "Cannot combine dataframe with null values in uid or combined columns"
    static_cols = [c for c in df.columns if c not in combined_cols]
    combined_entries = df[static_cols].copy()
    for col in combined_cols:
        new_entries = df[uid_cols + [col]].groupby(
            uid_cols, as_index=False)[col].agg(
                lambda c: tuple_unique_entries(c))
        new_entries.loc[:, col] = new_entries[col].astype(str)
        combined_entries = combined_entries.merge(new_entries, on=uid_cols,
                                                  how='left')
        assert not combined_entries[col].isnull().any(), \
            "Error combining uids for column {}".format(col)
    if collapse_metrics:
        output = dft.collapse(combined_entries,
                              by_cols=uid_cols + combined_cols,
                              combine_cols=metric_cols,
                              func='sum')
    else:
        output = combined_entries
    return (output)
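# Illustration (not part of the pipeline): what the tuple-combination above
# does to a uid that appears in two source rows. tuple_unique_entries is
# assumed here to return the sorted unique values of a group as a tuple.
def _example_combine_uid_entries():
    import pandas as pd
    df = pd.DataFrame({'location_id': [10, 10],
                       'NID': [111, 222],
                       'cases': [4.0, 6.0]})
    combined = df.groupby('location_id', as_index=False).agg(
        {'NID': lambda c: str(tuple(sorted(c.unique()))),
         'cases': 'sum'})
    # one row: NID == '(111, 222)', cases == 10.0
    return combined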
def main(dataset_id, data_type_id):
    ''' Disaggregates uids that are mapped to multiple gbd causes, including
            garbage codes, Kaposi Sarcoma, and non-melanoma skin cancer
    '''
    # prep_step 5 = cause_disaggregation
    this_dataset = md.MI_Dataset(dataset_id, 5, data_type_id)
    input_data = this_dataset.load_input()
    metric = this_dataset.metric
    uid_cols = md.get_uid_cols(5)
    input_data = input_data.loc[
        ~input_data['age'].isin([26, 3, 4, 5, 6, 91, 92, 93, 94]), :]
    # Format and add observation numbers
    formatted_input = prep_for_disagg(input_data.copy(), uid_cols, metric)
    # Disaggregate
    disaggregated_df = core.disaggregate_acause(formatted_input, this_dataset)
    # update uid columns to account for reshaped acause
    uid_cols = [u for u in uid_cols if 'acause' not in u] + ['acause']
    # Redistribute Kaposi Sarcoma and, for incidence data, non-melanoma skin
    # cancer garbage
    kaposi_df = core.redist_kaposi(disaggregated_df, metric, uid_cols)
    if data_type_id == 2:
        adjusted_df = core.redist_nmsc_gc(kaposi_df, metric)
    else:
        adjusted_df = kaposi_df
    final_df = core.map_remaining_garbage(adjusted_df, data_type_id)
    # run test functions and save output
    pt.verify_metric_total(input_data, adjusted_df, metric,
                           "cause disaggregation module")
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(6)
    final_df = md.stdz_col_formats(final_df)
    final_df = dft.collapse(final_df, by_cols=output_uids, func='sum',
                            combine_cols=metric)
    # save
    md.complete_prep_step(final_df, this_dataset)
    print("Acause disaggregated")
def manage_split(df, metric_name, uid_cols, this_dataset):
    ''' Converts age and sex categories in the df to those used by the
            cancer prep process.
            1) Adds obs number
            2) Splits aggregated ages
            3) Combines disaggregated ages
            4) Splits unknown age category
            5) Splits aggregated/unknown sex category
    '''
    is_pop = bool(metric_name == "pop")
    df[metric_name].fillna(value=0, inplace=True)
    split_df = df.copy()
    # add observation number by group, without age
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    obs_numbers = split_df[uids_noAge].drop_duplicates()
    obs_numbers['obs'] = obs_numbers.reset_index().index
    split_df = split_df.merge(obs_numbers)
    uid_cols.append('obs')
    # generate cause_weights
    if is_pop:
        cause_wgts = pp.gen_pop_wgts("age_wgt", df['location_id'].unique())
    else:
        # create weights used for splitting
        cause_wgts = pp.create_metric_weights(split_df, uid_cols,
                                              this_dataset)
        # collapse to get one weight per observation
        cause_wgts = dft.collapse(cause_wgts, by_cols=['obs', 'age', 'sex'],
                                  func='sum', combine_cols='wgt')
    # split non-standard ages
    if pt.has_nonStdAge(split_df):
        print(" splitting non-standard age...")
        split_df = core.split_age(dataset=split_df, wgt_df=cause_wgts,
                                  metric=metric_name, uid_cols=uid_cols)
    # redistribute "unknown age" data according to the current distribution
    # of cases/deaths
    if pt.has_age_unk(split_df, metric_name):
        print(" splitting unknown age...")
        split_df = core.split_unknown_age(dataset=split_df,
                                          wgt_df=cause_wgts,
                                          metric=metric_name,
                                          uid_cols=uid_cols)
    # check for errors
    at.compare_pre_post_split(split_df, df, metric_name)
    # split sex = 3 and sex = 9 data
    if pt.has_combinedSex(split_df):
        print(" splitting sex...")
        if metric_name == "pop":
            sex_split_prop = pp.gen_pop_wgts("sex_wgt",
                                             df['location_id'].unique())
        else:
            sex_split_prop = pp.create_sex_weights(cause_wgts,
                                                   uid_vars=['obs', 'age'],
                                                   metric=metric_name)
        split_df = core.split_sex(split_df, sex_split_prop, uid_cols,
                                  metric=metric_name)
    # collapse remaining under-5 and 80+ ages
    final_df = split_df.copy(deep=True)
    final_df = final_df.loc[~final_df['age'].isin([26]), :]
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(5, is_pop)
    final_df = dft.collapse(final_df, by_cols=output_uids, func='sum',
                            combine_cols=metric_name)
    # save and exit
    md.complete_prep_step(final_df, this_dataset, is_pop)
    return (None)
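# Illustration (not part of the pipeline): the obs-number trick above.
# drop_duplicates() plus reset_index().index assigns one sequential id per
# unique non-age uid, and the merge then broadcasts that id to every age row
# of the group. A toy sketch:
def _example_obs_numbers():
    import pandas as pd
    df = pd.DataFrame({'registry_index': ['A', 'A', 'B'],
                       'sex': [1, 1, 2],
                       'age': [7, 8, 7],
                       'cases': [1.0, 2.0, 3.0]})
    obs_numbers = df[['registry_index', 'sex']].drop_duplicates()
    obs_numbers['obs'] = obs_numbers.reset_index().index
    # ('A', 1) -> obs 0 and ('B', 2) -> obs 1; both 'A' age rows share obs 0
    return df.merge(obs_numbers)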
def apply_procedure_proportions(df, proportions, acause, metric_name):
    ''' Multiplies estimates by procedure proportions, adding to the
            dataframe a set of estimates for the number of cancer events
            that do not receive the given procedure
    '''
    print(" adjusting to avoid double-counting procedures...")
    # Return if adjustment is unnecessary (if there is no rate id for the
    # cause)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    draw_cols = nd.get_columns("draw_cols")
    type_cols = nd.get_columns(metric_name)
    mrg_cols = [c for c in uid_cols if c != 'me_tag']
    # Subset estimates to the phase wherein procedures occur
    if metric_name == 'prevalence':
        mrg_df = df.loc[df['me_tag'] == "controlled_phase", :].copy()
        del mrg_df['me_tag']
    elif metric_name == 'incidence':
        mrg_df = df.copy()
    # For data where sequela are a fraction of the number of procedures,
    # multiply the procedure proportion by those fractions
    if metric_name == 'prevalence' and bool(sequelae_fractions(acause)):
        # Generate dataframe containing the fractions
        fracs = pd.DataFrame().from_dict(sequelae_fractions(acause),
                                         orient='index')
        fracs['acause'] = acause
        fracs = fracs[~fracs['me_tag'].eq("procedure_sequelae")]
        # Merge dataframe with proportions to expand
        proportions['acause'] = acause
        props = proportions.merge(fracs)
        # Adjust proportions by me
        props[draw_cols] = props[draw_cols].multiply(props['fraction'],
                                                     axis='index')
        del props['acause']
    else:
        # Determine fraction of population that does not receive the
        # procedure
        props = proportions.copy()
        props['me_tag'] = "adjusted_controlled_phase_a"
    # Apply proportions to estimates
    # Note: may drop some data if proportions are only for estimation years
    mrg_df = mrg_df.merge(props, on=mrg_cols, how='inner')
    adj_df = mrg_df[uid_cols].copy()
    evnt_wo_proc = pd.DataFrame(
        mrg_df[type_cols].values * mrg_df[draw_cols].values).fillna(0)
    evnt_wo_proc.columns = type_cols
    adj_df[type_cols] = evnt_wo_proc
    assert not adj_df.isnull().any().any(), \
        "Error calculating procedure proportions"
    # For prevalence, append the adjusted data to the rest of the estimates
    if metric_name == 'prevalence':
        sq_df = dft.collapse(adj_df, mrg_cols,
                             combine_cols=type_cols).sort_values(mrg_cols)
        cntrl_df = df.loc[df['me_tag'].eq("controlled_phase"), :].merge(
            mrg_df[mrg_cols].drop_duplicates(), on=mrg_cols,
            how='inner').sort_values(mrg_cols)
        nosq_df = cntrl_df[mrg_cols].copy()
        no_proc = pd.DataFrame(cntrl_df[type_cols].values -
                               sq_df[type_cols].values)
        no_proc.columns = type_cols
        nosq_df[type_cols] = no_proc
        nosq_df['me_tag'] = "adjusted_controlled_phase"
        adj_df = adj_df.append(nosq_df)
        output_data = df.append(adj_df)
    # Incidence of cancers with the procedure is estimated elsewhere, so
    # there is no need to preserve the unadjusted data
    else:
        output_data = adj_df
    return (output_data[uid_cols + type_cols])
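# Illustration (not part of the pipeline): the element-wise adjustment
# above. Estimate draws are multiplied by the matching proportion draws to
# get the share of events remaining after the procedure adjustment (toy
# values; the direction of the proportions is determined upstream):
def _example_apply_proportions():
    import pandas as pd
    estimates = pd.DataFrame({'prev_0': [100.0], 'prev_1': [120.0]})
    # hypothetical proportion draws for one uid
    props = pd.DataFrame({'draw_0': [0.25], 'draw_1': [0.30]})
    adjusted = pd.DataFrame(estimates.values * props.values,
                            columns=estimates.columns)
    # prev_0 == 25.0, prev_1 == 36.0
    return adjusted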