def split_neonatal(df, metric_name, uid_cols):
    '''Splits 0-4 into neonatal age groups.

    Args:
        df : input dataframe containing an aggregate young-age group
        metric_name : metric column to split ("pop", "cases", or "deaths")
        uid_cols : columns that uniquely identify an observation

    Returns the dataframe with neonatal ages split, re-collapsed, and saved
    via check_and_save.
    '''
    is_pop = bool(metric_name == "pop")
    split_df = df.copy()
    # add observation number by group, without age: all age rows belonging to
    # the same uid group must share a single 'obs' id so the weights can
    # redistribute the metric across ages within that group (grouping by the
    # full uid_cols, which include age, would give every row its own obs and
    # leave nothing to split — see the parallel logic in manage_split)
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    split_df = add_missing_ages(split_df, metric_name)
    obs_numbers = split_df[uids_noAge].drop_duplicates()
    obs_numbers['obs'] = obs_numbers.reset_index().index
    split_df = split_df.merge(obs_numbers)
    # re-assign rather than append to avoid mutating the caller's list
    uid_cols = uid_cols + ['obs']
    # generate cause_weight
    if is_pop:
        cause_wgts = pp.gen_pop_wgts("age_wgt", df['location_id'].unique())
        cause_wgts.rename(columns={"year": "year_id"}, inplace=True)
    else:
        # create weights used for splitting
        cause_wgts = create_metric_weights(split_df, uid_cols, metric_name)
        # collapse to get one weight per observation, dropping redundant
        # entries beforehand
        cause_wgts.drop_duplicates(subset=['obs', 'age', 'sex_id'],
                                   inplace=True)
        cause_wgts = dft.collapse(cause_wgts,
                                  by_cols=['obs', 'age', 'sex_id'],
                                  func='sum',
                                  combine_cols='wgt')
    if has_neonatal_age(split_df):
        print(" splitting neonatal age...")
        # create weights
        split_df = split_neo_age(dataset=split_df,
                                 wgt_df=cause_wgts,
                                 metric=metric_name,
                                 uid_cols=get_uid_columns())
    # drop the aggregate age group (age == 2) now that it has been split
    final_df = split_df.copy(deep=True)
    final_df = final_df.loc[~final_df['age'].isin([2]), :]
    # collapse to incorporate newly-split data
    final_df = dft.collapse(final_df,
                            by_cols=get_uid_columns(),
                            func='sum',
                            combine_cols=metric_name)
    final_df = check_and_save(final_df, metric_name)
    return (final_df)
def combine_uid_entries(df, uid_cols, metric_cols,
                        combined_cols=['NID', 'registry_index', 'dataset_id'],
                        collapse_metrics=True):
    ''' Preserves a list of all entries in the combined_cols before collapsing
        by uid_cols to calculate the sum of the metric_cols.
        Returns a dataframe collapsed by uid_cols
        -- Inputs
            collapse_metrics : set to False to prevent collapse after
                re-setting combined cols entries
    '''
    # Restrict to combined columns that actually exist BEFORE validating;
    # previously the assert indexed df with the full default list and raised
    # a KeyError whenever a default column (e.g. 'NID') was absent
    combined_cols = [c for c in combined_cols if c in df.columns]
    assert not df[uid_cols + combined_cols].isnull().any().any(), \
        "Cannot combine dataframe with null values in uid or combined columns"
    static_cols = [c for c in df.columns if c not in combined_cols]
    combined_entries = df[static_cols].copy()
    for col in combined_cols:
        # collect the unique entries of `col` per uid group as a tuple, then
        # stringify so the result can be stored in an hdf file
        new_entries = df[uid_cols + [col]].groupby(
            uid_cols, as_index=False)[col].agg(
                lambda c: tuple_unique_entries(c))
        new_entries.loc[:, col] = new_entries[col].astype(str)
        combined_entries = combined_entries.merge(
            new_entries, on=uid_cols, how='left')
        assert not combined_entries[col].isnull().any(), \
            "Error combining uids for column {}".format(col)
    if collapse_metrics:
        output = dft.collapse(combined_entries,
                              by_cols=uid_cols + combined_cols,
                              combine_cols=metric_cols,
                              func='sum')
    else:
        output = combined_entries
    return(output)
def calc_prevalence(adjusted_sequelae_durations, mort_df, acause, faux_correct):
    ''' Calculates sequela prevalence by multiplying sequela durations by the
        number of deaths draws, then collapsing to the prevalence uids.

    Args:
        adjusted_sequelae_durations : survival-frame with sequela durations
        mort_df : mortality (deaths draw) frame
        acause : cause name
        faux_correct : passed to decomp_prefix_cols; a non-empty decomp
            prefix indicates a 100-draw (decomp) run rather than 1000 draws
    '''
    print(" calculating prevalence...")
    decomp_str = decomp_prefix_cols(faux_correct)
    # decomp runs carry only 100 draws; full runs carry 1000
    max_draws = 100 if len(decomp_str) > 0 else 1000
    prev_cols = nd.get_columns('prevalence')
    surv_uids = nd.nonfatalDataset("survival", acause).uid_cols
    prev_uids = nd.nonfatalDataset("prevalence", acause).uid_cols
    # Create the prevalence estimation frame from the survival and mortality
    # frames; copy so the draw assignments below don't hit a view of mrg_df
    mrg_df = pd.merge(adjusted_sequelae_durations, mort_df)
    df = mrg_df[surv_uids + ['me_tag']].copy()
    # Calculate prevalence of each sequela by multiplying sequela duration
    # by the number of people surviving for only that duration
    for i in range(max_draws):
        df['prev_{}'.format(i)] = mrg_df['deaths_{}'.format(i)].mul(
            mrg_df['sequela_duration'], axis=0)
    df = dft.collapse(df, combine_cols=prev_cols, by_cols=prev_uids,
                      func='sum')
    df.loc[:, prev_cols] = df[prev_cols] / 12  # convert to years
    # previous message formatted the loop index, which is meaningless after
    # the collapse; report the cause instead
    assert not df.isnull().any().any(), \
        "Null values found while calculating prevalence for {}".format(acause)
    return (df)
def save_procedure_inputs(df, acause, location_id):
    ''' Formats and saves procedure data for upload into the epi database.

    Args:
        df : dataframe of procedure draws keyed by the nonfatal uid columns
            plus modelable_entity_id
        acause : cause name used when saving outputs
        location_id : location the data belong to; used for rate conversion
            and attached to the output
    '''
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    epi_estimate_cols = ['mean', 'lower', 'upper']
    data = df.loc[:, uid_cols + draw_cols].copy()
    # apply formatting: recode detailed terminal age groups to the single
    # aggregate id 235 (mask built from `data` itself — previously it was
    # built from `df`, which only worked because the index was unchanged)
    data.loc[data['age_group_id'].isin([33, 44, 301]), 'age_group_id'] = 235
    data = dft.collapse(data, by_cols=uid_cols, stub='draw')
    epi_df = epi_upload.format_draws_data(data)
    epi_df = epi_upload.convert_to_rate(epi_df, epi_estimate_cols, location_id)
    # Add metadata
    epi_df['measure'] = 'incidence'
    epi_df['unit_type'] = "Person*year"
    epi_df['extractor'] = getuser()
    epi_df['location_id'] = location_id
    # Finalize and export; load the me table once rather than on every
    # iteration of the loop
    me_table = nd.load_me_table()
    for me_id in epi_df['modelable_entity_id'].unique():
        print("me_id " + str(me_id) + " sequela split")
        bundle_id = int(me_table.loc[
            me_table['modelable_entity_id'].eq(me_id), 'bundle_id'].item())
        this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
        this_output = epi_upload.EpiUploadDataframe(this_output).data
        # Save output without testing (epi formatter has already tested data
        # per epi specs)
        # add location_id to enable save_outputs
        this_output['location_id'] = location_id
        nd.save_outputs("dismod_inputs", this_output, acause, bundle_id,
                        skip_testing=True)
def main(dataset_id, data_type_id, split_num):
    ''' Runs redistribution (rdp) for one split of a dataset, then finalizes,
        validates, and saves the result.

    Args:
        dataset_id : id of the MI dataset to process
        data_type_id : data type of the dataset (e.g. incidence/mortality)
        split_num : which rdp split of the dataset to process

    Returns the finalized dataframe (or the untouched input when no
    redistribution is needed).
    '''
    # Load input
    this_dataset = md.MI_Dataset(dataset_id, 6, data_type_id)
    metric_name = this_dataset.metric
    rdp_input = manager.get_rdp_file(this_dataset, 'rdp_input')
    input_data = pd.read_hdf(rdp_input, 'split_{}'.format(split_num))
    # rename sex_id until rdp packages names are updated
    input_data.rename(columns={'sex_id': 'sex'}, inplace=True)
    # Redistribute data where possible
    if not manager.needs_rdp(input_data, this_dataset):
        print(" no redistribution needed for ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        save_worker_output(input_data, this_dataset, split_num)
        return (input_data)
    else:
        print(" redistributing ds {} type {} split {}".format(
            dataset_id, data_type_id, split_num))
        # Add maps to enable RDP
        input_data.rename(columns={'uid': 'split_group'}, inplace=True)
        mapped = add_location_hierarchy_info(input_data)
        # RDP cannot run without location metadata, and should not run for
        # hiv. Set aside those data
        skip_rdp_mask = cannot_redistribute(mapped)
        set_aside = mapped.loc[skip_rdp_mask, input_data.columns.tolist()]
        to_redistribute = mapped.loc[~skip_rdp_mask, :]
        # Redistribute remaining data. Test for the presence of rows rather
        # than truthy values: `.any().any()` is False for a frame whose
        # values are all zero/empty even when rows exist
        if not to_redistribute.empty:
            rdp_results = run_rdp_core(to_redistribute, this_dataset,
                                       split_num)
            # Recombine with the set-aside data
            if not set_aside.empty:
                rdp_results = rdp_results.append(set_aside, ignore_index=True)
            to_finalize = rdp_results
        else:
            print(" No data to redistribute. Finalizing.")
            to_finalize = input_data.rename(columns={'cause': 'acause'})
    output_cols = md.get_uid_cols(7)
    # rename sex_id until rdp packages get updated
    output_cols = ['sex' if x == 'sex_id' else x for x in output_cols]
    to_finalize = cm.correct_causes(to_finalize)
    finalized_df = dft.collapse(to_finalize, by_cols=output_cols,
                                stub=metric_name)
    # Check totals (note: because of data precision, data before and after
    # may not be precisely equivalent)
    # NOTE(review): a relative tolerance of 5 allows a 500% difference —
    # confirm this threshold is intentional
    diff = finalized_df[metric_name].sum() - input_data[metric_name].sum()
    assert abs(diff / input_data[metric_name].sum()) < 5, \
        "Difference from input after rdp is too large"
    save_worker_output(finalized_df, this_dataset, split_num)
    return (finalized_df)
def calc_total_prevalence(df, uid_cols):
    ''' Calculates a prevalence "total" value to be uploaded for
        troubleshooting; returns the input with the total rows appended.
    '''
    # copy the phase subset before modifying: assigning into a bare .loc
    # slice triggers SettingWithCopyWarning and may either silently fail or
    # mutate the caller's dataframe
    sum_df = df.loc[df['me_tag'].isin([
        'primary_phase', 'controlled_phase', 'metastatic_phase',
        'terminal_phase'
    ])].copy()
    sum_df.loc[:, 'me_tag'] = "computational_total"
    sum_df = dft.collapse(sum_df, by_cols=uid_cols, stub='prev')
    return (df.append(sum_df))
def update_redistributed_acause(df, ds_instance, split_num):
    ''' Returns dataframe (df) after merging with maps to update cause
        information
        -- Maps:
            decimal cause map : used to revert cause names to decimal form
            cause map : used to validate output causes
    '''
    metric_name = ds_instance.metric
    output_uids = md.get_uid_cols(7)
    output_uids = ['sex' if x == 'sex_id' else x for x in output_uids]

    # Restored as a nested helper: the `def` line had been commented out,
    # which made this body execute inline at the top of the function —
    # referencing the '_merge' column before any merge existed and aborting
    # the whole function via `return (None)` whenever unmapped codes existed.
    def manage_rdp_remnants(df, temp_folder, split_num, metric):
        ''' Verifies if any garbage remains after redistribution and reports
            unmapped codes.
            NOTE(review): reads ds_instance.metric from the closure instead
            of the `metric` parameter — confirm which is intended.
        '''
        # Get any codes that didn't merge and save them
        rdp_error = ((df['acause'].isnull() | (df['_merge'] == 'left_only'))
                     & df[ds_instance.metric].isin([np.nan, 0]))
        rdp_error_list = sorted(df.loc[rdp_error, 'cause'].unique().tolist())
        if len(rdp_error_list):
            print("The following causes are not in the cause map:")
            print(rdp_error_list)
        return (None)

    # Convert acause back to cancer cause code
    code_format_updates = {
        # not necessary once rdp uses the map in the cancer db
        'C0': 'C00', 'C1': 'C01', 'C2': 'C02', 'C3': 'C03', 'C4': 'C04',
        'C4A': 'C04', 'C5': 'C05', 'C6': 'C06', 'C7': 'C07', 'C8': 'C08',
        'C9': 'C09',
        'neo_other': 'neo_other_cancer'
    }
    for key, value in code_format_updates.items():
        df.loc[df['acause'] == key, 'acause'] = value
    # Merge with cause map
    df.rename(columns={'acause': 'cause'}, inplace=True)
    cause_map = cm.load_rdp_cause_map(ds_instance.data_type_id)
    df = df.merge(cause_map, how='left', on=['coding_system', 'cause'],
                  indicator=True)
    # Check that all data were mapped to cause
    manage_rdp_remnants(df, ds_instance.temp_folder, split_num, metric_name)
    # Reformat to merge data with original source
    df = df.loc[:, output_uids + [metric_name]]
    final_df = dft.collapse(df, by_cols=output_uids, stub=metric_name)
    return (final_df)
def reformat_input(df, ds_instance):
    ''' Collapses and reshapes standardize_format output into long format
        keyed by the prep-step uid columns, then attaches a 'uniqid' group
        id column.
    '''
    metric = ds_instance.metric
    uids = md.get_uid_cols(2)
    wide_uids = [c for c in uids if 'age' not in c]
    group_uids = [c for c in uids if 'cause' not in c]
    # backfill im_frmat_id with 9 wherever it is missing and frmat_id is 9
    missing_im_frmat = df['im_frmat_id'].isnull() & df['frmat_id'].isin([9])
    df.loc[missing_im_frmat, 'im_frmat_id'] = 9
    df = md.stdz_col_formats(df)
    # collapse while wide, then reshape the age columns into a long 'age' key
    df = dft.collapse(df, by_cols=wide_uids, func='sum', stub=metric)
    df = dft.wide_to_long(df, stubnames=metric, i=wide_uids, j='age')
    df = df.groupby(uids, as_index=False)[metric].sum()
    df[metric].fillna(value=0, inplace=True)
    df = md.stdz_col_formats(df)
    return(dft.make_group_id_col(df, group_uids, id_col='uniqid'))
def apply_age_spilt_proportions(input_df, frmat_type, wgt_df, uid_cols, metric):
    ''' Splits aggregate age groups into their component gbd ages using the
        supplied weights, preserving the metric total.
    '''
    # remove dataset_id if present in dataframe
    working_df = input_df.copy()
    if 'dataset_id' in working_df.columns:
        del working_df['dataset_id']
    # merge with the age format map and get an expanded dataframe with the
    # ages to be split
    non_age_uids = [c for c in uid_cols if 'age' != c]
    uid_cols = non_age_uids + ['age']
    marked_df = mark_ages_to_be_split(working_df, frmat_type, uid_cols, metric)
    expand_df = marked_df.loc[marked_df['to_expand'].eq(1), :].copy()
    if len(expand_df) == 0:
        return(working_df)
    # merge with expected values ("weights")
    expand_df.rename(columns={'age': 'split_age', 'gbd_age': 'age'},
                     inplace=True)
    expand_df = md.stdz_col_formats(expand_df)
    wgt_df = md.stdz_col_formats(wgt_df)
    weighted_df = pd.merge(expand_df, wgt_df, how='left', indicator=True)
    astest.test_weights(expand_df, weighted_df)
    # calculate proportions
    prop_df = pp.add_proportions(weighted_df, non_age_uids + ['split_age'])
    # adjust by proportions
    prop_df.loc[:, 'split_value'] = prop_df[metric]
    prop_df.loc[:, metric] = prop_df['proportion'] * prop_df['split_value']
    # collapse, then update format types of split data
    recombined = prop_df.append(
        marked_df.loc[marked_df['to_expand'].eq(0), :].copy())
    adjusted_df = dft.collapse(recombined, by_cols=uid_cols, func='sum',
                               combine_cols=metric)
    astest.compare_pre_post_split(working_df, adjusted_df, metric)
    adjusted_df.loc[:, 'need_split'] = 1
    pt.verify_metric_total(working_df, adjusted_df, metric,
                           "apply age proportions")
    return(adjusted_df[working_df.columns.values])
def main(dataset_id, data_type_id):
    ''' Runs cause disaggregation (prep step 5) for a dataset, then collapses
        and saves the result.
    '''
    # prep_step 5 = cause_disaggregation
    this_dataset = md.MI_Dataset(dataset_id, 5, data_type_id)
    input_data = this_dataset.load_input()
    metric = this_dataset.metric
    uid_cols = md.get_uid_cols(5)
    # drop aggregate/unknown age groups before disaggregating
    input_data = input_data.loc[~input_data['age'].isin(
        [26, 3, 4, 5, 6, 91, 92, 93, 94]), :]
    # Format and add observation numbers
    formatted_input = prep_for_disagg(input_data.copy(), uid_cols, metric)
    # Disaggregate
    disaggregated_df = core.disaggregate_acause(formatted_input, this_dataset)
    # update uid columns to account for reshaped acause
    uid_cols = [u for u in uid_cols if 'acause' not in u] + ['acause']
    # restored: this assignment had been commented out even though kaposi_df
    # is consumed immediately below (NameError as previously written)
    kaposi_df = core.redist_kaposi(disaggregated_df, metric, uid_cols)
    if data_type_id == 2:
        adjusted_df = core.redist_nmsc_gc(kaposi_df, metric)
    else:
        adjusted_df = kaposi_df
    final_df = core.map_remaining_garbage(adjusted_df, data_type_id)
    # run test functions and save output
    # NOTE(review): totals are verified against adjusted_df (pre garbage
    # mapping) rather than final_df — confirm this is intentional
    pt.verify_metric_total(input_data, adjusted_df, metric,
                           "cause disaggregation module")
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(6)
    final_df = md.stdz_col_formats(final_df)
    final_df = dft.collapse(final_df, by_cols=output_uids, func='sum',
                            combine_cols=metric)
    # save
    md.complete_prep_step(final_df, this_dataset)
    print("Acause disaggregated")
def manage_split(df, metric_name, uid_cols, this_dataset):
    ''' Converts age and sex categories in the df to those used by the cancer
        prep process.
        1) Adds obs number
        2) Splits aggregated ages
        3) Combines disaggregated ages
        4) Splits unknown age category
        5) Splits aggregated/unknown sex category
    '''
    is_pop = bool(metric_name == "pop")
    df[metric_name].fillna(value=0, inplace=True)
    split_df = df.copy()
    # add observation number by group, without age
    uids_noAge = [c for c in uid_cols if 'age' not in c]
    split_df = core.add_missing_ages(split_df, uids_noAge, metric_name)
    obs_numbers = split_df[uids_noAge].drop_duplicates()
    obs_numbers['obs'] = obs_numbers.reset_index().index
    split_df = split_df.merge(obs_numbers)
    # re-assign rather than append to avoid mutating the caller's list
    uid_cols = uid_cols + ['obs']
    # generate cause_weight
    if is_pop:
        cause_wgts = pp.gen_pop_wgts("age_wgt", df['location_id'].unique())
    else:
        # create weights used for splitting
        cause_wgts = pp.create_metric_weights(split_df, uid_cols, this_dataset)
        # collapse to get one weight per observation, dropping redundant
        # entries beforehand
        cause_wgts.drop_duplicates(subset=['obs', 'age', 'sex_id'],
                                   inplace=True)
        cause_wgts = dft.collapse(cause_wgts,
                                  by_cols=['obs', 'age', 'sex_id'],
                                  func='sum', combine_cols='wgt')
    # split
    if pt.has_nonStdAge(split_df):
        print(" splitting non-standard age...")
        split_df = core.split_age(dataset=split_df,
                                  wgt_df=cause_wgts,
                                  metric=metric_name,
                                  uid_cols=uid_cols)
    # redistribute "unknown age" data according to the current distribution
    # of cases/deaths
    if pt.has_age_unk(split_df, metric_name):
        print(" splitting unknown age...")
        # create weights
        split_df = core.split_unknown_age(dataset=split_df,
                                          wgt_df=cause_wgts,
                                          metric=metric_name,
                                          uid_cols=uid_cols)
    # check for errors. If more than 3 metric totals are greater than .0001%
    # different, alert user. (This call had been swallowed into the comment
    # line above it and was never executed.)
    at.compare_pre_post_split(split_df, df, metric_name)
    # split sex = 3 and sex = 9 data
    if pt.has_combinedSex(split_df):
        print(" splitting sex...")
        if metric_name == "pop":
            sex_split_prop = pp.gen_pop_wgts(
                "sex_wgt", df['location_id'].unique())
        else:
            sex_split_prop = pp.create_sex_weights(cause_wgts,
                                                   uid_vars=['obs', 'age'],
                                                   metric=metric_name)
        split_df = core.split_sex(split_df, sex_split_prop, uid_cols,
                                  metric=metric_name)
    # drop the aggregate age group (age == 26) now that it has been split
    final_df = split_df.copy(deep=True)
    final_df = final_df.loc[~final_df['age'].isin([26]), :]
    # collapse to incorporate newly-split data
    output_uids = md.get_uid_cols(5, is_pop)
    final_df = dft.collapse(final_df, by_cols=output_uids, func='sum',
                            combine_cols=metric_name)
    # save and exit
    md.complete_prep_step(final_df, this_dataset, is_pop)
    return(None)
def prep_for_disagg(df, uid_cols, metric):
    ''' Standardizes the coding system and collapses to one row per uid
        ahead of cause disaggregation.
    '''
    # treat every coding system other than detailed ICD9 as ICD10
    non_icd9 = df.coding_system != "ICD9_detail"
    df.loc[non_icd9, 'coding_system'] = 'ICD10'
    return(dft.collapse(df, by_cols=uid_cols, func='sum', stub=metric))
def submit_sr(calc_df, this_dataset):
    ''' Splits data based on subtotal-recalculation requirement and submits
        jobs as needed to recalculate subtotals. Then returns a re-combined
        dataset with subtotals recalculated
    '''
    def submission_req(df, uid):
        ''' Returns boolean indicating whether data are to be submitted,
            qualified by whether subtotals are present and whether any
            component codes exist that could enable recalculation
        '''
        uid_test = df[df['uniqid'].eq(uid)]
        meets_requirement = bool(
            has_subtotals(uid_test, 'orig_cause')
            and components_present(uid_test)
        )
        return(meets_requirement)

    def output_file_func(id):
        ''' Function fed to get_results relative to the '''
        # id is a sequence; only its first element identifies the split file
        return(get_sr_file(this_dataset, 'split_output', id[0]))
    #
    output_uids = md.get_uid_cols(3)
    metric_name = this_dataset.metric
    # NOTE(review): dataset_id, data_type_id, and is_resubmission are not
    # parameters of this function — presumably module-level globals; confirm
    job_header = "cnSR_{}_{}".format(dataset_id, data_type_id)
    sr_input_file = get_sr_file(this_dataset, "sr_input")
    worker_script = utils.get_path("subtotal_recalculation_worker",
                                   process="mi_dataset")
    # convert components to string to enable save in hdf file
    uniqid_map = calc_df[output_uids + ['uniqid', 'orig_cause']
                         ].copy().drop_duplicates()
    # partition the data into records that need subtotal recalculation
    # (written to the hdf input file for workers) and those that do not
    submitted_data, unsubmitted_data = cup.split_submission_data(
        calc_df, group_id_col='uniqid',
        submission_requirement=submission_req,
        hdf_file=sr_input_file,
        regenerate_hdf=False)
    if len(submitted_data) == 0:
        final_results = unsubmitted_data
    else:
        # launch one worker per uniqid and wait for their output files
        uid_list = submitted_data['uniqid'].unique().tolist()
        sr_jobs = cup.generate_prep_workers(
            worker_script,
            list_of_uids=uid_list,
            ds_instance=this_dataset,
            job_header=job_header,
            is_resubmission=is_resubmission)
        output_files = cup.get_results(
            sr_jobs, output_file_func,
            parent_process_name="sr",
            noisy_checker=True,
            add_resubmission_argument=is_resubmission,
            wait_time=5)
        # Re-combine compiled results with the set-aside data, before
        # collapsing and testing
        results = pe.read_files(output_files)
        results.rename(columns={'cause': 'orig_cause',
                                'codes_remaining': 'cause'},
                       inplace=True)
        results = md.stdz_col_formats(results,
                                      additional_float_stubs='uniqid')
        # outer merge so un-returned uids are retained (right_only rows)
        results = results.merge(uniqid_map, how='outer', indicator=True)
        assert results['_merge'].isin(["both", "right_only"]).all(), \
            "Error merging with uids"
        del results['_merge']
        # entries with blank "cause" could not be corrected. replace with the
        # original aggregate (will be handled by cause recalculation and rdp).
        results.loc[results['cause'].eq(""), 'cause'] = results['orig_cause']
        # drop causes that were zeroed in subtotal recalculation
        results['total'] = results.groupby(
            output_uids)[metric_name].transform(sum)
        results = results.loc[results['total'].ne(0) &
                              results[metric_name].notnull(), :]
        final_results = results.append(unsubmitted_data)
    # Re-combine with data that were not split
    final_results = dft.collapse(final_results,
                                 by_cols=output_uids,
                                 combine_cols=this_dataset.metric)
    return(final_results)
def apply_procdedure_proportions(df, proportions, acause, metric_name,
                                 faux_correct):
    ''' Multiplies estimates by procedure proportions, adding to the dataframe
        a set of estimates for the number of cancer events that do not receive
        the given procedure
        -- Note: As of 2018-07-10, incidence data are adjusted after modeling
           and are not processed through this function, although the ability
           to do so remains
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    print(" adjusting to avoid double-counting procedures for {}...".format(
        metric_name))
    # Return if adjustment is unnecessary (if there is no rate id for the cause)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    type_cols = nd.get_columns('{}{}'.format(decomp_str, metric_name))
    # merge keys: every uid except the phase tag
    mrg_cols = [c for c in uid_cols if c != 'me_tag']
    # Subset estimates to the phase wherein procedures occur
    if metric_name == 'prevalence':
        mrg_df = df.loc[df['me_tag'] == "controlled_phase", :].copy()
        del mrg_df['me_tag']
    elif metric_name == 'incidence':
        mrg_df = df.copy()
    # For data where sequela are a fraction of the number of procedures,
    # multiply the procedure proportion by those fractions
    if metric_name == 'prevalence' and bool(sequelae_fractions(acause)):
        # Generate dataframe to containing the fractions
        fracs = pd.DataFrame().from_dict(sequelae_fractions(acause),
                                         orient='index')
        fracs['acause'] = acause
        fracs = fracs[~fracs['me_tag'].eq("procedure_sequelae")]
        # Merge dataframe with proportions to expand
        proportions['acause'] = acause
        props = proportions.merge(fracs)
        # Adjust proportions by me
        props[draw_cols] = props[draw_cols].multiply(props['fraction'],
                                                     axis='index')
        del props['acause']
    else:
        # Determine fraction of population that does not recieve the procedure
        props = proportions.copy()
        props['me_tag'] = "adjusted_controlled_phase_a"
    # Apply proportions to estimates
    # Note: may drop some data if proportions are only for estimation years
    mrg_df = mrg_df.merge(props, on=mrg_cols, how='inner')
    adj_df = mrg_df[uid_cols]
    # element-wise product of estimate draws and proportion draws; uses raw
    # .values so the arrays are combined positionally, not by label
    evnt_wo_proc = pd.DataFrame(mrg_df[type_cols].values *
                                mrg_df[draw_cols].values).fillna(0)
    evnt_wo_proc.columns = type_cols
    adj_df[type_cols] = evnt_wo_proc
    assert not adj_df.isnull().any().any(
    ), "Error calculating procedure proportions"
    # For prevalence, append the adjusted data to the rest of the estimates
    if metric_name == 'prevalence':
        sq_df = dft.collapse(adj_df, mrg_cols,
                             combine_cols=type_cols).sort_values(mrg_cols)
        cntrl_df = df.loc[df['me_tag'].eq("controlled_phase"), :].merge(
            mrg_df[mrg_cols].drop_duplicates(), on=mrg_cols,
            how='inner').sort_values(mrg_cols)
        nosq_df = cntrl_df[mrg_cols]
        # NOTE(review): positional subtraction of .values relies on sq_df and
        # cntrl_df having identical row order/length after the sort_values
        # calls above — confirm this invariant holds for all inputs
        no_proc = pd.DataFrame(cntrl_df[type_cols].values -
                               sq_df[type_cols].values)
        no_proc.columns = type_cols
        nosq_df[type_cols] = no_proc
        nosq_df['me_tag'] = "adjusted_controlled_phase"
        adj_df = adj_df.append(nosq_df)
        output_data = df.append(adj_df)
    # Incidence of cancers with the procedure is estimated elsewhere, so there
    # is no need to preserve the unadjusted data
    else:
        output_data = adj_df
    return (output_data[uid_cols + type_cols])