def run_neonatal_split():
    ''' Splits the aggregate 0-4 mortality age group (cancer age == 2) into
        neonatal age groups, recombines the result with the non-split ages,
        and saves the combined estimates.

        Returns:
            DataFrame of split mortality estimates (also written to
            split_mortality_estimates.csv in mortality_model storage)
    '''
    print('running neonatal splitting...')
    # load input dataset
    this_dataset = pd.read_csv("{}/mortality_estimates.csv".format(
        utils.get_path("mortality_model", base_folder="storage")))
    # grabbing cancer age column temporarily
    this_dataset = gct.cancer_age_from_age_group_id(this_dataset)
    this_dataset.drop("age_group_id", axis=1, inplace=True)
    this_dataset.rename(columns={"dataset_id": "dataset_ids"}, inplace=True)
    old_df = this_dataset.copy()
    # age == 2 marks the aggregate 0-4 group that requires splitting
    old_df = old_df[~old_df['age'].isin([2])]
    this_dataset = this_dataset[this_dataset.age.isin([2])]
    # Split 0-4
    death_df = run_split_metric(
        this_dataset[get_uid_columns() + ['deaths', 'pop']], 'deaths')
    pop_df = run_split_pop(this_dataset[get_uid_columns() + ['pop']], 'pop')
    # combine non-split data to death and population dataframe
    df_metric_final = combine_ages(old_df, death_df, 'deaths')
    df_pop_final = combine_ages(old_df, pop_df, 'pop')
    del df_metric_final['pop']  # remove pop from split_metric dataframe
    # merge deaths with population on uids (age replaced by age_group_id
    # because the split re-expresses ages as GBD age_group_ids)
    merge_cols = get_uid_columns()
    merge_cols.remove('age')
    merge_cols = merge_cols + ['age_group_id']
    df_final = pd.merge(df_metric_final, df_pop_final, how='inner',
                        on=merge_cols)
    # save file
    df_final.to_csv("{}/split_mortality_estimates.csv".format(
        utils.get_path("mortality_model", base_folder="storage")))
    # moved before the return: this statement was previously placed after
    # `return` and therefore unreachable
    print("Neo Ages are Split.")
    return (df_final)
def compile_nonzero_floor(self, cmdf):
    ''' For GBD2019, new floor values were generated for cancer causes that
        had updated age restrictions, or was a new modeled cause. This
        function takes the original nonzero floor values, and appends all
        updated values

        Args:
            cmdf : cause metadata passed through to convert_nonzero_mad

        Returns:
            DataFrame of original + new-age-restriction + new-cause
            nonzero floor values
    '''
    work_dir = utils.get_path(process='cod_mortality',
                              key='nonzero_floor_workspace')
    orig_nzf = pd.read_csv(
        utils.get_path(process='cod_mortality', key='orig_nonzero_file'))
    # convert age_group_ids to comply with GBD's
    formatted_orig_nzf = self.convert_nonzero_mad(orig_nzf, cmdf)
    # load nonzero floor values with new age restrictions, and that were
    # new causes for this GBD cycle
    new_age_rstrct_df = pd.read_csv(
        '{}/nonzero_floor_new_age_restrictions.csv'.format(work_dir))
    new_causes_df = pd.read_csv(
        '{}/nonzero_new_causes.csv'.format(work_dir))
    # pd.concat replaces DataFrame.append (deprecated and removed in
    # recent pandas); row order is preserved
    comp_nzf = pd.concat(
        [formatted_orig_nzf, new_age_rstrct_df, new_causes_df],
        sort=False)
    return comp_nzf
def run_noise_reduction_create_metrics():
    ''' Noise-reduces datapoints using the previously generated prior,
        then launches the R script that computes the posterior.
    '''
    prior_path = utils.get_path(process='cod_mortality', key='NR_prior')
    prior = pd.read_csv(prior_path)
    # sample_size is not needed downstream of this step
    prior = prior.drop(columns='sample_size')
    refined = format_CoD_input.refine_by_cc_code(prior)
    refined.to_csv(prior_path)
    print('Creating noise reduction posterior...')
    posterior_script = utils.get_path(process='cod_mortality',
                                      key='noise_reduction_posterior')
    subprocess.call(['Rscript', posterior_script], shell=False)
def project_data():
    ''' Runs pipeline to combine previously-selected incidence data
        Requires incidence data that are unique by
        location_id-year-sex-age-acause
    '''
    src = utils.get_path("combined_incidence", process="cod_mortality")
    dst = utils.get_path("projected_incidence", process="cod_mortality")
    data = pd.read_csv(src)
    # project to special locations first, then to IHME location estimates
    data = project_to_special_locations(data)
    data = project_ihme_location_estimates(data)
    data.to_csv(dst, index=False)
    print("incidence data projected")
def run():
    ''' Runs pipeline to combine previously-selected incidence data
        Requires incidence data that are unique by
        location_id-year-sex-age-acause
    '''
    dst = utils.get_path("combined_incidence", process="cod_mortality")
    src = utils.get_path("prepped_incidence", process="cod_mortality")
    utils.ensure_dir(dst)
    prepped = pd.read_csv(src)
    print("Combining data to one single entry per uid...")
    combined = combine_incidence(prepped)
    combined.to_csv(dst, index=False)
    return (combined)
def run_noise_reduction_prior():
    ''' Calculates cc_code, saves the refined data, then launches the R
        script that builds the noise reduction prior.
    '''
    neonat_path = utils.get_path(process='cod_mortality',
                                 key='neonatal_split')
    refined = format_CoD_input.refine_by_cc_code(pd.read_csv(neonat_path))
    cc_path = utils.get_path(process='cod_mortality',
                             key='mortality_split_cc')
    refined.to_csv(cc_path)
    print('creating noise reduction prior...')
    prior_script = utils.get_path(process='cod_mortality',
                                  key='noise_reduction_prior')
    subprocess.call(['Rscript', prior_script], shell=False)
    return
def get_age_frmat_map(frmat_type):
    ''' Loads and standardizes the age-format map for the given format
        type.

        Args:
            frmat_type : "im_frmat_id" (infant format) or "frmat_id"
                (standard age format)

        Returns:
            standardized age-format map DataFrame

        Raises:
            ValueError for any other frmat_type. (Previously an unknown
            value fell through both branches, leaving `resource` unbound
            and raising an opaque NameError.)
    '''
    if frmat_type == "im_frmat_id":
        resource = pd.read_csv(utils.get_path(
            'im_frmat_map', process="mi_dataset"))
    elif frmat_type == "frmat_id":
        resource = pd.read_csv(utils.get_path(
            'frmat_map', process="mi_dataset"))
    else:
        raise ValueError("unrecognized frmat_type: {}".format(frmat_type))
    resource = md.stdz_col_formats(
        resource, additional_float_stubs=['age_specific', 'age_split'])
    return (resource)
def project_incidence():
    ''' For each IHME location_id, projects estimates based in the input
        cancer rates
        Includes generation of national estimates from subnational estimates
        where national estimates are not present
    '''
    print(" projecting data to ihme demographic specifications...")
    output_file = utils.get_path("projected_incidence",
                                 process="cod_mortality")
    input_file = utils.get_path("combined_incidence",
                                process="cod_mortality")
    # population uids exclude acause: population is cause-independent
    pop_uids = [c for c in get_uid_columns() if c != 'acause']
    df = pd.read_csv(input_file)
    # define subset that can be projected to the IHME population
    df = modeled_locations.add_subnational_status(df)
    df = supplement_national_estimates(df)
    # Ensure validity of sdi_quintile
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    # Calculate rate of input (cases per person of registry population)
    df.loc[:, 'rate'] = df['cases'] / df['pop']
    df['registry_pop'] = df['pop']
    # Mark data to be projected: only rows with sdi_quintile == 5 receive
    # IHME populations; all others keep homogenized registry populations
    project_to_ihme = (df['sdi_quintile'].eq(5))
    df_sdi5 = df.loc[project_to_ihme, :].copy()
    df_other = df.loc[~project_to_ihme, :].copy()
    # Add IHME population to applicable uids (replaces registry pop)
    del df_sdi5['pop']
    ihme_pop = load_ihme_pop(
        list(df.loc[project_to_ihme, 'location_id'].unique()))
    df_sdi5 = df_sdi5.merge(ihme_pop)
    # Homogenize population by group where not applying IHME populations
    df_other = staging_functions.homogenize_pop(df_other, uid_cols=pop_uids)
    output = df_other.append(df_sdi5)
    # reindex to allow multiplying series
    # create new column index, then set that as the new index
    output['index'] = np.arange(len(output))
    output = output.set_index('index')
    # Broadcast rates to the final population estimate for all locations;
    # rows with missing pop/rate or infinite rate keep their prior 'cases'
    output.loc[output['pop'].notnull() & output['rate'].notnull()
               & ~output['rate'].eq(np.inf),
               'cases'] = output['rate'] * output['pop']
    # Drop registry-specific tags
    output = output.drop([
        'national_registry', 'full_coverage', 'is_subnational',
        'registry_pop'
    ], axis=1, errors='ignore')
    assert not output.loc[output.duplicated(get_uid_columns()), :].any().any(), \
        "Duplicates exist after projection"
    assert not output['pop'].isnull().any(), "Missing population data"
    assert len(output) == len(df), "Error during estimate projection"
    output.to_csv(output_file, index=False)
    print(" data projected.")
    return (output)
def save_splits(modelable_entity_id, cnf_model_version_id):
    ''' Launch jobs to upload each of the "split" modelable entities,
        generated by splitting the parent modelable entity

        Args:
            modelable_entity_id : parent modelable_entity_id whose split
                children are to be uploaded
            cnf_model_version_id : run id of the cancer nonfatal model

        Returns:
            True when all child uploads validated; False otherwise
    '''
    def output_file_func(id):
        # per-child upload file used by wait_for_results to detect
        # completion; id arrives as a one-element sequence
        return (nd.nonfatalDataset("upload", id).get_output_file('upload'))
    parent_me = modelable_entity_id
    work_dir = get_work_dir(parent_me, cnf_model_version_id)
    this_step = nd.nonfatalDataset("split", parent_me)
    success_file = this_step.get_output_file('upload')
    # NOTE(review): skip_mes is returned but not used here — confirm it is
    # intentionally ignored for this step
    children_mes, skip_mes, me_tag = get_me_info(modelable_entity_id,
                                                 parent_me)
    measures = get_measures(me_tag)
    save_worker = utils.get_path('save_epi_worker', process='cancer_model')
    # Generate a list of arguments (one for each child me)
    description = "{}_run_{}".format(me_tag, cnf_model_version_id)
    save_args_template = ("--meid {meid} --meas_id {meas} --indir {input_dir}"
                          " --cnf_run_id {cnf_rid} --desc {desc}")
    save_arg_list = []
    for cm in children_mes:
        save_arg_list += [
            save_args_template.format(
                meid=cm,
                meas=" ".join([str(m) for m in measures]),
                desc=description,
                input_dir="{}/{}".format(work_dir, cm),
                cnf_rid=cnf_model_version_id)
        ]
    # Start jobs (one per child modelable entity)
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header="lvr_save_epi",
                                         memory_request=90,
                                         id_list=children_mes,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()
    # Check for results: poll up to 30 minutes for each job's output file
    job_description = str(modelable_entity_id) + " split upload"
    success_df = cluster_tools.wait_for_results(
        job_dict,
        jobs_description=job_description,
        noisy_checker=False,
        output_file_function=output_file_func,
        max_minutes=30)
    success = cluster_tools.validate_success(success_df, job_description)
    if success:
        # record success so the pipeline step is marked complete
        success_df.to_csv(success_file, index=False)
        return (True)
    else:
        print("Error during split")
        return (False)
def run(cod_mortality_version_id):
    ''' Runs drop_and_check, then prepares output for cod_mortality model
        pipeline

        Args:
            cod_mortality_version_id : id in the cod_mortality_version
                table used to look up the incidence version

        Returns:
            prepped incidence DataFrame (also written to the
            prepped_incidence path)
    '''
    drop_and_check = utils.get_path("drop_and_check_cod",
                                    process="cod_mortality")
    selected_incidence = utils.get_path("selected_incidence",
                                        process="cod_mortality")
    prepped_incidence = utils.get_path("prepped_incidence",
                                       process="cod_mortality")
    print("Selecting incidence data...")
    # NOTE(review): inc_id is only consumed by the commented-out
    # drop_and_check call below — if that step stays disabled, both this
    # lookup and `drop_and_check` are dead code
    inc_id = gv.get_inc_version(cod_mortality_version_id)
    #pydo.do_stata(drop_and_check, arg_list=[inc_id])
    print("Formatting incidence for pipeline...")
    df = pd.read_stata(selected_incidence)
    df = format_incidence_input(df)
    df = add_required_columns(df)
    df = update_populations(df)
    df = restrict(df)
    df.to_csv(prepped_incidence, index=False)
    return (df)
def generate_estimates(acause, location_id, cnf_model_version_id,
                       faux_correct, is_resubmission=False):
    ''' Runs a subprocess that passes all arguments to the R script that
        calculates incidence draws

        Args:
            acause : cancer cause to estimate
            location_id : location to estimate
            cnf_model_version_id : run id of the cancer nonfatal model
            faux_correct : forwarded to the R script
            is_resubmission : forwarded to the R script

        Returns True unconditionally (the subprocess exit status is not
        checked, matching previous behavior).
    '''
    print('Beginning incidence estimation...')
    r_script = utils.get_path("calculate_incidence", process="nonfatal_model")
    # Pass an argv list with shell=False instead of a formatted string
    # with shell=True, so argument values can never be interpreted by the
    # shell; str() renders each value exactly as str.format did
    cmd = [
        "bash",
        utils.get_path("r_shell"),
        r_script,
        str(acause),
        str(location_id),
        str(cnf_model_version_id),
        str(faux_correct),
        str(is_resubmission),
    ]
    subprocess.call(cmd, shell=False)
    return (True)
def generate_code_index(input_codes):
    ''' Returns an index of all possible ICD10 codes with attached number
        indicating order of appearance and tag for viability. "Viable" tag
        indicates either an official code or an unofficial code that exists
        in the data

        Args:
            input_codes : iterable of codes present in the data

        Returns:
            dict mapping code string -> {'order': int, 'viable': bool},
            covering C00-D99 with one- and two-digit decimal subcodes
    '''
    if not isinstance(input_codes, tuple):
        input_codes = tuple(input_codes)
    # Import list of ICD10 codes and define code index
    code_list_path = (
        utils.get_path('mi_input') + "/_resources/" +
        "subtotal_recalculation/list_of_official_NUMERIC_ICD10_codes.csv")
    ICD10_code_list = pd.read_csv(code_list_path)
    # use a set for O(1) viability checks; the original tested membership
    # against a tuple (O(n)) inside a ~22,000-iteration loop
    viable_codes = set(ICD10_code_list['ICD10_code']).union(input_codes)
    ICD10_code_index = {}
    order_num = 1
    # two-digit strings for 0-9 so 'C05' style codes are generated
    under_10_alternate = ['0{}'.format(i) for i in range(10)]
    for k in ['C', 'D']:
        for o in under_10_alternate + list(range(10, 100)):
            kode = '{}{}'.format(k, o)
            ICD10_code_index[kode] = {
                'order': order_num, 'viable': kode in viable_codes}
            order_num += 1
            for d in range(0, 10):
                kode = '{}{}.{}'.format(k, o, d)
                ICD10_code_index[kode] = {
                    'order': order_num, 'viable': kode in viable_codes}
                order_num += 1
                for e in range(0, 10):
                    kode = '{}{}.{}{}'.format(k, o, d, e)
                    ICD10_code_index[kode] = {
                        'order': order_num, 'viable': kode in viable_codes}
                    order_num += 1
    return (ICD10_code_index)
def load_package_set(df):
    ''' Returns the package_set_id linked to the data's coding system.
    '''
    code_version = df.coding_system.unique()[0]
    resources = utils.get_path('mi_dataset_resources', process="mi_dataset")
    set_path = '{}/redistribution/packagesets_{}.dta'.format(resources,
                                                             code_version)
    package = pd.read_stata(set_path)
    # exactly one label row is expected per coding system
    assert len(package) == 1, \
        "Incorrect number of source labels in "\
        "packagesets_{}. Expected 1, got {}. Redistribution failed."\
        .format(code_version, len(package))
    return (package.package_set_id.unique()[0])
def run(input_data, PACKAGE_MAP, TEMP_FOLDER):
    ''' Redistributes the 'freq' of garbage-coded entries in input_data
        onto valid causes using the redistribution packages for
        PACKAGE_MAP.

        Returns the input unchanged when the data sum to zero or no
        packages apply; otherwise returns the redistributed data.
    '''
    if int(input_data['freq'].sum()) == 0:
        print("Data sums to zero.")
        return (input_data)
    # temporarily rename so the cause matches package cause names
    input_data.loc[input_data['cause'] == "neo_other_cancer",
                   'cause'] = "neo_other"
    output_cols = [
        'registry_index', 'year_start', 'year_end', 'year_id', 'sex',
        'coding_system', 'split_group', 'age', 'cause', 'freq'
    ]
    proportion_uids = [
        'dev_status', 'super_region', 'region', 'subnational_level1',
        'subnational_level2', 'country', 'location_id', 'registry_index',
        'year_start', 'year_end', 'year_id', 'sex', 'coding_system',
        'split_group', 'age'
    ]
    residual_cause = 'ZZZ'
    resources_dir = utils.get_path("mi_dataset_resources",
                                   process="mi_dataset")
    package_folder = '{}/redistribution/{}'.format(resources_dir,
                                                   PACKAGE_MAP)
    cause_map_file = package_folder + '/_package_map.csv'
    cause_map = pd.read_csv(cause_map_file)
    packages = get_packages(input_data, package_folder, cause_map)
    if len(packages) == 0:
        print("No packages available with which to redistribute this data.")
        return (input_data)
    prepped_data, proportion_metadata = prep_data(
        input_data, proportion_uids, residual_cause=residual_cause)
    evaluated_cause_map = evaluate_cause_restrictions(
        cause_map, proportion_metadata, proportion_uids)
    result, diagnostics = apply_packages(
        prepped_data, proportion_metadata, evaluated_cause_map, packages,
        residual_cause=residual_cause)
    output_data = result.merge(proportion_metadata, on='proportion_id')
    # undo the temporary rename
    output_data.loc[output_data['cause'] == "neo_other",
                    'cause'] = "neo_other_cancer"
    # .loc replaces the .ix accessor (deprecated, removed in pandas 1.0)
    output_data = output_data.loc[:, output_cols]
    # direct assignment replaces fillna(inplace=True) on a .loc slice,
    # which is not guaranteed to modify the underlying frame
    output_data['freq'] = output_data['freq'].fillna(value=0)
    # total metric must be conserved through redistribution
    diff = output_data['freq'].sum() - input_data['freq'].sum()
    assert abs(diff) < 1, "Difference from input after rdp is too large"
    return (output_data)
def run():
    ''' Finalizes data for CoD prep, then runs CoD prep's format code
    '''
    finalized_file = utils.get_path("formatted_CoD_data",
                                    process="cod_mortality")
    CoD_format_script = utils.get_path("finalize_CoD_input",
                                       process="cod_mortality")
    data = pd.read_csv(
        utils.get_path("nonzero_floor", process="cod_mortality"))
    # data will not be collapsed after this point, so require a single
    # entry per uid at input
    assert not data[data.duplicated(get_uid_cols())].any().any(), \
        "Duplicate values present at input"
    data = data.drop(columns='dataset_id')
    data = data.rename(columns={'dataset_ids': 'dataset_id'})
    for step in (add_subdiv, add_CoD_variables, format_CoD_variables,
                 test_output):
        data = step(data)
    data.to_csv(finalized_file, index=False)
    return (data)
def get_work_dir(parent_meid, cnf_model_version_id):
    ''' Returns the work directory for the current split
        -- Inputs
            parent_meid : modelable_entity_id of the parent
                modelable_entity that is being split
            cnf_model_version_id : the run_id of the cancer nonfatal
                model that is to be split
    '''
    split_root = utils.get_path('epi_splits', process='cancer_model')
    return ("{root}/cnf_run_{model}/{parent}".format(
        root=split_root,
        model=cnf_model_version_id,
        parent=parent_meid))
def run(cod_mortality_version_id):
    ''' Merges incidence with MIR estimates to generate mortality
        estimate output

        Arguments:
            cod_mortality_version_id : int
                - id in Cancer's cod_mortality_version database table;
                  selects the correct mir_model_version and
                  staged_incidence_version
    '''
    out_path = utils.get_path("mortality_estimates",
                              process='cod_mortality')
    estimates = pd.read_csv(
        utils.get_path("projected_incidence", process="cod_mortality"))
    # TODO: temporary location exceptions — this belongs in a better spot
    excluded_locations = [332, 338, 339, 363, 370, 387]
    estimates = estimates.loc[
        ~estimates['location_id'].isin(excluded_locations), ]
    estimates = calculate_mortality(estimates, cod_mortality_version_id)
    estimates = apply_recode(estimates)
    # Validate and save
    estimates.to_csv(out_path, index=False)
    print(" deaths calculated and recoded.")
    return (estimates)
def load_lambda_file(cnf_model_version_id):
    ''' Resolves the lambda-values file path for the lambda version
        attached to the given cnf run, substituting that version's
        datestamp for the "<date>" placeholder in the path template.
    '''
    path_template = utils.get_path("lambda_values",
                                   process="nonfatal_model")
    run_record = nd.get_run_record(cnf_model_version_id)
    lambda_version = run_record.at[0, 'cnf_lambda_version_id']
    version_entry = cdb.db_api().get_entry(
        table_name='cnf_lambda_version',
        uniq_col='cnf_lambda_version_id',
        val=lambda_version)
    datestamp = str(version_entry.at[0, 'date_generated'])
    return (path_template.replace("<date>", datestamp))
def load_surv_folder(cnf_model_version_id):
    ''' Using the rel_survival_version_id attached to the given cnf run,
        returns the relative-survival folder path with that version's
        datestamp substituted for the "<date>" placeholder.
        (Docstring previously copy-pasted from load_lambda_file.)
    '''
    # single assignment (was a redundant `surv_folder = surv_folder = ...`)
    surv_folder = utils.get_path("relative_survival",
                                 process="nonfatal_model")
    record = nd.get_run_record(cnf_model_version_id)
    rs_version = record.at[0, 'rel_survival_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='rel_survival_version',
                                     uniq_col='rel_survival_version_id',
                                     val=rs_version)
    suffix = str(this_version.at[0, 'date_generated'])
    rs_folder = surv_folder.replace("<date>", suffix)
    return (rs_folder)
def get_current_mi_results(cod_mortality_version_id):
    ''' Returns the current/best compiled MI results for the given
        cod_mortality_version_id
    '''
    db_link = cdb.db_api()
    mor_config = db_link.get_table('cod_mortality_version')
    # get mir_model_version_id for the requested version.
    # The previous code indexed the filtered series with the label
    # `len(mor_config) - 1`, which only succeeds when the matching row
    # happens to be the last row of the table; select the last matching
    # entry positionally instead.
    matches = mor_config.loc[
        mor_config['cod_mortality_version_id'] == cod_mortality_version_id,
        'mir_model_version_id']
    mir_id = matches.iloc[-1]
    print('using mir model version id...{}'.format(mir_id))
    mi_path = utils.get_path(process='mir_model', key='compiled_mir_results')
    # replace suffix with model_version_id
    mi_path = mi_path.replace('<mir_model_version_id>', str(mir_id))
    compiled_mi_df = pd.read_csv(mi_path)
    return compiled_mi_df
def run_nonzero_floor():
    ''' Enforces cause-, age-, sex-, and year-specific minimums on
        non-zero cause fractions. It allows cause fractions to be 0, but
        not to be arbitrarily small.
    '''
    print('running nonzero floor regression...')
    db_link = cdb.db_api()
    nr_df = pd.read_csv(utils.get_path(process='cod_mortality',
                                       key='noise_reduced'))
    cause_df = db_link.get_table('cod_model_entity')
    cause_df = cause_df.loc[cause_df['is_active'].eq(1),
                            ['cause_id', 'acause']]
    cause_metadata = nr_df.merge(cause_df, how='left', on='acause')
    cause_hierarchy = get_cause_hierarchy()
    nonzero_floorer = ra.NonZeroFloorer(cause_metadata)
    df = nonzero_floorer.get_computed_dataframe(get_pop(), get_env(),
                                                cause_hierarchy)
    # write to the pipeline's nonzero_floor output path — the original
    # contained a literal "<FILE PATH>" placeholder (a syntax error);
    # this key matches the file the CoD finalization step reads as input
    df.to_csv(utils.get_path("nonzero_floor", process="cod_mortality"),
              index=False)
    return
def manage_split(source_cid, target_cids, proportion_meids, work_dir,
                 description):
    ''' Manages the split of the source_cid followed by saving of the
        targets, then returns boolean indication of success

        Args:
            source_cid : cause_id of the parent model being split
            target_cids : cause_ids receiving the split results
            proportion_meids : modelable_entity_ids holding split
                proportions (parallel to target_cids)
            work_dir : output directory for split results
            description : label used for job names and upload description
    '''
    utils.ensure_dir(work_dir)
    # split model (result dataframe is not needed here; the split writes
    # its outputs to work_dir)
    d_step = utils.get_gbd_parameter('current_decomp_step')
    split_cod_model(source_cause_id=source_cid,
                    target_cause_ids=target_cids,
                    target_meids=proportion_meids,
                    output_dir=work_dir,
                    decomp_step=d_step)
    # was print(print(...)), which printed the message followed by "None"
    print("Split data saved to " + work_dir + " at " +
          utils.display_timestamp())
    # Generate a list of arguments (one for each child me)
    save_args_template = "--target {targ} --desc {desc} --indir {dir}"
    save_arg_list = []
    for t in target_cids:
        save_arg_list += [save_args_template.format(targ=t,
                                                    desc=description,
                                                    dir=work_dir)]
    # Start jobs (removed an unused `header` local that was never passed on)
    save_worker = utils.get_path("save_cod_worker", process="cancer_model")
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header=description,
                                         memory_request=50,
                                         id_list=target_cids,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()
    # Check for results
    job_descrip = description + " upload"
    success_df = cluster_tools.wait_for_results(job_dict,
                                                jobs_description=job_descrip,
                                                noisy_checker=False,
                                                max_minutes=30)
    success = cluster_tools.validate_success(success_df, description)
    return (success)
def update_repness(df):
    ''' Applies superseding representative status from the override file.
        Locations without an override keep their existing value; missing
        values default to 0.
    '''
    override_path = utils.get_path("representativeness_override",
                                   process="staging")
    overrides = pd.read_csv(override_path)[
        ['country_id', 'grouping', 'representative']]
    overrides = overrides.rename(columns={'representative': 'update_rep'})
    df = modeled_locations.add_country_id(df)
    # a location is 'national' when it is its own country
    df.loc[df['location_id'] == df['country_id'], 'grouping'] = 'national'
    df.loc[df['grouping'] != 'national', 'grouping'] = 'subnational'
    df = df.merge(overrides, how='left')
    has_override = df['update_rep'].notnull()
    df.loc[has_override, 'representative'] = df['update_rep']
    df.loc[df['representative'].isnull(), 'representative'] = 0
    return (df.drop(['update_rep', 'grouping'], axis=1))
def prep_data(uid_cols, data_version_id, db):
    ''' Prepares model input: sample size (total deaths by sex, year,
        location and age plus cc_code for each acause) and cause
        fractions (deaths / sample_size).
    '''
    print('prepping input...')
    storage_dir = utils.get_path("mortality_model", base_folder="storage")
    df = pd.read_csv("{}/final_output.csv".format(storage_dir))
    df.loc[df['cf_raw'].isnull(), 'cf_raw'] = 0
    # temporarily zero-fill these columns until post-age-sex and post-rdp
    # mortality values are available
    for cf_col in ('cf_rd', 'cf_corr'):
        df[cf_col] = 0
    # drop cc_code entries
    df = df.loc[df['acause'].ne('cc_code'), ]
    return (add_required_columns(df, uid_cols, data_version_id, db))
def split_liver():
    ''' Submits the liver-cancer-specific information to the split manager
    '''
    # set source and targets
    source_cid = 417  # parent cause_id
    target_cids = [996, 418, 419, 420, 421]  # cause_ids
    proportion_meids = [18763, 2470, 2471, 2472, 2473]  # proportion me_ids
    # (removed an unused `years` list that was computed but never used)
    description = "lvr_cncr_split"
    liver_model_path = utils.get_path(
        'cod_splits', process='cancer_model', base_folder='workspace')
    # timestamped work directory keeps each run's outputs separate
    work_dir = "{}/{}".format(liver_model_path, utils.display_timestamp())
    # Run split
    success = manage_split(source_cid, target_cids, proportion_meids,
                           work_dir, description)
    if success:
        print("All CoD liver splits uploaded. " + utils.display_timestamp())
    else:
        print("Error during CoD splits for liver cancer")
def load_sdi_map():
    ''' Loads a map of sdi quintiles by location_id
    '''
    # NOTE(review): GBD convention labels quintiles Low=1 ... High=5, but
    # this map assigns 'High SDI' -> 4 and 'High-middle SDI' -> 5.
    # Downstream code selects sdi_quintile == 5 as the group receiving
    # IHME populations — confirm whether this inversion is intentional.
    sdi_map = {
        'Low SDI': 1,
        'Low-middle SDI': 2,
        'Middle SDI': 3,
        'High SDI': 4,
        'High-middle SDI': 5
    }
    sdi_data = pd.read_csv(utils.get_path("sdi_quintiles"))
    sdi_data.rename(columns={'sdi_quintile': 'sdi_quintile_name'},
                    inplace=True)
    # drop locations with no quintile assignment
    sdi_data = sdi_data.loc[sdi_data['sdi_quintile_name'].notnull(), :]
    sdi_data['sdi_quintile'] = sdi_data['sdi_quintile_name'].apply(
        lambda x: sdi_map[x])
    sdi_data = modeled_locations.add_country_id(sdi_data)
    # hard-coded per-country overrides (country_ids 62 and 90) —
    # presumably deliberate exceptions; verify against the source of
    # these assignments
    sdi_data.loc[sdi_data['country_id'].eq(62), 'sdi_quintile'] = 4
    sdi_data.loc[sdi_data['country_id'].eq(90), 'sdi_quintile'] = 5
    return (sdi_data[['location_id', 'sdi_quintile']])
def add_location_hierarchy_info(df):
    ''' Returns the dataframe (df) with added location information: region,
        super_region, subnational_status, etc. Stops RDP if there is a
        problem with the location information
    '''
    print(" Adding location information.")
    input_len = len(df)
    # Reformat/convert variables and ages
    loc_info_dir = utils.get_path('mi_dataset_resources',
                                  process="mi_dataset")
    loc_info_path = loc_info_dir + '/redistribution/location_hierarchy.dta'
    location_hierarchy = pd.read_stata(loc_info_path)
    location_hierarchy = location_hierarchy[[
        'location_id', 'dev_status', 'super_region', 'region', 'country',
        'subnational_level1', 'subnational_level2'
    ]]
    df = df.merge(location_hierarchy, how='left', on='location_id')
    # A left merge can never null its own key, so the previous check on
    # location_id could not detect unmapped ids; a null hierarchy column
    # ('country') marks rows whose location_id found no match (it also
    # catches rows whose location_id was null before the merge)
    assert not df['country'].isnull().any(), \
        "Cannot redistribute. Unmapped location ids present."
    assert len(
        df) == input_len, "ERROR: data lost while adding location metadata"
    return (df)
def _get_upload_path(table):
    """Get the path to write the file to.

    The `table` argument is accepted for interface compatibility but is
    not used in the lookup.
    """
    return utils.get_path(key='cod_upload', process='cod_mortality')
def submit_sr(calc_df, this_dataset):
    ''' Splits data based on subtotal-recalculation requirement and submits
        jobs as needed to recalculate subtotals. Then returns a re-combined
        dataset with subtotals recalculated

        NOTE(review): dataset_id, data_type_id and is_resubmission are
        neither parameters nor locals here — they must exist at module
        scope when this runs; confirm against the calling script.
    '''
    def submission_req(df, uid):
        ''' Returns boolean indicating whether data are to be submitted,
            qualified by whether subtotals are present and whether any
            component codes exist that could enable recalculation
        '''
        uid_test = df[df['uniqid'].eq(uid)]
        meets_requirement = bool(
            has_subtotals(uid_test, 'orig_cause')
            and components_present(uid_test)
        )
        return (meets_requirement)

    def output_file_func(id):
        ''' Function fed to get_results relative to the
        '''
        return (get_sr_file(this_dataset, 'split_output', id[0]))

    # uid columns identifying a unique output row
    output_uids = md.get_uid_cols(3)
    metric_name = this_dataset.metric
    job_header = "cnSR_{}_{}".format(dataset_id, data_type_id)
    sr_input_file = get_sr_file(this_dataset, "sr_input")
    worker_script = utils.get_path("subtotal_recalculation_worker",
                                   process="mi_dataset")
    # convert components to string to enable save in hdf file
    uniqid_map = calc_df[output_uids + ['uniqid', 'orig_cause']
                         ].copy().drop_duplicates()
    submitted_data, unsubmitted_data = cup.split_submission_data(
        calc_df, group_id_col='uniqid',
        submission_requirement=submission_req,
        hdf_file=sr_input_file,
        regenerate_hdf=False)
    if len(submitted_data) == 0:
        final_results = unsubmitted_data
    else:
        uid_list = submitted_data['uniqid'].unique().tolist()
        sr_jobs = cup.generate_prep_workers(
            worker_script,
            list_of_uids=uid_list,
            ds_instance=this_dataset,
            job_header=job_header,
            is_resubmission=is_resubmission)
        output_files = cup.get_results(
            sr_jobs, output_file_func,
            parent_process_name="sr",
            noisy_checker=True,
            add_resubmission_argument=is_resubmission,
            wait_time=5)
        # Re-combine compiled results with the set-aside data, before
        # collapsing and testing
        results = pe.read_files(output_files)
        # worker output keys: 'cause' holds the original aggregate and
        # 'codes_remaining' holds the recalculated code set
        results.rename(columns={'cause': 'orig_cause',
                                'codes_remaining': 'cause'},
                       inplace=True)
        results = md.stdz_col_formats(results,
                                      additional_float_stubs='uniqid')
        results = results.merge(uniqid_map, how='outer', indicator=True)
        assert results['_merge'].isin(["both", "right_only"]).all(), \
            "Error merging with uids"
        del results['_merge']
        # entries with blank "cause" could not be corrected. replace with
        # the original aggregate (will be handled by cause recalculation
        # and rdp).
        results.loc[results['cause'].eq(""), 'cause'] = \
            results['orig_cause']
        # drop causes that were zeroed in subtotal recalculation
        results['total'] = results.groupby(
            output_uids)[metric_name].transform(sum)
        results = results.loc[results['total'].ne(0) &
                              results[metric_name].notnull(), :]
        final_results = results.append(unsubmitted_data)
    # Re-combine with data that were not split
    final_results = dft.collapse(final_results,
                                 by_cols=output_uids,
                                 combine_cols=this_dataset.metric)
    return (final_results)
def run(dataset_id, data_type_id, uid):
    ''' Preps data for recalculation then recalculates as necessary

        Args:
            dataset_id, data_type_id : identify the MI dataset
            uid : the uniqid group within the dataset to process

        Returns None; writes the recalculated data for this uid to the
        dataset's split_output file.
    '''
    this_dataset = md.MI_Dataset(dataset_id, 2, data_type_id)
    dataset_name = this_dataset.name
    metric = this_dataset.metric
    input_file = run_sr.get_sr_file(this_dataset, "sr_input")
    # Exit if output already exists
    output_file = run_sr.get_sr_file(this_dataset, 'split_output', uid)
    print(output_file)
    if os.path.exists(output_file):
        print(" output file found for uid " + str(uid))
        return (None)
    # negative_data_ok = is_exception(dataset_id, data_type_id)
    error_folder = utils.get_path("mi_input", base_folder='j_temp')
    subcause_issue_file = '{}/subcause_issue/{}_{}_uid_{}.txt'.format(
        error_folder, dataset_name, data_type_id, uid)
    exception_file = '{}/negative_data/{}_{}_uid_{}.csv'.format(
        error_folder, dataset_name, data_type_id, uid)
    for d in [subcause_issue_file, exception_file, error_folder]:
        utils.ensure_dir(d)
    # print(" removing subtotals from uid {}...".format(uid))
    # add data for the given uid
    df = pd.read_hdf(input_file, 'split_{}'.format(uid))
    # Create a list of possible codes so that decimal subcauses are only
    # added if available
    input_cause_list = sorted(df['orig_cause'].unique().tolist())
    # create a dictionary for codes in the selected uid and attach the
    # uid's data
    uid_subset = {}
    input_data = {}
    # process decimals first and ranges last to ensure that nested causes
    # are removed
    for c in sorted(df['orig_cause'].unique().tolist()):
        uid_subset[c] = {}
        input_data[c] = {}
        uid_subset[c]['codes'] = []
        uid_subset[c]['subcodes'] = []
        # single codes (no range "-" or list ",") may carry decimal
        # subcodes; ranges/lists enumerate their member codes instead
        if "-" not in c and "," not in c:
            uid_subset[c]['codes'].append(c)
            # add subcodes to 'subcode' key
            # NOTE(review): the next statement's result is discarded —
            # it looks like leftover code; the loop below recomputes the
            # same values via df['cause'].where(...)
            df.loc[df['orig_cause'].eq(c), 'cause'].dropna().unique().tolist()
            for subcode in sorted(df['cause'].where(
                    df['orig_cause'] == c).dropna().unique().tolist()):
                if subcode != c:
                    uid_subset[c]['subcodes'].append(subcode)
            # if none of the subcodes appear in the list, set the cause
            # as a subcode of itself (prevents the addition of unused
            # decimal causes)
            if not len(uid_subset[c]['subcodes']):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
            elif (not any('{}.'.format(sub[:3]) in check
                          for check in input_cause_list
                          for sub in uid_subset[c]['subcodes'])):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
        else:
            for code in sorted(df['cause'].where(
                    df['orig_cause'].eq(c)).dropna().unique().tolist()):
                uid_subset[c]['codes'].append(code)
                uid_subset[c]['subcodes'].append(code)
        # create other lists associated with the cause and add the metric
        # data
        uid_subset[c]['subcauses_remaining'] = []
        uid_subset[c]['codes_removed'] = []
        uid_subset[c]['causes_removed'] = []
        uid_subset[c]['data'] = df.loc[df['cause'].eq(c),
                                       ['age', metric]].set_index('age')
        input_data[c]['data'] = uid_subset[c]['data']
        input_data[c]['codes'] = uid_subset[c]['codes']
    # Determine subcauses and highest number of causes remaining (how many
    # subcauses are contained within each cause)
    uid_set = set_subcauses(uid_subset, subcause_issue_file)
    highest_level = determine_highest_level(uid_set)
    # remove lowest level codes from parent causes
    if highest_level == 0:
        print(' no subcauses present.')
    else:
        # iterate until a pass removes nothing more
        subcauses_removed = True
        while subcauses_removed:
            uid_set, subcauses_removed = remove_subcauses(
                uid_set, uid, exception_file)
            # remove duplicates
            uid_set = remove_duplicates(uid_set)
            # re-set subcauses and num_subcause_remaining
            uid_set, highest_level = set_subcauses(
                uid_set,
                subcause_issue_file,
            )
        print(" subcauses removed.")
    # Prepare Output
    print("saving output...")
    output = pd.DataFrame(
        columns=['cause', 'codes_remaining', 'codes_removed', 'age',
                 metric])
    for c in uid_set:
        # format cause information
        cause_data = pd.DataFrame(
            columns=['cause', 'codes_remaining', 'codes_removed'])
        cause_data.loc[0, ['cause']] = c
        # if nothing was removed, or there was only a single cause, or all
        # of the input codes are still present, set the codes remaining as
        # the cause
        if (not len(uid_set[c]['codes_removed'])
                or ("-" not in c and "," not in c)
                or set(input_data[c]['codes']) <= set(uid_set[c]['codes'])):
            cause_data.loc[0, ['codes_remaining']] = c
        else:
            cause_data.loc[0, ['codes_remaining']] = ','.join(
                convert_to_range(uid_set[c]['codes']))
        cause_data.loc[0, ['codes_removed']] = ','.join(
            convert_to_range(uid_set[c]['codes_removed']))
        # format output data
        output_data = uid_set[c]['data']
        output_data['age'] = output_data.index
        output_data['cause'] = c
        orig_data = input_data[c]['data']
        orig_data['age'] = orig_data.index
        orig_data = orig_data.rename(columns={metric: 'orig_metric_value'})
        orig_data['cause'] = c
        # combine and add to output
        final = pd.merge(output_data, cause_data, on='cause')
        final = pd.merge(final, orig_data, on=['cause', 'age'])
        output = output.append(final)
    # Create output dataset
    output['uniqid'] = uid
    # Update encoding (bug fix to work around pandas to_stata issue)
    output = md.stdz_col_formats(output, additional_float_stubs='uniqid')
    # Export results
    output.to_csv(output_file, index=False)
    print('\n Done!')
    time.sleep(1)
    return (None)