def prep_cause_to_weight_cause_map(cause_set_id, gbd_round_id, weight_causes,
                                   level_of_analysis='cause_id'):
    """Get the right distribution to use based on those available.

    Defaults to the most detailed parent cause of each cause_id in the
    given hierarchy that is in the weight causes list, unless specific
    exceptions are coded.
    """
    if level_of_analysis == 'cause_id':
        weight_cause_map = prep_child_to_available_parent_map(
            cause_set_id, gbd_round_id, weight_causes)
        weight_cause_map = weight_cause_map.rename(
            columns={'parent_cause_id': 'weight_cause_id'})
        causes = db_queries.get_cause_metadata(cause_set_id=cause_set_id,
                                               gbd_round_id=gbd_round_id)
        acauses = causes[['cause_id', 'acause']].set_index(
            'cause_id').to_dict()['acause']
        paths = causes[['cause_id', 'path_to_top_parent']].set_index(
            'cause_id').to_dict()['path_to_top_parent']
        weight_cause_map['path_to_top_parent'] = \
            weight_cause_map['cause_id'].map(paths)
        # hard-coded exceptions to the "most detailed available parent" rule
        weight_cause_map.loc[weight_cause_map['cause_id'] == 843,
                             'weight_cause_id'] = 344
        weight_cause_map.loc[weight_cause_map['cause_id'].isin([743, 919]),
                             'weight_cause_id'] = 294
        weight_cause_map.loc[
            weight_cause_map['path_to_top_parent'].str.contains(',366,'),
            'weight_cause_id'] = 366
        weight_cause_map.loc[
            weight_cause_map['cause_id'].isin([855, 854, 851]),
            'weight_cause_id'] = 730
        weight_cause_map.loc[weight_cause_map['cause_id'] == 940,
                             'weight_cause_id'] = 716
        weight_cause_map['acause'] = weight_cause_map['cause_id'].map(acauses)
        weight_cause_map['weight_acause'] = \
            weight_cause_map['weight_cause_id'].map(acauses)
    else:
        weight_cause_map = pd.DataFrame({
            level_of_analysis: weight_causes,
            'weight_{}'.format(level_of_analysis): weight_causes
        })
    return weight_cause_map[[
        level_of_analysis, 'weight_{}'.format(level_of_analysis)
    ]]

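# A minimal usage sketch (hedged: the `weights_df` name and the id values
# below are illustrative, not from this module). The returned two-column
# map can be merged onto data to attach the cause whose weights should be
# used for splitting:
#
# weight_causes = weights_df['cause_id'].unique().tolist()
# cause_map = prep_cause_to_weight_cause_map(
#     cause_set_id=4, gbd_round_id=6, weight_causes=weight_causes)
# df = df.merge(cause_map, how='left', on='cause_id')
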
def split_out_by_cause_type(df):
    """Split shock deaths into war, CODEm, and non-CODEm cause groups."""
    causes = get_cause_metadata(cause_set_id=4)
    original_shape = df.shape[0]
    original_deaths = df['best'].sum()
    # war/conflict causes and their children
    war_causes = [945]
    war_shock_causes = list(
        causes[causes['parent_id'].isin(war_causes)]['cause_id'])
    war_df = df[df['cause_id'].isin(war_shock_causes + war_causes)]
    # causes modeled in CODEm, plus hand-picked detailed causes
    codem_causes_no_detail = [302, 345, 408, 703]
    codem_shock_causes = list(
        causes[causes['parent_id'].isin(codem_causes_no_detail)]['cause_id'])
    codem_shock_causes += [
        335, 357, 387, 695, 699, 703, 707, 711, 727, 842, 854, 724, 689,
        341, 693
    ]
    codem_df = df[df['cause_id'].isin(codem_shock_causes +
                                      codem_causes_no_detail)]
    # causes not modeled in CODEm
    non_codem_causes = [729]
    non_codem_shock_causes = list(
        causes[causes['parent_id'].isin(non_codem_causes)]['cause_id'])
    non_codem_df = df[df['cause_id'].isin(non_codem_shock_causes +
                                          non_codem_causes)]
    # no rows or deaths should be lost or duplicated by the split
    assert original_shape == (war_df.shape[0] + codem_df.shape[0] +
                              non_codem_df.shape[0])
    assert np.isclose(
        original_deaths,
        (war_df.best.sum() + codem_df.best.sum() + non_codem_df.best.sum()))
    return war_df, codem_df, non_codem_df

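# Hedged usage sketch: assumes a shocks dataframe (called `shocks_df` here
# for illustration) with 'cause_id' and 'best' columns; the three returned
# frames partition the input rows, which the asserts above guarantee.
#
# war_df, codem_df, non_codem_df = split_out_by_cause_type(shocks_df)
# assert len(shocks_df) == len(war_df) + len(codem_df) + len(non_codem_df)
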
def select_maternal_data(df):
    """Filter out non-maternal data.

    Meant to be run at the start of this process. If we are only
    interested in adjusting the maternal denominator, then we don't need
    non-maternal data.

    Parameters:
        df: Pandas DataFrame
            Must have 'bundle_id' as a column.
    """
    assert "bundle_id" in df.columns, "'bundle_id' must be a column."
    # get causes; round 5 was updated to keep bundles 79 and 646
    causes_5 = get_cause_metadata(cause_set_id=9, gbd_round_id=5)
    causes_4 = get_cause_metadata(cause_set_id=9, gbd_round_id=4)
    causes = pd.concat([causes_4, causes_5])
    causes.drop_duplicates(inplace=True)
    # create conditional mask that selects maternal causes
    condition = causes.path_to_top_parent.str.contains("366")
    # subset just the causes that meet the condition
    maternal_causes = causes[condition]
    # make list of maternal causes
    maternal_list = list(maternal_causes['cause_id'].unique())
    # get bundle to cause map
    bundle_cause = query("QUERY", conn_def=DATABASE)
    # merge cause_id onto data
    df = df.merge(bundle_cause, how='left', on='bundle_id')
    # keep only maternal causes in df
    df = df[df['cause_id'].isin(maternal_list)]
    # drop cause_id
    df.drop('cause_id', axis=1, inplace=True)
    # drop the denominator bundle
    df = df[df['bundle_id'] != 1010]
    return df

def _get_hierarchy(self) -> pd.DataFrame:
    hierarchy_cols = [
        constants.Columns.CAUSE_ID, constants.Columns.ACAUSE,
        constants.Columns.LEVEL, constants.Columns.PARENT_ID,
        constants.Columns.SORT_ORDER, constants.Columns.MOST_DETAILED,
        constants.Columns.IS_ESTIMATE
    ]
    hierarchy = get_cause_metadata(
        cause_set_version_id=self.set_version_id,
        gbd_round_id=self.gbd_round_id)
    self._validate_hierachy(hierarchy)
    return hierarchy[hierarchy_cols]

def get_cause_ids(cause_set):
    """Fetch lists of source and target causes given the oldCorrect
    cause set."""
    cause_df = get_cause_metadata(cause_set)
    detail_bool = cause_df['most_detailed'] == 1
    sources_bool = (cause_df['parent_id'] == 952) & detail_bool
    targets_bool = (cause_df['parent_id'] == 953) & detail_bool
    sources = cause_df.loc[sources_bool, 'cause_id'].unique().tolist()
    targets = cause_df.loc[targets_bool, 'cause_id'].unique().tolist()
    return sources, targets

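# Illustrative call (assumes the oldCorrect cause set id is available in a
# `cause_set` variable; 952 and 953 above are the source and target parent
# causes). Because sources and targets hang off different parents, the two
# lists should be disjoint:
#
# sources, targets = get_cause_ids(cause_set)
# assert not set(sources) & set(targets)
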
def run_cod_age_sex_splitting(db):
    # CHECK COMPLETENESS
    cause_set_version = 269
    cm = get_cause_metadata(cause_set_version_id=cause_set_version)
    possible_causes = cm['cause_id'].unique().tolist()
    for cause_id in db['cause_id'].unique().tolist():
        assert cause_id in possible_causes, \
            "Cause ID {} not in hierarchy".format(cause_id)
    loc_meta = get_location_metadata(gbd_round_id=5, location_set_id=21)
    possible_locs = loc_meta['location_id'].tolist()
    db = db.loc[db['location_id'].isin(possible_locs), :]
    db = db.loc[db['best'] > 0, :]
    # stash high/best and low/best ratios so uncertainty can be rebuilt
    # after the split
    db['hi_best_ratio'] = db['high'] / db['best']
    db['lo_best_ratio'] = db['low'] / db['best']
    db = db.reset_index(drop=True)
    db['unique_join'] = db.index
    db_merge_later = db.loc[:, ['unique_join', 'hi_best_ratio',
                                'lo_best_ratio']]
    db = db.drop(labels=['high', 'low', 'hi_best_ratio', 'lo_best_ratio'],
                 axis=1)
    id_cols = [
        i for i in db.columns if i not in ['best', 'age_group_id', 'sex_id']
    ]
    # NOTE: id_cols and the two lookups below are never used; the splitter
    # is pinned to cause_set_version 269 and pop_run_id 104
    cause_set_version_id = query("""SELECT cause_set_version_id FROM ADDRESS
                                    WHERE gbd_round_id=5
                                    AND cause_set_id=4;""",
                                 conn_def='epi').iloc[0, 0]
    pop_run_id = get_population(gbd_round_id=5,
                                status="recent")['run_id'].iloc[0]
    splitter = AgeSexSplitter(cause_set_version_id=cause_set_version,
                              pop_run_id=104,
                              distribution_set_version_id=29,
                              id_cols=['unique_join'],
                              value_column='best')
    split_db = splitter.get_computed_dataframe(df=db,
                                               location_meta_df=loc_meta)
    split_db = pd.merge(left=split_db, right=db_merge_later,
                        on=['unique_join'], how='left')
    split_db['low'] = split_db['best'] * split_db['lo_best_ratio']
    split_db['high'] = split_db['best'] * split_db['hi_best_ratio']
    split_db = split_db.drop(
        labels=['unique_join', 'lo_best_ratio', 'hi_best_ratio'], axis=1)
    return split_db

def format_gbd_results(int_cause):
    rd = pd.read_csv(
        f"/ihme/cod/prep/mcod/process_data/{int_cause}/rdp/2019_03_07/"
        "redistributed_deaths.csv")
    inj_cols = [x for x in list(rd) if "inj" in x]
    rd[inj_cols] = rd[inj_cols].fillna(0)
    rd = rd.groupby(['location_id', 'sex_id', 'year_id', 'age_group_id'],
                    as_index=False)[inj_cols].sum()
    rd = pd.melt(rd,
                 id_vars=['location_id', 'sex_id', 'year_id',
                          'age_group_id'],
                 value_vars=inj_cols, var_name="acause",
                 value_name=int_cause)
    rd = rd.loc[rd[int_cause] != 0]
    causes = get_cause_metadata(gbd_round_id=6, cause_set_id=3)
    injuries = causes.loc[(causes.acause.str.contains("inj")) &
                          (causes.most_detailed == 1)]
    inj_dict = injuries.set_index("acause")["cause_id"].to_dict()
    rd["cause_id"] = rd["acause"].map(inj_dict)
    # should have been dropped last year (not most detailed/is YLD only)
    restricted_targets = [729, 945]
    restricted_targets += [704, 941]  # x59 only unintentional
    if int_cause == "x59":
        restricted_targets += [721, 723, 725, 726, 727, 854, 941]
    rd = rd.loc[~(rd["cause_id"].isin(restricted_targets))]
    # make this right after dropping restricted targets
    rd = get_country_names(rd)
    rd = rd.groupby(
        ['location_id', 'sex_id', 'year_id', 'age_group_id', 'cause_id'],
        as_index=False)[int_cause].sum()
    rd["prop"] = rd.groupby(
        ["age_group_id", "sex_id", "location_id", "year_id"],
        as_index=False)[int_cause].transform(
            lambda x: x / float(x.sum(axis=0)))
    return rd

def get_cause_ids(self, cause_level):
    """Return a list of cause ids at a certain level of the cause
    hierarchy, or, alternatively, all most detailed causes.

    In order for cause decomposition to work properly these causes must
    satisfy the classic GBD 'mutually exclusive and collectively
    exhaustive' rules for cause lists.
    """
    ch = get_cause_metadata(cause_set_id=self.cause_set_id,
                            gbd_round_id=self.gbd_round_id,
                            decomp_step=self.decomp_step)
    validations.validate_cause_level(cause_level, ch)
    if cause_level == "most_detailed":
        cause_ids = ch.loc[ch[cause_level] == 1,
                           "cause_id"].unique().tolist()
    else:
        # take causes at the requested level, plus most detailed causes
        # that sit above it (their branches terminate early)
        cause_ids = ch[((ch["level"] == cause_level) |
                        ((ch["level"] < cause_level) &
                         (ch["most_detailed"] == 1)))][
                             "cause_id"].unique().tolist()
    print(f"Found {len(cause_ids)} cause_ids, cause_set_id "
          f"{self.cause_set_id} at cause level {cause_level}")
    return cause_ids

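# Hedged example of the level logic above (`decomposer` is a hypothetical
# instance of the enclosing class): requesting level 3 returns all level-3
# causes plus any most detailed causes sitting at levels 1-2, so the list
# stays mutually exclusive and collectively exhaustive.
#
# cause_ids = decomposer.get_cause_ids(cause_level=3)
# detailed_ids = decomposer.get_cause_ids(cause_level="most_detailed")
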
def run_cod_age_sex_splitting(df, conn_def, cause_set_version_id,
                              pop_run_id):
    # NOTE: the conn_def argument is currently unused
    cause_metadata = get_cause_metadata(
        cause_set_version_id=cause_set_version_id)
    possible_causes = cause_metadata['cause_id'].unique().tolist()
    for cause_id in df['cause_id'].unique().tolist():
        assert cause_id in possible_causes, \
            "Cause ID {} not in hierarchy".format(cause_id)
    loc_meta = get_location_metadata(gbd_round_id=6, location_set_id=21)
    possible_locs = loc_meta['location_id'].tolist()
    df = df.loc[df['location_id'].isin(possible_locs), :]
    df = df.loc[df['best'] > 0, :]
    # stash high/best and low/best ratios so uncertainty can be rebuilt
    # after the split
    df['hi_best_ratio'] = df['high'] / df['best']
    df['lo_best_ratio'] = df['low'] / df['best']
    df = df.reset_index(drop=True)
    df['unique_join'] = df.index
    df_merge_later = df.loc[:, ['unique_join', 'hi_best_ratio',
                                'lo_best_ratio']]
    df = df.drop(labels=['high', 'low', 'hi_best_ratio', 'lo_best_ratio'],
                 axis=1)
    splitter = AgeSexSplitter(cause_set_version_id=cause_set_version_id,
                              pop_run_id=pop_run_id,
                              distribution_set_version_id=62,
                              id_cols=['unique_join'],
                              value_column='best')
    split_df = splitter.get_computed_dataframe(df=df,
                                               location_meta_df=loc_meta)
    split_df = pd.merge(left=split_df, right=df_merge_later,
                        on=['unique_join'], how='left')
    split_df['low'] = split_df['best'] * split_df['lo_best_ratio']
    split_df['high'] = split_df['best'] * split_df['hi_best_ratio']
    split_df = split_df.drop(
        labels=['unique_join', 'lo_best_ratio', 'hi_best_ratio'], axis=1)
    return split_df

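# Sketch of a call (the id values here are illustrative, not prescribed):
# the input needs 'cause_id', 'location_id', and 'best'/'low'/'high'
# columns. Low and high are rebuilt after the split from the stashed
# ratios, so uncertainty scales with the split 'best' values.
#
# split_df = run_cod_age_sex_splitting(
#     df, conn_def='ADDRESS', cause_set_version_id=cause_set_version_id,
#     pop_run_id=pop_run_id)
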
def fix_maternal_denominators(df, return_only_maternal=False):
    asfr = get_covariate_estimates(QUERY)
    # keep age/location/year and the critical mean_value
    asfr = asfr[['location_id', 'year_id', 'age_group_id', 'sex_id',
                 'mean_value']]
    asfr.drop_duplicates(inplace=True)
    # map age_start and age_end onto asfr
    age_group = query("QUERY")
    pre_asfr = asfr.shape[0]
    asfr = asfr.merge(age_group, how='left', on='age_group_id')
    assert pre_asfr == asfr.shape[0],\
        "The merge duplicated rows unexpectedly"
    asfr.drop('age_group_id', axis=1, inplace=True)
    asfr.rename(columns={'age_group_years_start': 'age_start',
                         'age_group_years_end': 'age_end'}, inplace=True)
    # create year_start and year_end
    asfr['year_start'] = asfr['year_id']
    asfr['year_end'] = asfr['year_id']
    asfr.drop('year_id', axis=1, inplace=True)
    # all the mean_values in asfr where age_end is less than one are 0, so
    # we can make up an asfr group for age_start = 0 and age_end = 1
    asfr.loc[asfr['age_end'] < 1, 'age_end'] = 1
    asfr.loc[asfr['age_start'] < 1, 'age_start'] = 0
    asfr.loc[asfr['age_end'] > 1, 'age_end'] = \
        asfr.loc[asfr['age_end'] > 1, 'age_end'] - 1
    # one more change: asfr has the max age_end as 125 (now 124), and we
    # want it to be 99
    asfr.loc[asfr['age_end'] == 124, 'age_end'] = 99
    # now asfr age_start and age_end match our hospital data;
    # in case we created duplicated rows by doing this:
    asfr.drop_duplicates(inplace=True)
    # MERGE ASFR ONTO HOSP
    pre_shape = df.shape[0]
    df = df.merge(asfr, how='left',
                  on=['age_start', 'age_end', 'year_start', 'year_end',
                      'location_id', 'sex_id'])
    assert df.mean_value.isnull().sum() != df.shape[0],\
        "The merge failed to attach any mean_values"
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"
    # GET MATERNAL CAUSES
    causes = get_cause_metadata(QUERY)
    condition = causes.path_to_top_parent.str.contains("366")
    maternal_causes = causes[condition]
    # make list of maternal causes
    maternal_list = list(maternal_causes['cause_id'].unique())
    # subset out rows that are in the maternal list
    maternal_df = df[df['cause_id'].isin(maternal_list)]
    assert maternal_df.shape[0] != 0,\
        "The maternal dataframe is empty"
    # subset out rows that are not in the maternal list
    df = df[~df['cause_id'].isin(maternal_list)]
    assert df.shape[0] != 0,\
        "The hospital dataframe is empty"
    for cause in maternal_list:
        # divide each product column by ASFR (mean_value)
        maternal_df.loc[maternal_df['cause_id'] == cause, 'product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        maternal_df.loc[maternal_df['cause_id'] == cause, 'upper_product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause,
                            'upper_product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        maternal_df.loc[maternal_df['cause_id'] == cause, 'lower_product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause,
                            'lower_product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        # some mean_values were zero; this is effectively an age/sex
        # restriction, so assign these a rate of 0
        maternal_df.loc[(maternal_df['product'].isnull()) &
                        (maternal_df['cause_id'] == cause),
                        ['product', 'upper_product', 'lower_product']] = 0
        # assign infinite values to 0
        maternal_df.loc[(np.isinf(maternal_df['product'])) &
                        (maternal_df['cause_id'] == cause),
                        ['product', 'upper_product', 'lower_product']] = 0
    if return_only_maternal:
        maternal_df.drop(['mean_value', 'cause_id'], axis=1, inplace=True)
        return maternal_df
    else:
        # bring data back together
        df = pd.concat([df, maternal_df])
        # DROP ASFR info
        df.drop(['mean_value', 'cause_id'], axis=1, inplace=True)
        return df

def write_bundles(df, write_location="test", write_fixed_maternal=False,
                  extra_filename=""):
    assert write_location == 'test' or write_location == 'work', (
        "parameter write_location needs to be either 'test' or 'work', "
        "you put {}").format(write_location)
    if write_location == 'work':
        warnings.warn("write_location is set to work.")
        time.sleep(5)
    # drop bundle 'total_maternal', we don't want to write it
    df = df[df.bundle_id != 1010]
    # get injuries bundle_ids so we can keep injury corrected data later
    pc_injuries = pd.read_csv("FILEPATH")
    inj_bid_list = pc_injuries['Level1-Bundle ID'].unique()
    # CAUSE INFORMATION
    # get cause_id so we can write to an acause; we have to go through
    # cause_id to get a relationship between BID & acause
    cause_id_info = query("QUERY")
    # get acause
    acause_info = query("QUERY")
    # merge acause, bid, cause_id info together
    acause_info = acause_info.merge(cause_id_info, how="left", on="cause_id")
    # REI INFORMATION
    # get rei_id so we can write to a rei
    rei_id_info = query("QUERY")
    # get rei
    rei_info = query("QUERY")
    # merge rei, bid, rei_id together into one dataframe
    rei_info = rei_info.merge(rei_id_info, how="left", on="rei_id")
    # COMBINE REI AND ACAUSE
    # rename acause to match
    acause_info.rename(columns={'cause_id': 'cause_rei_id',
                                'acause': 'acause_rei'}, inplace=True)
    # rename rei to match
    rei_info.rename(columns={'rei_id': 'cause_rei_id',
                             'rei': 'acause_rei'}, inplace=True)
    # concat rei and acause together
    folder_info = pd.concat([acause_info, rei_info])
    # drop rows that don't have bundle_ids
    folder_info = folder_info.dropna(subset=['bundle_id'])
    # drop cause_rei_id, because we don't need it for getting data into
    # folders
    folder_info.drop("cause_rei_id", axis=1, inplace=True)
    # drop duplicates, just in case there are any
    folder_info.drop_duplicates(inplace=True)
    # MERGE ACAUSE/REI COMBO COLUMN ONTO DATA BY BUNDLE ID
    # there are NO null acause_rei entries!
    df = df.merge(folder_info, how="left", on="bundle_id")
    if write_fixed_maternal:
        # this is basically just a double check that we're only writing
        # data for maternal causes
        # GET MATERNAL CAUSES
        causes = get_cause_metadata(cause_set_id=9)
        condition = causes.path_to_top_parent.str.contains("366")
        # subset just the causes that meet the condition
        maternal_causes = causes[condition]
        # make list of maternal causes
        maternal_list = list(maternal_causes['acause'].unique())
        # keep only maternal causes
        df = df[df['acause_rei'].isin(maternal_list)]
        # drop the denominator bundle
        df = df[df['bundle_id'] != 1010]
    start = time.time()
    bundle_ids = df['bundle_id'].unique()
    # prevalence, incidence should be lower case
    df['measure'] = df['measure'].str.lower()
    readme = pd.read_excel("FILEPATH")
    columns_before = df.columns
    ordered = [
        'seq', 'input_type', 'underlying_nid', 'nid', 'source_type',
        'bundle_id', 'bundle_name', 'location_id', 'location_name', 'sex',
        'year_start', 'year_end', 'age_start', 'age_end', 'measure',
        'mean_0', 'lower_0', 'upper_0', 'mean_1', 'lower_1', 'upper_1',
        'correction_factor_1', 'mean_2', 'lower_2', 'upper_2',
        'correction_factor_2', 'mean_3', 'lower_3', 'upper_3',
        'correction_factor_3', 'mean_inj', 'lower_inj', 'upper_inj',
        'correction_factor_inj', 'standard_error', 'cases',
        'effective_sample_size', 'sample_size', 'unit_type',
        'unit_value_as_published', 'uncertainty_type',
        'uncertainty_type_value', 'representative_name', 'urbanicity_type',
        'recall_type', 'recall_type_value', 'sampling_type',
        'response_rate', 'design_effect', 'extractor', 'is_outlier',
        'acause_rei'
    ]
    df = df[ordered]
    columns_after = df.columns
    assert set(columns_after) == set(columns_before),\
        "the columns {} were added/lost while changing column order"\
        .format(set(columns_after).symmetric_difference(set(columns_before)))
    # adjust min age_end to 0.999 instead of 1
    df.loc[df['age_start'] == 0, 'age_end'] = 0.999
    print("BEGINNING WRITING, THE START TIME IS {}".format(
        time.strftime('%X %x %Z')))
    failed_bundles = []  # initialize empty list to append to in this loop
    counter = 0  # initialize counter to report how close we are to done
    length = len(bundle_ids)
    for bundle in bundle_ids:
        counter += 1
        completeness = float(counter) / length * 100
        print("{}% done".format(completeness))
        # subset bundle data
        df_b = df[df['bundle_id'] == bundle].copy()
        # drop columns based on measure - inc/prev/injury:
        # if the measure is prev: keep all 3 correction factors
        # if measure is inc and not an inj: keep 2 correction factors
        # if measure is inc and an inj: keep only injury correction factor
        df_b_measure = df_b.measure.unique()[0]
        if df_b.bundle_id.isin(inj_bid_list).all():
            df_b.drop([
                'mean_1', 'upper_1', 'lower_1', 'correction_factor_1',
                'mean_2', 'upper_2', 'lower_2', 'correction_factor_2',
                'mean_3', 'upper_3', 'lower_3', 'correction_factor_3'
            ], axis=1, inplace=True)
        if ((df_b_measure == 'incidence') and
                not (df_b.bundle_id.isin(inj_bid_list).all())):
            df_b.drop([
                'mean_inj', 'upper_inj', 'lower_inj',
                'correction_factor_inj'
            ], axis=1, inplace=True)
        if df_b_measure == 'prevalence':
            df_b.drop([
                'mean_inj', 'upper_inj', 'lower_inj',
                'correction_factor_inj'
            ], axis=1, inplace=True)
        acause_rei = str(df_b.acause_rei.unique()[0])
        df_b.drop('acause_rei', axis=1, inplace=True)
        if write_location == 'test':
            writedir = ("FILEPATH")
        elif write_location == 'work':
            writedir = ("FILEPATH")
        if not os.path.isdir(writedir):
            os.makedirs(writedir)  # make the directory if it doesn't exist
        # write for modelers
        # make path
        vers_id = "v8"  # last one was v6, should have been v7
        date = datetime.datetime.today().strftime("%Y_%m_%d")  # YYYY_MM_DD
        if write_fixed_maternal:
            extra_filename = "_adjusted_denominator"
        bundle_path = "{}{}_{}_{}{}.xlsx".\
            format(writedir, int(bundle), vers_id, date, extra_filename)
        print("Now writing at {}".format(bundle_path))
        # try to write to modelers' folders
        try:
            writer = pd.ExcelWriter(bundle_path, engine='xlsxwriter')
            df_b.to_excel(writer, sheet_name="extraction", index=False)
            readme.to_excel(writer, sheet_name='README', index=False)
            writer.save()
        except Exception:
            # if it fails for any reason, make note of it
            failed_bundles.append(bundle)
    end = time.time()
    text = open("FILEPATH", "w")
    text.write("function: write_bundles" + "\n" +
               "start time: " + str(start) + "\n" +
               "end time: " + str(end) + "\n" +
               "run time: " + str((end - start) / 60.0) + " minutes")
    text.close()
    print("DONE WRITING, THE CURRENT TIME IS {}".format(
        time.strftime('%X %x %Z')))
    return failed_bundles

def get_cause_hierarchy(cause_set_id=4):
    """Return the current cause hierarchy."""
    # pass the id as cause_set_id; the old code passed it as
    # cause_set_version_id, which is a different identifier
    causes_df = get_cause_metadata(cause_set_id=cause_set_id)
    return causes_df

def fix_maternal_denominators(df, return_only_maternal=False):
    # At this point, data will have bundle_id and cause_id on it, but it
    # has not been collapsed to those levels. It is at the baby sequela
    # level (as of 4-24-2017 data will be at bundle level).
    # The steps are:
    # 1) acquire asfr from the database
    # 2) attach age_start and age_end to asfr, and create year_start and
    #    year_end out of year_id
    # 3) attach asfr to the hospital data
    # 4) where cause_id is a maternal cause, do the division
    # 5) then drop all the asfr info, namely 'mean_value'

    # GET ASFR (has age/location/year)
    asfr = get_covariate_estimates(covariate_id=13)
    # keep age/location/year and the critical mean_value
    asfr = asfr[['location_id', 'year_id', 'age_group_id', 'sex_id',
                 'mean_value']]
    asfr.drop_duplicates(inplace=True)
    # map age_start and age_end onto asfr
    age_group = query("QUERY")
    pre_asfr = asfr.shape[0]
    asfr = asfr.merge(age_group, how='left', on='age_group_id')
    assert pre_asfr == asfr.shape[0],\
        "The merge duplicated rows unexpectedly"
    asfr.drop('age_group_id', axis=1, inplace=True)
    asfr.rename(columns={'age_group_years_start': 'age_start',
                         'age_group_years_end': 'age_end'}, inplace=True)
    # create year_start and year_end
    asfr['year_start'] = asfr['year_id']
    asfr['year_end'] = asfr['year_id']
    asfr.drop('year_id', axis=1, inplace=True)
    # The below commented out line of code was very wrong: asfr has three
    # under one years old age groups, but our data is just 0-1 years old.
    # Additionally, it would turn age ends like 1 into zero, which is
    # wrong.
    # asfr['age_end'] = asfr['age_end'] - 1
    # all the mean_values in asfr where age_end is less than one are 0, so
    # we can make up an asfr group for age_start = 0 and age_end = 1
    asfr.loc[asfr['age_end'] < 1, 'age_end'] = 1
    asfr.loc[asfr['age_start'] < 1, 'age_start'] = 0
    # THIS IS SO IMPORTANT: our data has age_end as 14, 19, 24, while asfr
    # has age_end as 15, 20, 25 ...
    asfr.loc[asfr['age_end'] > 1, 'age_end'] = \
        asfr.loc[asfr['age_end'] > 1, 'age_end'] - 1
    # one more change: asfr has the max age_end as 125 (now 124), and we
    # want it to be 99
    asfr.loc[asfr['age_end'] == 124, 'age_end'] = 99
    # now asfr age_start and age_end match our hospital data;
    # in case we created duplicated rows by doing this:
    asfr.drop_duplicates(inplace=True)
    # MERGE ASFR ONTO HOSP
    pre_shape = df.shape[0]
    df = df.merge(asfr, how='left',
                  on=['age_start', 'age_end', 'year_start', 'year_end',
                      'location_id', 'sex_id'])
    assert df.mean_value.isnull().sum() != df.shape[0],\
        "The merge failed to attach any mean_values"
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"
    # GET MATERNAL CAUSES
    causes = get_cause_metadata(cause_set_id=9)
    # 366 happens to always be in the third level
    condition = causes.path_to_top_parent.str.contains("366")
    # subset just the causes that meet the condition
    maternal_causes = causes[condition]
    # make list of maternal causes
    maternal_list = list(maternal_causes['cause_id'].unique())
    # subset out the parts of the data that have asfr info, loop over
    # cause_ids that are in maternal_list, and divide 'mean' by
    # 'mean_value', overwriting mean, upper, or lower as relevant.
    # subset out rows that are in the maternal list
    maternal_df = df[df['cause_id'].isin(maternal_list)]
    assert maternal_df.shape[0] != 0,\
        "The maternal dataframe is empty"
    # subset out rows that are not in the maternal list
    df = df[~df['cause_id'].isin(maternal_list)]
    assert df.shape[0] != 0,\
        "The hospital dataframe is empty"
    for cause in maternal_list:
        # the line breaks are weird looking, but this is just assigning
        # the result of division
        maternal_df.loc[maternal_df['cause_id'] == cause, 'product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        maternal_df.loc[maternal_df['cause_id'] == cause, 'upper_product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause,
                            'upper_product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        maternal_df.loc[maternal_df['cause_id'] == cause, 'lower_product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause,
                            'lower_product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        # some mean_values were zero; this is effectively an age/sex
        # restriction, so assign these a rate of 0
        maternal_df.loc[(maternal_df['product'].isnull()) &
                        (maternal_df['cause_id'] == cause),
                        ['product', 'upper_product', 'lower_product']] = 0
        # assign infinite values to 0
        maternal_df.loc[(np.isinf(maternal_df['product'])) &
                        (maternal_df['cause_id'] == cause),
                        ['product', 'upper_product', 'lower_product']] = 0
    if return_only_maternal:
        maternal_df.drop(['mean_value', 'cause_id'], axis=1, inplace=True)
        return maternal_df
    else:
        # bring data back together
        df = pd.concat([df, maternal_df])
        # DROP ASFR info
        df.drop(['mean_value', 'cause_id'], axis=1, inplace=True)
        return df

def prep_child_to_available_parent_map(cause_set_id, gbd_round_id,
                                       available_cause_ids, as_dict=False):
    """Prep a mapping of cause_id to the most detailed available parent.

    For a given cause hierarchy and a list of "available" causes, return
    a mapping from each cause in the hierarchy to the most detailed cause
    that is "available" and in that cause's path_to_top_parent.

    "available": icg_id/cause_id are present in the weights dataframe

    Arguments:
        cause_set_id (int): from shared.cause_set in the database
        gbd_round_id (int): from shared.gbd_round in the database;
            together, cause_set_id and gbd_round_id determine the active
            cause set version id to use from
            shared.cause_hierarchy_history
        available_cause_ids (list of ints): all must be cause ids in
            shared.cause
        as_dict (bool): if False, returns a dataframe instead of a dict

    Returns:
        cause_map (dict): a dictionary from cause_id to available_cause_id
        or, if as_dict == False, a dataframe
        ['cause_id', 'parent_cause_id']

    This function isn't used if the data being split is at the icg_id
    level.
    """
    causes = db_queries.get_cause_metadata(cause_set_id=cause_set_id,
                                           gbd_round_id=gbd_round_id)
    # explode path_to_top_parent into one column per level of the path
    cause_levels = causes.path_to_top_parent.str.split(',').apply(
        pd.Series, 1)
    cause_tree = pd.concat(
        [causes[['cause_id', 'path_to_top_parent', 'level']], cause_levels],
        axis=1)
    cause_tree = cause_tree.drop(['path_to_top_parent', 'level'], axis=1)
    # reshape long: one row per (cause_id, ancestor) pair
    cause_tree = cause_tree.set_index(['cause_id']).stack().reset_index()
    cause_tree = cause_tree.rename(columns={'level_1': 'par_level',
                                            0: 'parent_cause_id'})
    cause_tree['parent_cause_id'] = cause_tree['parent_cause_id'].astype(int)
    # flag ancestors that appear in the available list
    cause_availability = {c: 1 for c in available_cause_ids}
    cause_tree['available'] = \
        cause_tree['parent_cause_id'].map(cause_availability).fillna(0)
    available_cause_map = cause_tree.query('available == 1')
    # keep only the most detailed (deepest) available ancestor
    available_cause_map['max_level_available'] = \
        available_cause_map.groupby('cause_id')['par_level'].transform(max)
    available_cause_map = available_cause_map.query(
        'par_level == max_level_available')
    available_cause_map = available_cause_map[['cause_id',
                                               'parent_cause_id']]
    assert not available_cause_map[['cause_id']].duplicated().any()
    missing = set(causes.cause_id) - set(available_cause_map['cause_id'])
    if len(missing) > 0:
        raise AssertionError(
            "Was not able to find parent in given available cause "
            "ids list for these cause ids: \n{}".format(missing))
    if as_dict:
        available_cause_map = available_cause_map.set_index('cause_id')
        available_cause_map = \
            available_cause_map.to_dict()['parent_cause_id']
    return available_cause_map

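# The path_to_top_parent logic above can be illustrated with a toy,
# self-contained hierarchy (hypothetical cause_ids, no database access).
# Cause 5 has path "1,2,5"; with available causes {1, 2}, its most
# detailed available parent is 2, the deepest ancestor in the set:
#
# import pandas as pd
# toy = pd.DataFrame({
#     'cause_id': [1, 2, 5],
#     'path_to_top_parent': ['1', '1,2', '1,2,5'],
# })
# available = {1, 2}
# exploded = toy.set_index('cause_id').path_to_top_parent.str.split(
#     ',').explode().astype(int).rename('ancestor').reset_index()
# exploded = exploded[exploded.ancestor.isin(available)]
# # explode preserves path order, so the last row per cause is the
# # deepest available ancestor
# mapping = exploded.groupby('cause_id').last()
# # cause 1 -> 1, cause 2 -> 2, cause 5 -> 2
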
def aggregate_to_bundle(df, write_maternal_denom=False,
                        adjust_maternal_denom=False):
    """Take a dataframe aggregated to the baby sequela level and return
    it aggregated to 5 year bands at the bundle level."""
    ##############################
    # PULL POPULATION
    #
    # This is preparation for aggregating to 5 year bands
    ##############################
    # Get rid of all corrected values and correction factors. We want to
    # do this because we are going to aggregate to 5 year bands and the
    # bundle_id level of analysis. We intend to make cells where the
    # correction factor is over 50 NULL. It doesn't make sense to do that
    # before aggregating, because any rows with nulls in them would be
    # lost entirely. It doesn't make sense to apply corrections and NOT
    # delete cells that break the over-50 rule before aggregation either,
    # because we would lose the correction factors during the groupby.
    # Also, it would not be robust to aggregate 16 columns. Therefore,
    # it's best to apply corrections after aggregating. Technically, the
    # groupby would get rid of these for us; honestly, this just makes
    # things easier to work with up to that point.
    df.drop(['mean_1', 'mean_2', 'mean_3', 'mean_inj',
             'upper_1', 'upper_2', 'upper_3', 'upper_inj',
             'lower_1', 'lower_2', 'lower_3', 'lower_inj',
             'correction_factor_1', 'correction_factor_2',
             'correction_factor_3', 'correction_factor_inj'],
            axis=1, inplace=True)
    if write_maternal_denom:
        # drop the non maternal data pronto
        df = df[df['bundle_id'] == 1010]
    if adjust_maternal_denom:
        # GET MATERNAL CAUSES
        causes = get_cause_metadata(cause_set_id=9)
        # 366 happens to always be in the third level
        condition = causes.path_to_top_parent.str.contains("366")
        # subset just the causes that meet the condition
        maternal_causes = causes[condition]
        # make list of maternal causes
        maternal_list = list(maternal_causes['cause_id'].unique())
        # get bundle to cause map
        bundle_cause = query("QUERY")
        # merge cause_id onto data
        df = df.merge(bundle_cause, how='left', on='bundle_id')
        # keep only maternal causes
        df = df[df['cause_id'].isin(maternal_list)]
        # drop cause_id
        df.drop('cause_id', axis=1, inplace=True)
        # drop the denominator bundle
        df = df[df['bundle_id'] != 1010]
    ##############################
    # GET POPULATION
    ##############################
    # pull age_group to age_start/age_end map
    age_group = query("QUERY")
    # correct age groups
    age_group.loc[age_group['age_group_years_end'] > 1,
                  'age_group_years_end'] = age_group.loc[
        age_group['age_group_years_end'] > 1, 'age_group_years_end'] - 1
    # df of unique ages from hospital data to merge onto age_group map
    df_ages = pd.DataFrame([df.age_start.unique(),
                            df.age_end.unique()]).transpose()
    df_ages.columns = ['age_group_years_start', 'age_group_years_end']
    df_ages = df_ages.merge(age_group, how='left',
                            on=['age_group_years_start',
                                'age_group_years_end'])
    # this is the correct terminal age group (even though we use max
    # age = 99)
    df_ages.loc[df_ages['age_group_years_start'] == 95,
                'age_group_id'] = 235
    # there are two age_group_ids for age_start=0 and age_start=1
    df_ages = df_ages[df_ages.age_group_id != 161]
    # create age/year/location lists to use for pulling population
    age_list = list(df_ages.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())
    # pull population and merge on age_start and age_end
    pop = get_population(age_group_id=age_list, location_id=loc_list,
                         sex_id=[1, 2], year_id=year_list)
    # attach age_start and age_end to population information
    pop = pop.merge(age_group, how='left', on='age_group_id')
    pop.drop(['process_version_map_id', 'age_group_id'], axis=1,
             inplace=True)
    # rename pop columns to match hospital data columns
    pop.rename(columns={'age_group_years_start': 'age_start',
                        'age_group_years_end': 'age_end',
                        'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']
    # correct terminal age group to match our data
    pop.loc[pop['age_end'] == 124, 'age_end'] = 99
    demography = ['location_id', 'year_start', 'year_end', 'age_start',
                  'age_end', 'sex_id']
    ##############################
    # MAKE DATA SQUARE
    ##############################
    # create a series of sorted, non-zero mean values to make sure
    # the func doesn't alter anything
    check_mean = df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().\
        reset_index(drop=True)
    print("Starting number of rows: {}".format(df.shape[0]))
    # square the dataset
    df = hosp_prep.make_zeroes(df, level_of_analysis='bundle_id',
                               cols_to_square=['mean_0', 'upper_0',
                                               'lower_0'],
                               icd_len=5)
    # assert the sorted means are identical
    assert (check_mean == df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().
            reset_index(drop=True)).all()
    # delete rows where restrictions should be applied;
    # create df where baby sequelae are missing
    missing_nfc = df[df.nonfatal_cause_name.isnull()].copy()
    df = df[df.nonfatal_cause_name.notnull()]
    for col in ['mean_0', 'upper_0', 'lower_0']:
        df = hosp_prep.apply_restrictions(df, col)
        missing_nfc = hosp_prep.apply_bundle_restrictions(missing_nfc, col)
        df = df[df[col].notnull()]
    df = pd.concat([df, missing_nfc])
    print("Square number of rows: {}".format(df.shape[0]))
    # don't create the parent injury dupes until after the data is totally
    # square so that the denominators will match
    # df = get_parent_injuries(df)
    # check_parent_injuries(df, 'mean_0')
    pre_shape = df.shape[0]  # store for before comparison
    # then merge population onto the hospital data
    df = df.merge(pop, how='left', on=demography)  # attach pop info to hosp
    assert pre_shape == df.shape[0], \
        "number of rows don't match after merge"
    ##############################
    # RATE SPACE TO COUNTS
    ##############################
    # go from rate space to additive counts;
    # because of careful merging we just need to multiply across columns
    df['hosp_count'] = df['mean_0'] * df['population']
    df['upper_hosp_count'] = df['upper_0'] * df['population']
    df['lower_hosp_count'] = df['lower_0'] * df['population']
    # merge on "denominator" from the file that was made back in
    # create_cause_fractions. This adds the denominator column, which is
    # the number of admissions in a demographic group
    df = df.merge(pd.read_csv("FILEPATH"), how='left',
                  on=["age_start", "age_end", "sex_id", "year_start",
                      "year_end", "location_id"])
    # 5 year bins
    df = hosp_prep.year_binner(df)
    # add 5 year NIDs onto data
    df = hosp_prep.five_year_nids(df)
    ##################################################################
    # THE COLLAPSE
    ##################################################################
    # The final collapse to 5 year bands, 5 year NIDs and bundle_id.
    # This is doing a few things at once. One is that we need to aggregate
    # to 5 year bands. Another is aggregating to the bundle_id level of
    # analysis; up to this point we were at the nonfatal_cause_name AKA
    # baby sequelae level of analysis.
    # WE NEED TWO COLLAPSES: one for data that doesn't have a sample_size
    # value, and another for the data that does. This is because:
    # https://goo.gl/e66OZ4 and https://goo.gl/Fb78xi
    # make df of data where there is full coverage (i.e., the UK)
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]
    # make condition mask that indicates rows that have full coverage
    has_full_coverage = df.source.isin(full_coverage_sources)
    covered_df = df[has_full_coverage]
    # drop "denominator" from covered_df
    covered_df = covered_df.drop("denominator", axis=1)
    # drop this data from the main dataframe
    df = df[~has_full_coverage]
    assert (df.loc[df.denominator.isnull(), 'mean_0'] == 0).all(), \
        "mean_0 should be 0"
    assert (df.loc[df.denominator.isnull(), 'lower_0'] == 0).all(), \
        "lower_0 should be 0"
    assert (df.loc[df.denominator.isnull(), 'upper_0'] == 0).all(), \
        "upper_0 should be 0"
    df = df[df.denominator.notnull()]
    # df already has sample size
    df.drop("sample_size", axis=1, inplace=True)
    # rename "denominator" to "sample_size" in df (not covered_df)
    df.rename(columns={"denominator": "sample_size"}, inplace=True)
    # check if cases are lost in the groupby
    pre_cases = df['hosp_count'].sum()
    # can use the same group columns for both dataframes
    groups = ['location_id', 'year_start', 'year_end', 'age_start',
              'age_end', 'sex_id', 'nid', 'representative_id', 'bundle_id']
    # sample_size has some null values from being made square, but it was
    # just population, so we're using pop instead. So remember,
    # population == sample_size for covered_df
    covered_df = covered_df.groupby(groups)\
        .agg({'hosp_count': 'sum', 'population': 'sum'}).reset_index()
    # add "sample_size" to the aggregate function
    df = df.groupby(groups).agg({'hosp_count': 'sum',
                                 'upper_hosp_count': 'sum',
                                 'lower_hosp_count': 'sum',
                                 'population': 'sum',
                                 'sample_size': 'sum'}).reset_index()
    assert round(pre_cases, 0) == round(df['hosp_count'].sum(), 0),\
        ("some cases were lost. "
         "From {} to {}".format(pre_cases, df['hosp_count'].sum()))
    # set sample size to np.nan when mean/upper/lower are greater than 0
    df.loc[(df['hosp_count'] > 0) & (df['lower_hosp_count'] > 0) &
           (df['upper_hosp_count'] > 0), 'sample_size'] = np.nan
    ##############################
    # COUNTS TO RATE SPACE
    ##############################
    # REMAKE mean and uncertainty
    # for the main df:
    df['mean_0'] = df['hosp_count'] / df['population']
    df['lower_0'] = df['lower_hosp_count'] / df['population']
    df['upper_0'] = df['upper_hosp_count'] / df['population']
    df.drop(['hosp_count', 'lower_hosp_count', 'upper_hosp_count'],
            axis=1, inplace=True)
    # add parent injuries
    # NOTE get_parent_injuries is run before covered_df is concated with
    # df. It happens to not make a difference at the moment, because there
    # are no injuries in covered_df, but there could be in the future.
    df = get_parent_injuries(df)
    for col in ['mean_0', 'lower_0', 'upper_0']:
        hosp_prep.check_parent_injuries(df, col_to_sum=col)
    # this drops the population that was merged on for converting to
    # counts; don't need pop anymore
    df.drop('population', axis=1, inplace=True)
    # for the covered df:
    covered_df['mean_0'] = covered_df['hosp_count'] / \
        covered_df['population']
    covered_df.rename(columns={"population": "sample_size"}, inplace=True)
    # drop columns
    covered_df.drop(['hosp_count'], axis=1, inplace=True)
    ###############################
    # RE-ATTACH
    ###############################
    # bring covered_df and df together.
    # where we have full coverage, lower and upper should be null;
    # mean_0 will never be null.
    # NOTE: the has_full_coverage mask computed before the collapse no
    # longer aligns with the re-indexed rows, so carry an explicit flag
    # through the concat and rebuild the mask from it
    covered_df['full_coverage'] = True
    df['full_coverage'] = False
    df = pd.concat([df, covered_df], ignore_index=True)
    has_full_coverage = df['full_coverage'].astype(bool)
    # assert what we just said will be true in the comments above:
    assert df.loc[has_full_coverage, 'lower_0'].isnull().all(), (
        "where we have full coverage, lower_0 should be null")
    assert df.loc[has_full_coverage, 'upper_0'].isnull().all(), (
        "where we have full coverage, upper_0 should be null")
    assert df.mean_0.notnull().all(), "mean_0 should never be null"
    # NOTE, remember, sample size will still have null values, and that's
    # okay; we need to keep sample size from here on out.
    if "population" in df.columns:
        print("population was still in the columns")
        df.drop("population", axis=1, inplace=True)
    ########################################
    # map measure onto data; we just need this for ELMO reqs
    clean_maps = pd.read_csv("FILEPATH")
    clean_maps = clean_maps[['bundle_id', 'bid_measure']]
    clean_maps.drop_duplicates(inplace=True)
    clean_maps.rename(columns={'bid_measure': 'measure'}, inplace=True)
    # remove null bundles from map
    clean_maps = clean_maps[clean_maps.bundle_id.notnull()]
    pre_shape = df.shape[0]  # store for comparison after merge
    # merge measure onto hosp data using bundle_id
    df = df.merge(clean_maps, how='left', on='bundle_id')
    assert pre_shape == df.shape[0], \
        "number of rows don't match after merge."
    # get injuries bids so we can check for missing measures
    pc_injuries = pd.read_csv("FILEPATH")
    inj_bids = pc_injuries['Level1-Bundle ID'].unique()
    # some injuries bids didn't get measures!
    assert set(df[df.measure.isnull()].bundle_id).issubset(set(inj_bids)),\
        ("We expect that all null measures belong to injuries, but that "
         "is not the case. Something went wrong!")
    # fix any injuries that are missing measure; all inj are inc:
    df.loc[(df.measure.isnull()) & (df.bundle_id.isin(inj_bids)),
           'measure'] = 'inc'
    assert df.measure.isnull().sum() == 0, ("There are null values and we "
                                            "expect none")
    # read in correction factors (again)
    correction_factors = pd.read_csv("FILEPATH")
    correction_factors.drop("outpatient", axis=1, inplace=True)
    # rename columns to match df
    correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)
    # merge corr factors onto data
    df = df.merge(correction_factors, how='left',
                  on=['age_start', 'sex_id', 'bundle_id'])
    assert pre_shape == df.shape[0], ("You unexpectedly added rows while "
                                      "merging on the correction factors. "
                                      "Don't do that!")
    # if a bundle_id doesn't have a corr factor from marketscan use 1
    # http://stackoverflow.com/questions/36556256/how-do-i-fill-na-values-in-multiple-columns-in-pandas
    df.update(df[['indv_cf', 'incidence', 'prevalence',
                  'injury_cf']].fillna(1))
    # rename correction factors to match what we told people they would be
    df.rename(columns={'indv_cf': 'correction_factor_1',
                       'incidence': 'correction_factor_2',
                       'prevalence': 'correction_factor_3',
                       'injury_cf': 'correction_factor_inj'}, inplace=True)
    # NOTE we apply every correction factor to all data, even if it is not
    # relevant. E.g., not all data is injuries, so not all data needs
    # correction_factor_inj. It is simply easier to apply all of them, and
    # then drop the irrelevant columns while writing to modelers' folders.
    # make mean_1, lower_1, upper_1
    df['mean_1'] = df.correction_factor_1 * df.mean_0
    df['lower_1'] = df.correction_factor_1 * df.lower_0
    df['upper_1'] = df.correction_factor_1 * df.upper_0
    # make mean_2, lower_2, upper_2
    df['mean_2'] = df.correction_factor_2 * df.mean_0
    df['lower_2'] = df.correction_factor_2 * df.lower_0
    df['upper_2'] = df.correction_factor_2 * df.upper_0
    # make mean_3, lower_3, upper_3
    df['mean_3'] = df.correction_factor_3 * df.mean_0
    df['lower_3'] = df.correction_factor_3 * df.lower_0
    df['upper_3'] = df.correction_factor_3 * df.upper_0
    # make injury mean, lower, upper
    df['mean_inj'] = df.correction_factor_inj * df.mean_0
    df['lower_inj'] = df.correction_factor_inj * df.lower_0
    df['upper_inj'] = df.correction_factor_inj * df.upper_0
    # assert what we just said will be true in the comments above:
    levels = ["1", "2", "3", "inj"]
    for level in levels:
        assert df.loc[has_full_coverage,
                      'lower_{}'.format(level)].isnull().all(), \
            "broke on level {}".format(level)
        assert df.loc[has_full_coverage,
                      'upper_{}'.format(level)].isnull().all(), \
            "broke on level {}".format(level)
        assert df["mean_{}".format(level)].notnull().all(), \
            "broke on level {}".format(level)
    # the helper flag column is no longer needed
    df.drop('full_coverage', axis=1, inplace=True)

    def factor_applier(df, levels=["1", "2", "3", "inj"]):
        for level in levels:
            # apply corr factors
            df['test_mean_' + level] = \
                df["correction_factor_" + level] * df['mean_0']
            df['test_lower_' + level] = \
                df["correction_factor_" + level] * df['lower_0']
            df['test_upper_' + level] = \
                df["correction_factor_" + level] * df['upper_0']
        return df

    df = factor_applier(df)
    levels = ["1", "2", "3", "inj"]
    for level in levels:
        assert (df.loc[df["mean_" + level].notnull(), "mean_" + level] ==
                df.loc[df["test_mean_" + level].notnull(),
                       "test_mean_" + level]).all(), \
            "different on level {}".format(level)
        assert (df.loc[df["upper_" + level].notnull(), "upper_" + level] ==
                df.loc[df["test_upper_" + level].notnull(),
                       "test_upper_" + level]).all(), \
            "different on level {}".format(level)
        assert (df.loc[df["lower_" + level].notnull(), "lower_" + level] ==
                df.loc[df["test_lower_" + level].notnull(),
                       "test_lower_" + level]).all(), \
            "different on level {}".format(level)
    # drop test cols for now until we run this for a while without
    # tripping the assert
    test_cols = df.columns[df.columns.str.startswith("test_")]
    df.drop(test_cols, axis=1, inplace=True)
    # RULE = if correction factor is greater than 50, make the data null.
    # EXCEPTIONS are made for these bundles, which are capped at 100:
    #   Preterm: 80, 81, 82, 500
    #   Encephalopathy: 338
    #   Sepsis: 92
    #   Hemolytic: 458
    #   PAD/PUD: 345
    #   Cirrhosis: 131
    # list of bundles which can have correction factors above 50
    cf_exceptions = [345, 80, 81, 82, 500, 338, 92, 458, 131]

    # NOTE when checking the number of nulls, consider the nulls that are
    # caused by the sample_size split
    def mean_capper(df, exceptions, levels=["1", "2", "3"]):
        # use the exceptions argument rather than the enclosing-scope list
        exception_condition = df.bundle_id.isin(exceptions)
        for level in levels:
            df.loc[(~exception_condition) &
                   (df['correction_factor_' + level] > 50),
                   ['mean_' + level, 'lower_' + level,
                    'upper_' + level]] = np.nan
            df.loc[(exception_condition) &
                   (df['correction_factor_' + level] > 100),
                   ['mean_' + level, 'lower_' + level,
                    'upper_' + level]] = np.nan
        return df

    # create df to test the function; pass a copy so the test call doesn't
    # cap df in place before the comparison below
    df_test_capper = mean_capper(df.copy(), cf_exceptions)
    # make boolean mask that says if a bundle is in the exceptions list
    exception_condition = df.bundle_id.isin(cf_exceptions)
    df.loc[(~exception_condition) & (df.correction_factor_1 > 50),
           ['mean_1', 'lower_1', 'upper_1']] = [np.nan, np.nan, np.nan]
    # DOESN'T DO ANYTHING; don't really need to apply here
    df.loc[(~exception_condition) & (df.correction_factor_2 > 50),
           ['mean_2', 'lower_2', 'upper_2']] = [np.nan, np.nan, np.nan]
    # DID DO SOMETHING; affects 6% of prev rows, 0.01% of inc rows
    df.loc[(~exception_condition) & (df.correction_factor_3 > 50),
           ['mean_3', 'lower_3', 'upper_3']] = [np.nan, np.nan, np.nan]
    # DOES A LOT; affects 57% of prevalence rows
    df.loc[(exception_condition) & (df.correction_factor_1 > 100),
           ['mean_1', 'lower_1', 'upper_1']] = [np.nan, np.nan, np.nan]
    df.loc[(exception_condition) & (df.correction_factor_2 > 100),
           ['mean_2', 'lower_2', 'upper_2']] = [np.nan, np.nan, np.nan]
    df.loc[(exception_condition) & (df.correction_factor_3 > 100),
           ['mean_3', 'lower_3', 'upper_3']] = [np.nan, np.nan, np.nan]
    #####################################################
    # CHECK that lower < mean < upper
    #####################################################
    # loop over every level of correction;
    # can't compare null values, null comparisons always eval to False
    for i in ["0", "1", "2", "3", 'inj']:
        # lower <= mean
        assert (df.loc[df['lower_' + i].notnull(), 'lower_' + i] <=
                df.loc[df["lower_" + i].notnull(), 'mean_' + i]).all(),\
            "lower_{} should be less than mean_{}".format(i, i)
        # mean <= upper
        assert (df.loc[df["upper_" + i].notnull(), 'mean_' + i] <=
                df.loc[df["upper_" + i].notnull(), 'upper_' + i]).all(),\
            "mean_{} should be less than upper_{}".format(i, i)
    # compare the results between the test df and the proper df
    for uncertainty in ["mean", "upper", "lower"]:
        for level in ["1", "2", "3"]:
            # compare the sum of null rows between dfs
            assert df[uncertainty + "_" + level].isnull().sum() ==\
                df_test_capper[uncertainty + "_" + level].isnull().sum(),\
                "The new capping function is producing different results"
    # write the maternal denominator data; this is for the future when we
    # work in parallel
    if write_maternal_denom:
        def _write_maternal_denom(df):
            # renamed (underscore prefix) to avoid shadowing the boolean
            # argument of the same name
            mat_df = df[df.bundle_id == 1010].copy()
            mat_df = mat_df.query(
                "sex_id == 2 & age_start >= 10 & age_end <= 54")
            if mat_df.shape[0] == 0:
                return
            # NOTE sample size is dropped here, and we make a new one in
            # the following code
            mat_df = mat_df[['location_id', 'year_start', 'year_end',
                             'age_start', 'age_end', 'sex_id', 'mean_0',
                             'mean_1', 'mean_2', 'mean_3']].copy()
            bounds = ['upper_0', 'upper_1', 'upper_2', 'upper_3',
                      'lower_0', 'lower_1', 'lower_2', 'lower_3']
            for uncertainty in bounds:
                mat_df[uncertainty] = np.nan
            # PREP FOR POP
            ####################################################
            # we don't have years that we can merge pop onto yet, because
            # we aggregated to year bands; another problem with this
            # method
            mat_df['year_id'] = mat_df.year_start + 2  # 2000, 2005, 2010
            # bunch of age things so we can use age_group_id to get pop
            age_group = query("QUERY")
            # correct age groups
            age_group.loc[age_group['age_group_years_end'] > 1,
                          'age_group_years_end'] =\
                age_group.loc[age_group['age_group_years_end'] > 1,
                              'age_group_years_end'] - 1
            # df of unique ages from hospital data to merge onto the
            # age_group map
            mat_df_ages = pd.DataFrame(
                [mat_df.age_start.unique(),
                 mat_df.age_end.unique()]).transpose()
            mat_df_ages.columns = ['age_group_years_start',
                                   'age_group_years_end']
            mat_df_ages = mat_df_ages.merge(age_group, how='left',
                                            on=['age_group_years_start',
                                                'age_group_years_end'])
            # this is the correct terminal age group (even though we use
            # max age = 99)
            mat_df_ages.loc[mat_df_ages['age_group_years_start'] == 95,
                            'age_group_id'] = 235
            # there are two age_group_ids for age_start=0 and age_start=1
            mat_df_ages = mat_df_ages[mat_df_ages.age_group_id != 161]
            # create age/year/location lists to use for pulling population
            age_list = list(mat_df_ages.age_group_id.unique())
            loc_list = list(mat_df.location_id.unique())
            year_list = list(mat_df.year_id.unique())
            # GET POP
            ########################################################
            # pull population and merge on age_start and age_end
            pop = get_population(age_group_id=age_list,
                                 location_id=loc_list,
                                 sex_id=[1, 2], year_id=year_list)
            # FORMAT POP
            ####################################################
            # attach age_start and age_end to population information
            pop = pop.merge(age_group, how='left', on='age_group_id')
            pop.drop(['process_version_map_id', 'age_group_id'], axis=1,
                     inplace=True)
            # rename pop columns to match hospital data columns
            pop.rename(columns={'age_group_years_start': 'age_start',
                                'age_group_years_end': 'age_end'},
                       inplace=True)
            # correct terminal age group to match our data
            pop.loc[pop['age_end'] == 124, 'age_end'] = 99
            # MERGE POP
            ######################################################
            demography = ['location_id', 'year_id', 'age_start',
                          'age_end', 'sex_id']
            pre_shape = mat_df.shape[0]  # store for before comparison
            # then merge population onto the hospital data
            mat_df = mat_df.merge(pop, how='left', on=demography)
            assert pre_shape == mat_df.shape[0], ("number of rows don't "
                                                  "match after merge")
            # MAKE SAMPLE SIZE
            ##############################################
            mat_df['sample_size'] = mat_df.population * mat_df.mean_0
            # DROP intermediate columns
            mat_df.drop(['population', 'year_id'], axis=1, inplace=True)
            mat_df.to_hdf("FILEPATH", key='df', mode="w")
            # backup copy to _archive
            mat_df.to_hdf("FILEPATH", key='df', mode='w')

        _write_maternal_denom(df)

    if adjust_maternal_denom:
        def _adjust_maternal_denom(df):
            # renamed (underscore prefix) to avoid shadowing the boolean
            # argument of the same name.
            # drop sample_size; UTLAs already had it, but we need it for
            # everything, so we have to drop it
            df.drop('sample_size', axis=1, inplace=True)
            df = df.query("sex_id == 2 & age_start >= 10 & age_end <= 54")
            # read in maternal denoms; this is needed when our process is
            # parallelized
            denom = pd.read_hdf("FILEPATH", key="df")
            # denom.drop('bundle_id', axis=1, inplace=True)
            denom_cols = sorted(denom.filter(regex="[0-9]$").columns)
            for col in denom_cols:
                denom.rename(columns={col: col + "_denominator"},
                             inplace=True)
            pre = df.shape[0]
            df = df.merge(denom, how='left',
                          on=['location_id', 'year_start', 'year_end',
                              'age_start', 'age_end', 'sex_id'])
            assert pre == df.shape[0], ("shape should not have changed "
                                        "during merge")
            # print(df[df.mean_0_denominator.isnull()].shape)
            # print(df[df.mean_0_denominator.isnull()])
            df = df[(df['mean_0'] > 0) |
                    (df['mean_0_denominator'].notnull())]
            assert df.mean_0_denominator.isnull().sum() == 0, (
                "shouldn't be any null values in this column")
            # regex to find the columns that start with l, m or u and end
            # with a digit
            num_cols = sorted(df.filter(regex="^[lmu].*[0-9]$").columns)
            denom_cols =\
                sorted(df.columns[df.columns.str.endswith("denominator")])
            # divide each bundle value by bundle 1010 to get the adjusted
            # rate
            for i in np.arange(0, 12, 1):
                df[num_cols[i]] = df[num_cols[i]] / df[denom_cols[i]]
            # drop the denominator columns
            df.drop(denom_cols, axis=1, inplace=True)
            # can't divide by zero
            df = df[df['sample_size'] != 0]
            # RETURN ONLY THE MATERNAL DATA
            return df

        df = _adjust_maternal_denom(df)
    return df

        '../save_custom_results.py "{}" "{}" "{}" "{}" "{}" "{}" "{}" "{}"'
        .format(process_vers,
                'FILEPATH{}/single_year/draws'.format(process_vers),
                'draws_{year_id}_366.h5', 1990, 2019, 'sdg',
                'MMR {}'.format(process_vers), 4))
    subprocess.call(call, shell=True)


if __name__ == '__main__':
    print("Initiating script.")
    decomp_step, gbd_round_id, conn_def = sys.argv[1:4]
    gbd_round_id = int(gbd_round_id)
    print("Getting cause metadata")
    cause_df = get_cause_metadata(8, gbd_round_id=gbd_round_id)
    print("Getting causes")
    # only most-detailed and root cause
    causes = cause_df.loc[(cause_df.most_detailed == 1) |
                          (cause_df.level == 0)].cause_id.unique().tolist()
    codcorrect_vers = get_best_codcorrect_vers(decomp_step, gbd_round_id)
    print("Setting process version")
    process_vers = Uploader(conn_def, codcorrect_vers, decomp_step,
                            int(gbd_round_id)).prep_upload()
    # NOTE: this hard-coded override discards the process version prepped
    # above; it looks like leftover debugging
    process_vers = 14774
    mmr_out_dir, arc_out_dir = set_out_dirs(process_vers)
    print("Launching save_birth_estimates")
    launch_save_birth_estimate_job(gbd_round_id, decomp_step, process_vers)

# Data from XX, XX, and XX were ICD-10 coded, data from XX and XX were
# ICD-9 coded, and data from XX contained both ICD-9 and ICD-10 coded
# deaths.
df.groupby("location_name", as_index=False).agg(
    {"code_system_id": "unique"}).to_csv(
        "/home/j/temp/agesak/thesis/tables/icd_systems.csv", index=False)

# number of injuries-related deaths - need total # of deaths for each
# source
# df.groupby("location_name", as_index=False).agg({"deaths": "sum"})

# Deaths where an injuries-related ICD code was the underlying cause of
# death were mapped to one of XX most-detailed GBD injuries causes.
causes = get_cause_metadata(gbd_round_id=6, cause_set_id=3)
injuries = causes.loc[causes.acause.str.contains("inj")]
len(injuries.query("most_detailed==1"))

# Sentence: Of the XX million deaths available in these records, XX% were
# injuries related, with XX% of these injuries deaths being garbage coded.
# Part 1: Of the XX million deaths available in these records,
# (could pick any int cause here)
df = get_mcause_data(
    phase='format_map', sub_dirs="sepsis",
    source=["TWN_MOH", "MEX_INEGI", "BRA_SIM", "USA_NVSS", "COL_DANE",
            "ITA_ISTAT"],
    verbose=True, **{"force_rerun": True, "block_rerun": False})
# total deaths

def _launch_cod_splits(source_cause_id, target_cause_ids, target_meids,
                       prop_meas_id, gbd_round_id, decomp_step, output_dir,
                       project):
    """Split the given source_cause_id given target_meid proportions,
    saved to the target_cause_ids in output_dir.

    Arguments:
        source_cause_id (int): cause_id for the draws to be split
        target_cause_ids (intlist): list of cause ids that you want the
            new outputted subcauses to be identified by
        target_meids (intlist): list of proportion models'
            modelable_entity_ids that you want the source_cause_id to be
            split by, to make the target_cause_ids. target_cause_ids and
            target_meids must be specified in the same order
        prop_meas_id (int): the measure_id that identifies the proportion
            in the target_meids to use for the split
        gbd_round_id (int): the gbd_round_id for models being split
        decomp_step (str): specifies which decomposition step the
            returned estimates should be from. If using interpolate for
            GBD round 6 and above, must specify one of 'step1', 'step2',
            'step3', 'step4', 'step5', or 'iterative'
        output_dir (str): directory where you want final results stored
        project (str): the SGE project to launch split_cod_model subjobs
            to using SplitCoDSwarm

    Returns:
        A list of tuples with each location_id paired with either 0 or an
        error message. This is then parsed in the central function
        draw_ops.split_cod_model into errors or success messages.
    """
    # setup years, sex restrictions, most detailed locations, etc.
    if gbd_round_id >= 6:
        cause_set_id = COMPUTATION_CAUSE_SET_ID
    else:
        cause_set_id = REPORTING_CAUSE_SET_ID
    causes = get_cause_metadata(
        cause_set_id=cause_set_id,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step).query("cause_id==@source_cause_id")
    sex_ids = []
    if causes['male'].item() != 0:
        sex_ids.append(1)
    if causes['female'].item() != 0:
        sex_ids.append(2)
    if not sex_ids:
        raise ValueError(
            "Source_cause_id {} is restricted for both males and females, "
            "according to cause metadata".format(source_cause_id))
    most_detailed_locs = list(
        get_location_metadata(35, gbd_round_id=gbd_round_id,
                              decomp_step=decomp_step).query(
                                  'most_detailed==1').location_id.unique())
    meid_cause_map = dict(zip(target_meids, target_cause_ids))
    # run interpolating/extrapolating
    intermediate_dir = os.path.join(
        output_dir, 'intermediate_{}'.format(source_cause_id))
    if not os.path.exists(intermediate_dir):
        makedirs_safely(intermediate_dir)
    swarm = SplitCoDSwarm(source_id=source_cause_id,
                          proportion_ids=target_meids,
                          proportion_measure_id=prop_meas_id,
                          sex_ids=sex_ids,
                          gbd_round_id=gbd_round_id,
                          decomp_step=decomp_step,
                          intermediate_dir=intermediate_dir,
                          outdir=output_dir,
                          project=project)
    swarm.add_interpolate_tasks()
    exit_code = swarm.run()
    if exit_code != 0:
        raise RuntimeError(
            "Interpolating CoD years failed. Check logs in {}.".format(
                output_dir))
    # run splitting
    for cid in target_cause_ids:
        cid_dir = os.path.join(output_dir, str(cid))
        if not os.path.exists(cid_dir):
            makedirs_safely(cid_dir)
    file_list = glob.glob(os.path.join('{}/*.h5'.format(intermediate_dir)))
    # read in draws for source cause
    source = _get_draws(source_cause_id, gbd_round_id, decomp_step, sex_ids)
    # create a temporary directory to store all the draws from the source
    # cause
    tmpdir = tempfile.TemporaryDirectory(dir=output_dir)
    # save source cause draws to the temporary directory
    source.to_hdf(
        os.path.join(tmpdir.name, 'source_cause_draws.h5'),
        key='draws', mode='w', format='table',
        data_columns=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    run_splits = functools.partial(_parallel_merge_split, meid_cause_map,
                                   file_list, output_dir, tmpdir)
    pool = Pool(30)
    res = pool.map(run_splits, most_detailed_locs)
    pool.close()
    pool.join()
    # clean up tempdir
    tmpdir.cleanup()
    return res
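
# Hedged usage note: per the docstring, this helper is driven by the
# central function draw_ops.split_cod_model. A direct call would look
# like the sketch below; every id here is illustrative (measure_id 18 is
# the GBD proportion measure), not taken from this module.
#
# res = _launch_cod_splits(
#     source_cause_id=294, target_cause_ids=[500, 501],
#     target_meids=[1500, 1501], prop_meas_id=18, gbd_round_id=6,
#     decomp_step='step4', output_dir='FILEPATH', project='proj_custom')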