def apply_envelope_only(df, env_path, return_only_inj=False,
                        apply_age_sex_restrictions=True, want_to_drop_data=True,
                        create_hosp_denom=False, apply_env_subroutine=False,
                        fix_norway_subnat=False):
    """
    Function that converts hospital data into rates.  Takes hospital data in
    count space, at the baby sequelae level, computes cause fractions, attaches
    the hospital utilization envelope, and multiplies the envelope and the
    cause fractions.  Data that represent fully covered populations are not
    made into cause fractions, nor is the envelope applied; instead, their
    counts are divided by population.  Returns a DataFrame that is in rate
    space at the baby sequelae level.

    Arguments:
        df (DataFrame): contains hospital inpatient primary diagnosis that has
            been mapped to baby sequelae.
        return_only_inj (bool): switch that makes this function return only
            injuries data.  Should always be off; it's just there for
            testing/convenience.
        apply_age_sex_restrictions (bool): if True, applies age and sex
            restrictions as determined by cause_set_id=9.  Defaults to True.
            Useful if you want to look at all the data that was present in
            the source.
        want_to_drop_data (bool): if True, runs drop_data(verbose=False) from
            the hosp_prep module.
        create_hosp_denom (bool): if True, create_cause_fraction() will write
            the denominator for later use with imputed zeros.
    """

    ###############################################
    # DROP DATA
    ###############################################

    if want_to_drop_data:
        df = hosp_prep.drop_data(df, verbose=False)

    if not df.age_group_id.notnull().all():
        warnings.warn("""Shouldn't be any null age_group_id, there could be
        unsplit ages in the data, or maybe an age_start/age_end changed""")

    # Make a list of sources that we don't want to use the envelope with.
    # These should be the values in the 'source' column of the dataframe.
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    if pd.Series(full_coverage_sources).isin(df.source.unique()).any():
        # make a dataframe with the fully covered sources
        full_coverage_df = df[df.source.isin(full_coverage_sources)].copy()

        # get denominator (aka population)
        full_coverage_df = gbd_hosp_prep.get_sample_size(full_coverage_df,
                                                         fix_group237=True)

        full_coverage_df.rename(columns={'population': 'sample_size'},
                                inplace=True)

        # make product
        full_coverage_df['product'] =\
            full_coverage_df.val / full_coverage_df.sample_size

    else:
        # make an empty dataframe so that it exists and won't break anything
        full_coverage_df = pd.DataFrame(columns=list(df.columns) + ['product'])

    # drop these sources from the main dataframe for now
    df = df[~df.source.isin(full_coverage_sources)].copy()

    #########################################
    # CREATE CAUSE FRACTIONS
    #########################################
    if not apply_env_subroutine:
        df = hosp_prep.create_cause_fraction(
            df,
            create_hosp_denom=create_hosp_denom,
            store_diagnostics=True,
            tol=1e-6)

    if apply_age_sex_restrictions:
        df = hosp_prep.apply_restrictions(df, 'cause_fraction')

        if full_coverage_df.shape[0] > 0:
            full_coverage_df = hosp_prep.apply_restrictions(
                full_coverage_df, 'product')

        # drop the restricted values
        df = df[df['cause_fraction'].notnull()]
        full_coverage_df =\
            full_coverage_df[full_coverage_df['product'].notnull()]

    if env_path[-3:] == "csv":
        env_df = pd.read_csv(env_path)
    else:
        env_df = pd.read_hdf(env_path, key='df')

    # split the envelope into Norway subnationals
    if fix_norway_subnat:
        # make a df of national and subnational locations
        nor_locs = [4910, 4911, 4912, 4913, 4914, 4915, 4916, 4917, 4918,
                    4919, 4920, 4921, 4922, 4923, 4926, 4927, 4928, 53432]

        # repeat Norway's national location_id n times
        nat_locs = np.repeat(90, len(nor_locs))

        nor_subnat_df = pd.DataFrame({'sub_location_id': nor_locs,
                                      'location_id': nat_locs})

        # merge the envelope on by national location
        nor_subnat_df = nor_subnat_df.merge(env_df, how='left',
                                            on='location_id')

        # set the subnational locations to the proper location_id
        nor_subnat_df.drop('location_id', axis=1, inplace=True)
        nor_subnat_df.rename(columns={'sub_location_id': 'location_id'},
                             inplace=True)

        # append the new locations onto the envelope
        env_df = pd.concat([env_df, nor_subnat_df], ignore_index=True)

        assert env_df[env_df.location_id.isin(nor_locs)].shape[0] > 5000,\
            ("Some of the Norway subnational locations {} weren't merged on. "
             "{}".format(
                 nor_locs,
                 env_df[env_df.location_id.isin(nor_locs)].location_id.unique()))

    env_df.drop(['age_start', 'age_end'], axis=1, inplace=True)

    demography = ['location_id', 'year_start', 'year_end', 'age_group_id',
                  'sex_id']

    ############################################
    # MERGE ENVELOPE onto data
    ############################################
    pre_shape = df.shape[0]
    df = df.merge(env_df, how='left', on=demography)
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # compute the hospitalization rate, aka "apply the envelope"
    df['product'] = df['cause_fraction'] * df['mean']
    df['upper_product'] = df['cause_fraction'] * df['upper']
    df['lower_product'] = df['cause_fraction'] * df['lower']

    ############################################
    # RE-ATTACH data that has sample size
    ############################################

    df = pd.concat([df, full_coverage_df]).reset_index(drop=True)

    # if we just want injuries:
    if return_only_inj:
        inj_bundle = pd.read_csv(root + r"FILEPATH"
                                 r"parent_child_injuries_gbd2017.csv")
        inj_bundle = list(inj_bundle['Level1-Bundle ID'])
        df = df[df['bundle_id'].isin(inj_bundle)]

    if apply_env_subroutine:
        df.drop(['cause_fraction', 'mean', 'upper', 'lower'],
                axis=1, inplace=True)
        df.rename(columns={'product': 'mean',
                           'lower_product': 'lower',
                           'upper_product': 'upper'}, inplace=True)
    else:
        df.drop(['cause_fraction', 'mean', 'upper', 'lower',
                 'val', 'numerator', 'denominator'],
                axis=1, inplace=True)

    return df
def run_age_sex_splitting(df, split_sources=[], round_id=0, verbose=False):
    """
    Takes a dataframe of data that you want to split and a list of sources
    that need splitting, and then uses age_sex_splitting.py to split.  As of
    6/16/2017 this function assumes that the weights you want to use are
    already present.  There is some skeletal stuff to deal with multiple
    rounds of weights, but it's not currently fleshed out.
    """

    # write data used for visualization
    if round_id == 1 or round_id == 2:
        # save pre-split data for viz; attach location name and bundle name
        pre_split = df[df.source.isin(split_sources)]
        pre_split = pre_split.merge(query(SQL QUERY), how='left',
                                    on='location_id')
        pre_split.drop(['year_end', 'age_group_unit', 'age_end', 'nid',
                        'facility_id', 'representative_id', 'diagnosis_id',
                        'outcome_id', 'metric_id'], axis=1, inplace=True)
        pre_split.to_csv("FILEPATH", index=False)

    # drop data and apply age/sex restrictions
    df = hosp_prep.drop_data(df, verbose=False)
    df = hosp_prep.apply_restrictions(df, col_to_restrict='val')

    # list of column names to compare against later
    pre_cols = df.columns

    # we need to fix the terminal ages by source
    lst = []
    for asource in df.source.unique():
        subdf = df[df['source'] == asource].copy()
        old_max_age = subdf.age_end.max()  # store the old, bad terminal age
        # replace the terminal age_end so that age_sex_split will recognize
        # this as a "bad" age group
        subdf.loc[subdf.age_end == old_max_age, 'age_end'] = 124
        lst.append(subdf)
        del subdf
    df = pd.concat(lst)
    del lst

    # make age_end match what is in the shared DB
    df.loc[df['age_end'] > 1, 'age_end'] =\
        df.loc[df['age_end'] > 1, 'age_end'] + 1

    # get age info
    age_group_info = query(SQL QUERY)

    # this age group is wrong
    age_group_info = age_group_info[age_group_info['age_group_id'] != 161]
    # has the same age_start and age_end as 22, results in duplicates
    age_group_info = age_group_info[age_group_info['age_group_id'] != 27]
    age_group_info.rename(columns={"age_group_years_start": "age_start",
                                   "age_group_years_end": "age_end"},
                          inplace=True)

    # merge ages on
    pre = df.shape[0]
    df = df.merge(age_group_info, how="left", on=['age_start', 'age_end'])
    assert pre == df.shape[0],\
        ("the number of rows changed after merging on age_group_id, probably "
         "because there are 2 ids that share the same age_start and age_end")

    # drop age start/end for simplicity
    df.drop(['age_start', 'age_end'], axis=1, inplace=True)

    df_list = []  # initialize empty list to concat split dfs to

    # columns that uniquely identify rows of data
    id_cols = ['source', 'nid', 'nonfatal_cause_name', 'year_id',
               'age_group_id', 'sex_id', 'location_id']

    perfect_ages = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                    28, 30, 31, 32, 235]

    rows = df.shape[0]
    numer = 0

    # for source in split_sources:
    for source in df.source.unique():
        # make a df of one source that needs to be split
        splitting_df = df[df.source == source].copy()
        numer += splitting_df.shape[0]
        print(source)
        print("working: {}% done".format(float(numer) / rows * 100))

        if verbose:
            print("{}'s age groups before:".format(source))
            print(splitting_df[['age_group_id']].
                  drop_duplicates().sort_values(by='age_group_id'))

        # create year_id
        splitting_df['year_id'] = splitting_df['year_start']
        splitting_df.drop(['year_start', 'year_end'], axis=1, inplace=True)

        if verbose:
            print("now splitting {}".format(source))

        # check if the data needs to be split; if not, append to df_list and
        # move on
        if set(splitting_df.age_group_id).\
                symmetric_difference(set(perfect_ages)) == set():
            df_list.append(splitting_df)
            continue

        # the function from the CoD team that does the splitting
        split_df = age_sex_split.split_age_sex(
            splitting_df, id_cols, value_column='val',
            level_of_analysis='nonfatal_cause_name',
            fix_gbd2016_mistake=False)

        if verbose:
            print("Orig value sum {} - New value sum {} = {} \n".format(
                splitting_df.val.sum(), split_df.val.sum(),
                splitting_df.val.sum() - split_df.val.sum()))

        pre_val = splitting_df.val.sum()
        post_val = split_df.val.sum()
        assert pre_val - post_val <= (pre_val * .0001),\
            "Too many cases were lost"

        if verbose:
            print("{}'s ages after:".format(source))
            print(split_df[['age_group_id']].
                  drop_duplicates().sort_values(by='age_group_id'))

        # append split data to our list of dataframes
        df_list.append(split_df)

    # bring the list of split dfs back together
    df = pd.concat(df_list).reset_index(drop=True)

    df = df.merge(age_group_info, how='left', on='age_group_id')

    # change age_end to match the rest of the data
    df.loc[df.age_end > 1, 'age_end'] =\
        df.loc[df.age_end > 1, 'age_end'] - 1
    df.loc[df.age_end == 124, 'age_end'] = 99

    # check that we have the right number of age groups
    assert df[['age_start', 'age_end', 'age_group_id']].drop_duplicates()\
        .shape[0] == 21

    # prepare to reattach split data to the main df
    df['year_start'] = df['year_id']
    df['year_end'] = df['year_id']
    df.drop(['year_id', 'age_group_id'], axis=1, inplace=True)

    assert set(df.columns).symmetric_difference(set(pre_cols)) == set()

    # data used for visualization
    if round_id == 1 or round_id == 2:
        df = df.merge(query(SQL QUERY))  # ?
        df_for_tableau = df.copy()
        df.drop(['year_end', 'age_group_unit', 'age_end', 'nid',
                 'facility_id', 'representative_id', 'diagnosis_id',
                 'outcome_id', 'metric_id'],
                axis=1).to_csv("FILEPATH", index=False)

    return df
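# Illustrative helper (not called by the pipeline): the same
# symmetric_difference check run_age_sex_splitting uses to decide whether a
# source already reports the exact set of GBD age_group_ids, in which case it
# is appended unchanged instead of being passed to age_sex_split.split_age_sex.
def _example_needs_splitting(observed_age_group_ids):
    perfect_ages = {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                    28, 30, 31, 32, 235}
    # an empty symmetric difference means the source matches exactly
    return len(set(observed_age_group_ids).
               symmetric_difference(perfect_ages)) > 0

# _example_needs_splitting([28, 5, 6])    -> True, aggregated ages must be split
# _example_needs_splitting(range(5, 21))  -> True, missing 28, 30, 31, 32, 235
# _example_needs_splitting(
#     list(range(5, 21)) + [28, 30, 31, 32, 235]) -> False, nothing to split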
def aggregate_to_bundle(df, write_maternal_denom=False,
                        adjust_maternal_denom=False):
    """
    Takes a dataframe aggregated to the baby sequelae level and returns the df
    aggregated to 5 year bands at the bundle level.
    """

    ##############################
    # PULL POPULATION
    #
    # This is preparation for aggregating to 5 year bands
    ##############################

    # Get rid of all corrected values and correction factors.  We want to do
    # this because we are going to aggregate to 5 year bands and the bundle_id
    # level of analysis.  We intend to make cells where the correction factor
    # is over 50 NULL.  It doesn't make sense to do that before aggregating,
    # because any rows with nulls in them would be lost entirely.  It doesn't
    # make sense to apply corrections and NOT delete cells that break the
    # over-50 rule before aggregation either, because we would lose the
    # correction factors during the groupby.  Also, it would not be robust to
    # aggregate 16 columns.  Therefore, it's best to apply corrections after
    # aggregating.  Technically, the groupby would get rid of these for us;
    # this just makes things easier to work with up to that point.
    df.drop(['mean_1', 'mean_2', 'mean_3', 'mean_inj',
             'upper_1', 'upper_2', 'upper_3', 'upper_inj',
             'lower_1', 'lower_2', 'lower_3', 'lower_inj',
             'correction_factor_1', 'correction_factor_2',
             'correction_factor_3', 'correction_factor_inj'],
            axis=1, inplace=True)

    if write_maternal_denom:
        df = df[df['bundle_id'] == 1010]  # drop the non-maternal data pronto

    if adjust_maternal_denom:
        # GET MATERNAL CAUSES
        causes = get_cause_metadata(cause_set_id=9)
        # cause_id 366 happens to always be in the third level
        condition = causes.path_to_top_parent.str.contains("366")

        # subset just the causes that meet the condition
        maternal_causes = causes[condition]

        # make a list of maternal causes
        maternal_list = list(maternal_causes['cause_id'].unique())

        # get bundle to cause map
        bundle_cause = query("QUERY")

        # merge cause_id onto data
        df = df.merge(bundle_cause, how='left', on='bundle_id')

        # keep only maternal causes
        df = df[df['cause_id'].isin(maternal_list)]

        # drop cause_id
        df.drop('cause_id', axis=1, inplace=True)

        # drop the denominator bundle
        df = df[df['bundle_id'] != 1010]

    ##############################
    # GET POPULATION
    ##############################
    # pull age_group to age_start/age_end map
    age_group = query("QUERY")

    # correct age groups
    age_group.loc[age_group['age_group_years_end'] > 1,
                  'age_group_years_end'] =\
        age_group.loc[age_group['age_group_years_end'] > 1,
                      'age_group_years_end'] - 1

    # df of unique ages from hospital data to merge onto the age_group map
    df_ages = pd.DataFrame([df.age_start.unique(),
                            df.age_end.unique()]).transpose()
    df_ages.columns = ['age_group_years_start', 'age_group_years_end']
    df_ages = df_ages.merge(age_group, how='left',
                            on=['age_group_years_start',
                                'age_group_years_end'])

    # this is the correct terminal age group (even though we use max age = 99)
    df_ages.loc[df_ages['age_group_years_start'] == 95, 'age_group_id'] = 235

    # there are two age_group_ids for age_start=0 and age_start=1
    df_ages = df_ages[df_ages.age_group_id != 161]

    # create age/year/location lists to use for pulling population
    age_list = list(df_ages.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())

    # pull population and merge on age_start and age_end
    pop = get_population(age_group_id=age_list, location_id=loc_list,
                         sex_id=[1, 2], year_id=year_list)

    # attach age_start and age_end to the population information
    pop = pop.merge(age_group, how='left', on='age_group_id')
    pop.drop(['process_version_map_id', 'age_group_id'], axis=1, inplace=True)

    # rename pop columns to match hospital data columns
    pop.rename(columns={'age_group_years_start': 'age_start',
                        'age_group_years_end': 'age_end',
                        'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']

    # correct the terminal age group to match our data
    pop.loc[pop['age_end'] == 124, 'age_end'] = 99

    demography = ['location_id', 'year_start', 'year_end', 'age_start',
                  'age_end', 'sex_id']

    ##############################
    # MAKE DATA SQUARE
    ##############################

    # create a series of sorted, non-zero mean values to make sure the
    # squaring function doesn't alter anything
    check_mean = df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().\
        reset_index(drop=True)

    print("Starting number of rows: {}".format(df.shape[0]))
    # square the dataset
    df = hosp_prep.make_zeroes(df, level_of_analysis='bundle_id',
                               cols_to_square=['mean_0', 'upper_0', 'lower_0'],
                               icd_len=5)

    # assert the sorted means are identical
    assert (check_mean == df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().
            reset_index(drop=True)).all()

    # delete rows where restrictions should be applied;
    # create a df where baby sequelae are missing
    missing_nfc = df[df.nonfatal_cause_name.isnull()].copy()
    df = df[df.nonfatal_cause_name.notnull()]
    for col in ['mean_0', 'upper_0', 'lower_0']:
        df = hosp_prep.apply_restrictions(df, col)
        missing_nfc = hosp_prep.apply_bundle_restrictions(missing_nfc, col)
        df = df[df[col].notnull()]
    df = pd.concat([df, missing_nfc])

    print("Square number of rows: {}".format(df.shape[0]))

    # don't create the parent injury dupes until after the data is totally
    # square so that the denominators will match
    # df = get_parent_injuries(df)
    # check_parent_injuries(df, 'mean_0')

    pre_shape = df.shape[0]  # store for before comparison
    # then merge population onto the hospital data
    df = df.merge(pop, how='left', on=demography)  # attach pop info to hosp
    assert pre_shape == df.shape[0], "number of rows don't match after merge"

    ##############################
    # RATE SPACE TO COUNTS
    ##############################
    # go from rate space to additive counts;
    # because of careful merging we just need to multiply across columns
    df['hosp_count'] = df['mean_0'] * df['population']
    df['upper_hosp_count'] = df['upper_0'] * df['population']
    df['lower_hosp_count'] = df['lower_0'] * df['population']

    # merge on "denominator" from the file that was made back in
    # create_cause_fraction.  This adds the denominator column, which is the
    # number of admissions in a demographic group.
    df = df.merge(pd.read_csv("FILEPATH"),
                  how='left',
                  on=["age_start", "age_end", "sex_id", "year_start",
                      "year_end", "location_id"])

    # 5 year bins
    df = hosp_prep.year_binner(df)

    # add 5 year NIDs onto data
    df = hosp_prep.five_year_nids(df)

    ##################################################################
    ##################################################################
    #                         THE COLLAPSE                           #
    ##################################################################
    ##################################################################
    # The final collapse to 5 year bands, 5 year NIDs and bundle_id.
    # This is doing a few things at once.  One is that we need to aggregate
    # to 5 year bands.  Another is aggregating to the bundle_id level of
    # analysis.  Up to this point we were at the nonfatal_cause_name, AKA
    # baby sequelae, level of analysis.

    # WE NEED TWO COLLAPSES.  One for data that doesn't have a sample_size
    # value, and another for the data that does.  This is because:
    # https://goo.gl/e66OZ4 and https://goo.gl/Fb78xi

    # make a df of data where there is full coverage (i.e., the UK)
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    # make a condition mask that indicates rows that have full coverage
    has_full_coverage = df.source.isin(full_coverage_sources)

    covered_df = df[has_full_coverage]

    # drop "denominator" from covered_df
    covered_df = covered_df.drop("denominator", axis=1)

    # drop this data from the main dataframe
    df = df[~has_full_coverage]

    assert (df.loc[df.denominator.isnull(), 'mean_0'] == 0).all(),\
        "mean_0 should be 0"
    assert (df.loc[df.denominator.isnull(), 'lower_0'] == 0).all(),\
        "lower_0 should be 0"
    assert (df.loc[df.denominator.isnull(), 'upper_0'] == 0).all(),\
        "upper_0 should be 0"

    df = df[df.denominator.notnull()]

    # df already has a sample_size column; drop it, "denominator" replaces it
    df.drop("sample_size", axis=1, inplace=True)

    # rename "denominator" to "sample_size" in df (not covered_df)
    df.rename(columns={"denominator": "sample_size"}, inplace=True)

    # check if cases are lost in the groupby
    pre_cases = df['hosp_count'].sum()

    # we can use the same group columns for both dataframes
    groups = ['location_id', 'year_start', 'year_end', 'age_start', 'age_end',
              'sex_id', 'nid', 'representative_id', 'bundle_id']

    # sample_size has some null values from being made square, but it was just
    # population, so we're using pop instead.  Remember,
    # population == sample_size for covered_df.
    covered_df = covered_df.groupby(groups)\
        .agg({'hosp_count': 'sum',
              'population': 'sum'}).reset_index()

    # add "sample_size" to the aggregate function
    df = df.groupby(groups).agg({'hosp_count': 'sum',
                                 'upper_hosp_count': 'sum',
                                 'lower_hosp_count': 'sum',
                                 'population': 'sum',
                                 'sample_size': 'sum'}).reset_index()

    assert round(pre_cases, 0) == round(df['hosp_count'].sum(), 0),\
        ("some cases were lost. "
         "From {} to {}".format(pre_cases, df['hosp_count'].sum()))

    # set sample_size to np.nan when mean/upper/lower are greater than 0
    df.loc[(df['hosp_count'] > 0) & (df['lower_hosp_count'] > 0) &
           (df['upper_hosp_count'] > 0), 'sample_size'] = np.nan

    ##############################
    # COUNTS TO RATE SPACE
    ##############################
    # REMAKE mean and uncertainty

    # for the main df:
    df['mean_0'] = df['hosp_count'] / df['population']
    df['lower_0'] = df['lower_hosp_count'] / df['population']
    df['upper_0'] = df['upper_hosp_count'] / df['population']
    df.drop(['hosp_count', 'lower_hosp_count', 'upper_hosp_count'],
            axis=1, inplace=True)

    # add parent injuries
    # NOTE get_parent_injuries is run before covered_df is concatenated with
    # df.  It happens not to make a difference at the moment, because there
    # are no injuries in covered_df, but there could be in the future.
    df = get_parent_injuries(df)

    for col in ['mean_0', 'lower_0', 'upper_0']:
        hosp_prep.check_parent_injuries(df, col_to_sum=col)

    # this drops the population that was merged on for converting to counts
    df.drop('population', axis=1, inplace=True)  # don't need pop anymore

    # for the covered df:
    covered_df['mean_0'] = covered_df['hosp_count'] / covered_df['population']
    covered_df.rename(columns={"population": "sample_size"}, inplace=True)
    # drop columns
    covered_df.drop(['hosp_count'], axis=1, inplace=True)

    ###############################
    # RE-ATTACH
    ###############################
    # bring covered_df and df together.
    # where we have full coverage, lower and upper should be null;
    # mean_0 will never be null
    df = pd.concat([df, covered_df], ignore_index=True)

    # rebuild the full-coverage mask on the new index; the pre-collapse mask
    # no longer aligns with this dataframe.  Rows from covered_df sit at the
    # end of the concatenated frame.
    has_full_coverage = df.index >= (df.shape[0] - covered_df.shape[0])

    # assert what we just said will be true in the comments above:
    assert df.loc[has_full_coverage, 'lower_0'].isnull().all(),\
        "where we have full coverage, lower_0 should be null"
    assert df.loc[has_full_coverage, 'upper_0'].isnull().all(),\
        "where we have full coverage, upper_0 should be null"
    assert df.mean_0.notnull().all(), "mean_0 should never be null"

    # NOTE, remember, sample_size will still have null values, and that's
    # okay; we need to keep sample_size from here on out.

    if "population" in df.columns:
        print("population was still in columns")
        df.drop("population", axis=1, inplace=True)

    ########################################
    # map measure onto data; we just need this for ELMO reqs
    clean_maps = pd.read_csv("FILEPATH")
    clean_maps = clean_maps[['bundle_id', 'bid_measure']]
    clean_maps.drop_duplicates(inplace=True)
    clean_maps.rename(columns={'bid_measure': 'measure'}, inplace=True)
    # remove null bundles from the map
    clean_maps = clean_maps[clean_maps.bundle_id.notnull()]

    pre_shape = df.shape[0]  # store for comparison after merge
    # merge measure onto hosp data using bundle_id
    df = df.merge(clean_maps, how='left', on='bundle_id')
    assert pre_shape == df.shape[0], "number of rows don't match after merge."

    # get injuries bids so we can check for missing measures
    pc_injuries = pd.read_csv("FILEPATH")
    inj_bids = pc_injuries['Level1-Bundle ID'].unique()

    # some injuries bids didn't get measures!
    assert set(df[df.measure.isnull()].bundle_id).issubset(set(inj_bids)),\
        ("We expect that all null measures belong to injuries, but that is "
         "not the case. Something went wrong!")

    # fix any injuries that are missing measure; all inj are incidence:
    df.loc[(df.measure.isnull()) & (df.bundle_id.isin(inj_bids)),
           'measure'] = 'inc'

    assert df.measure.isnull().sum() == 0, ("There are null values and we "
                                            "expect none")

    # read in correction factors (again)
    correction_factors = pd.read_csv("FILEPATH")
    correction_factors.drop("outpatient", axis=1, inplace=True)
    # rename columns to match df
    correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)

    # merge correction factors onto data
    df = df.merge(correction_factors, how='left',
                  on=['age_start', 'sex_id', 'bundle_id'])
    assert pre_shape == df.shape[0], ("You unexpectedly added rows while "
                                      "merging on the correction factors. "
                                      "Don't do that!")

    # if a bundle_id doesn't have a correction factor from Marketscan, use 1
    # df.update(df[['a','b','c']].fillna(0))  # test code
    # http://stackoverflow.com/questions/36556256/how-do-i-fill-na-values-in-multiple-columns-in-pandas
    df.update(df[['indv_cf', 'incidence', 'prevalence', 'injury_cf']].fillna(1))

    # rename correction factors to match what we told people they would be
    df.rename(columns={'indv_cf': 'correction_factor_1',
                       'incidence': 'correction_factor_2',
                       'prevalence': 'correction_factor_3',
                       'injury_cf': 'correction_factor_inj'}, inplace=True)

    # NOTE we apply every correction factor to all data, even if it is not
    # relevant.  E.g., not all data is injuries, so not all data needs
    # correction_factor_inj.  It's simply easier to apply all of them and then
    # drop the irrelevant columns while writing to modelers' folders.

    # make mean_1, lower_1, upper_1
    df['mean_1'] = df.correction_factor_1 * df.mean_0
    df['lower_1'] = df.correction_factor_1 * df.lower_0
    df['upper_1'] = df.correction_factor_1 * df.upper_0

    # make mean_2, lower_2, upper_2
    df['mean_2'] = df.correction_factor_2 * df.mean_0
    df['lower_2'] = df.correction_factor_2 * df.lower_0
    df['upper_2'] = df.correction_factor_2 * df.upper_0

    # make mean_3, lower_3, upper_3
    df['mean_3'] = df.correction_factor_3 * df.mean_0
    df['lower_3'] = df.correction_factor_3 * df.lower_0
    df['upper_3'] = df.correction_factor_3 * df.upper_0

    # make injury mean, lower, upper
    df['mean_inj'] = df.correction_factor_inj * df.mean_0
    df['lower_inj'] = df.correction_factor_inj * df.lower_0
    df['upper_inj'] = df.correction_factor_inj * df.upper_0

    # assert what we just said will be true in the comments above:
    levels = ["1", "2", "3", "inj"]
    for level in levels:
        assert df.loc[has_full_coverage,
                      'lower_{}'.format(level)].isnull().all(),\
            "broke on level {}".format(level)
        assert df.loc[has_full_coverage,
                      'upper_{}'.format(level)].isnull().all(),\
            "broke on level {}".format(level)
        assert df["mean_{}".format(level)].notnull().all(),\
            "broke on level {}".format(level)

    def factor_applier(df, levels=["1", "2", "3", "inj"]):
        for level in levels:
            # initialize cols
            df['test_mean_' + level] = np.nan
            df['test_lower_' + level] = np.nan
            df['test_upper_' + level] = np.nan
            # apply correction factors
            df['test_mean_' + level], df['test_lower_' + level],\
                df['test_upper_' + level] =\
                df["correction_factor_" + level] * df['mean_0'],\
                df["correction_factor_" + level] * df['lower_0'],\
                df["correction_factor_" + level] * df['upper_0']
        return df

    df = factor_applier(df)

    levels = ["1", "2", "3", "inj"]
    for level in levels:
        assert (df.loc[df["mean_" + level].notnull(), "mean_" + level] ==
                df.loc[df["test_mean_" + level].notnull(),
                       "test_mean_" + level]).all(),\
            "different on level {}".format(level)
        assert (df.loc[df["upper_" + level].notnull(), "upper_" + level] ==
                df.loc[df["test_upper_" + level].notnull(),
                       "test_upper_" + level]).all(),\
            "different on level {}".format(level)
        assert (df.loc[df["lower_" + level].notnull(), "lower_" + level] ==
                df.loc[df["test_lower_" + level].notnull(),
                       "test_lower_" + level]).all(),\
            "different on level {}".format(level)

    # drop the test cols for now until we run this for a while without
    # tripping the asserts
    test_cols = df.columns[df.columns.str.startswith("test_")]
    df.drop(test_cols, axis=1, inplace=True)

    # RULE: if a correction factor is greater than 50, make the data null.
    # EXCEPTIONS are made for these bundles, which are capped at 100:
    #   Preterm: 80, 81, 82, 500
    #   Encephalopathy: 338
    #   Sepsis: 92
    #   Hemolytic: 458
    #   PAD/PUD: 345
    #   Cirrhosis: 131

    # list of bundles which can have correction factors above 50
    cf_exceptions = [345, 80, 81, 82, 500, 338, 92, 458, 131]

    # NOTE when checking the number of nulls, consider the nulls that are
    # caused by the sample_size split
    def mean_capper(df, exceptions, levels=["1", "2", "3"]):
        exception_condition = df.bundle_id.isin(exceptions)
        for level in levels:
            df.loc[(~exception_condition) &
                   (df['correction_factor_' + level] > 50),
                   ['mean_' + level, 'lower_' + level,
                    'upper_' + level]] = np.nan
            df.loc[(exception_condition) &
                   (df['correction_factor_' + level] > 100),
                   ['mean_' + level, 'lower_' + level,
                    'upper_' + level]] = np.nan
        return df

    # create a df to test the function; pass a copy so the capping isn't
    # applied to df itself through the shared object
    df_test_capper = mean_capper(df.copy(), cf_exceptions)

    # make a boolean mask that says whether a bundle is in the exceptions list
    exception_condition = df.bundle_id.isin(cf_exceptions)

    # DOESN'T DO ANYTHING, don't really need to apply here
    df.loc[(~exception_condition) & (df.correction_factor_1 > 50),
           ['mean_1', 'lower_1', 'upper_1']] = [np.nan, np.nan, np.nan]
    # DID DO SOMETHING, affects 6% of prev rows, 0.01% of inc rows
    df.loc[(~exception_condition) & (df.correction_factor_2 > 50),
           ['mean_2', 'lower_2', 'upper_2']] = [np.nan, np.nan, np.nan]
    # DOES A LOT, affects 57% of prevalence rows
    df.loc[(~exception_condition) & (df.correction_factor_3 > 50),
           ['mean_3', 'lower_3', 'upper_3']] = [np.nan, np.nan, np.nan]

    df.loc[(exception_condition) & (df.correction_factor_1 > 100),
           ['mean_1', 'lower_1', 'upper_1']] = [np.nan, np.nan, np.nan]
    df.loc[(exception_condition) & (df.correction_factor_2 > 100),
           ['mean_2', 'lower_2', 'upper_2']] = [np.nan, np.nan, np.nan]
    df.loc[(exception_condition) & (df.correction_factor_3 > 100),
           ['mean_3', 'lower_3', 'upper_3']] = [np.nan, np.nan, np.nan]

    #####################################################
    # CHECK that lower <= mean <= upper
    #####################################################
    # loop over every level of correction;
    # can't compare null values, null comparisons always eval to False
    for i in ["0", "1", "2", "3", 'inj']:
        # lower <= mean
        assert (df.loc[df['lower_' + i].notnull(), 'lower_' + i] <=
                df.loc[df["lower_" + i].notnull(), 'mean_' + i]).all(),\
            "lower_{} should be less than mean_{}".format(i, i)
        # mean <= upper
        assert (df.loc[df["upper_" + i].notnull(), 'mean_' + i] <=
                df.loc[df["upper_" + i].notnull(), 'upper_' + i]).all(),\
            "mean_{} should be less than upper_{}".format(i, i)

    # compare the results between the test df and the proper df
    for uncertainty in ["mean", "upper", "lower"]:
        for level in ["1", "2", "3"]:
            # compare the number of null rows between dfs
            assert df[uncertainty + "_" + level].isnull().sum() ==\
                df_test_capper[uncertainty + "_" + level].isnull().sum(),\
                "The new capping function is producing different results"

    # write the maternal denominator data; this is for the future when we
    # work in parallel
    if write_maternal_denom:
        def write_maternal_denom(df):
            mat_df = df[df.bundle_id == 1010].copy()
            mat_df = mat_df.query(
                "sex_id == 2 & age_start >= 10 & age_end <= 54")

            if mat_df.shape[0] == 0:
                return

            # NOTE sample size is dropped here, and we make a new one in the
            # following code
            mat_df = mat_df[['location_id', 'year_start', 'year_end',
                             'age_start', 'age_end', 'sex_id',
                             'mean_0', 'mean_1', 'mean_2', 'mean_3']].copy()

            bounds = ['upper_0', 'upper_1', 'upper_2', 'upper_3',
                      'lower_0', 'lower_1', 'lower_2', 'lower_3']
            for uncertainty in bounds:
                mat_df[uncertainty] = np.nan

            # PREP FOR POP ###################################################
            # we don't have years that we can merge pop onto yet, because
            # we aggregated to year bands; another problem with this method
            mat_df['year_id'] = mat_df.year_start + 2  # makes 2000, 2005, 2010

            # bunch of age things so we can use age_group_id to get pop
            age_group = query("QUERY")
            # correct age groups
            age_group.loc[age_group['age_group_years_end'] > 1,
                          'age_group_years_end'] =\
                age_group.loc[age_group['age_group_years_end'] > 1,
                              'age_group_years_end'] - 1

            # df of unique ages from hospital data to merge onto age_group map
            mat_df_ages = pd.DataFrame([mat_df.age_start.unique(),
                                        mat_df.age_end.unique()]).transpose()
            mat_df_ages.columns = ['age_group_years_start',
                                   'age_group_years_end']
            mat_df_ages = mat_df_ages.merge(age_group, how='left',
                                            on=['age_group_years_start',
                                                'age_group_years_end'])

            # this is the correct terminal age group (even though we use max
            # age = 99)
            mat_df_ages.loc[mat_df_ages['age_group_years_start'] == 95,
                            'age_group_id'] = 235

            # there are two age_group_ids for age_start=0 and age_start=1
            mat_df_ages = mat_df_ages[mat_df_ages.age_group_id != 161]

            # create age/year/location lists to use for pulling population
            age_list = list(mat_df_ages.age_group_id.unique())
            loc_list = list(mat_df.location_id.unique())
            year_list = list(mat_df.year_id.unique())

            # GET POP ########################################################
            # pull population and merge on age_start and age_end
            pop = get_population(age_group_id=age_list, location_id=loc_list,
                                 sex_id=[1, 2], year_id=year_list)

            # FORMAT POP #####################################################
            # attach age_start and age_end to the population information
            pop = pop.merge(age_group, how='left', on='age_group_id')
            pop.drop(['process_version_map_id', 'age_group_id'], axis=1,
                     inplace=True)

            # rename pop columns to match hospital data columns
            pop.rename(columns={'age_group_years_start': 'age_start',
                                'age_group_years_end': 'age_end'},
                       inplace=True)

            # correct the terminal age group to match our data
            pop.loc[pop['age_end'] == 124, 'age_end'] = 99

            # MERGE POP ######################################################
            demography = ['location_id', 'year_id', 'age_start', 'age_end',
                          'sex_id']

            pre_shape = mat_df.shape[0]  # store for before comparison
            # then merge population onto the hospital data
            mat_df = mat_df.merge(pop, how='left', on=demography)
            assert pre_shape == mat_df.shape[0], ("number of rows don't "
                                                  "match after merge")

            # MAKE SAMPLE SIZE ###############################################
            mat_df['sample_size'] = mat_df.population * mat_df.mean_0

            # DROP intermediate columns
            mat_df.drop(['population', 'year_id'], axis=1, inplace=True)

            mat_df.to_hdf("FILEPATH", key='df', mode="w")
            # backup copy to _archive
            mat_df.to_hdf("FILEPATH", key='df', mode='w')

        write_maternal_denom(df)

    if adjust_maternal_denom:
        def adjust_maternal_denom(df):
            # drop sample_size; UTLAs already had it, but we need it for
            # everything, so we have to drop it
            df.drop('sample_size', axis=1, inplace=True)

            df = df.query("sex_id == 2 & age_start >= 10 & age_end <= 54")
            # read in maternal denoms; this is needed when our process is
            # parallelized
            denom = pd.read_hdf("FILEPATH", key="df")
            # denom.drop('bundle_id', axis=1, inplace=True)

            denom_cols = sorted(denom.filter(regex="[0-9]$").columns)
            for col in denom_cols:
                denom.rename(columns={col: col + "_denominator"},
                             inplace=True)

            pre = df.shape[0]
            df = df.merge(denom, how='left',
                          on=['location_id', 'year_start', 'year_end',
                              'age_start', 'age_end', 'sex_id'])
            assert pre == df.shape[0], ("shape should not have changed "
                                        "during merge")

            # print(df[df.mean_0_denominator.isnull()].shape)
            # print(df[df.mean_0_denominator.isnull()])
            df = df[(df['mean_0'] > 0) | (df['mean_0_denominator'].notnull())]

            assert df.mean_0_denominator.isnull().sum() == 0,\
                "shouldn't be any null values in this column"

            # regex to find the columns that start with l, m or u and end
            # with a digit
            num_cols = sorted(df.filter(regex="^[lmu].*[0-9]$").columns)
            denom_cols =\
                sorted(df.columns[df.columns.str.endswith("denominator")])

            # divide each bundle value by bundle 1010 to get the adjusted rate
            for i in np.arange(0, 12, 1):
                df[num_cols[i]] = df[num_cols[i]] / df[denom_cols[i]]

            # drop the denominator columns
            df.drop(denom_cols, axis=1, inplace=True)
            # can't divide by zero
            df = df[df['sample_size'] != 0]

            # RETURN ONLY THE MATERNAL DATA
            return df

        df = adjust_maternal_denom(df)

    return df
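# Toy sketch (not part of the pipeline) of the round trip aggregate_to_bundle
# performs: rates are converted to counts with population, collapsed to 5 year
# bands, then converted back to rates.  hosp_prep.year_binner is stood in for
# by a simple floor to the band start; all values are made up.
def _example_five_year_collapse():
    import pandas as pd

    toy = pd.DataFrame({
        'year_start': [2001, 2003],
        'mean_0': [0.002, 0.004],          # rate space
        'population': [50000.0, 52000.0],
    })

    # rate -> count
    toy['hosp_count'] = toy['mean_0'] * toy['population']

    # crude stand-in for year_binner: map both years to the 2000-2004 band
    toy['year_start'] = 5 * (toy['year_start'] // 5)
    toy['year_end'] = toy['year_start'] + 4

    # the collapse: sum counts and population within the band
    out = toy.groupby(['year_start', 'year_end']).agg(
        {'hosp_count': 'sum', 'population': 'sum'}).reset_index()

    # count -> rate
    out['mean_0'] = out['hosp_count'] / out['population']
    return out  # single row, mean_0 = (100 + 208) / 102000 ~= 0.00302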
def prepare_envelope(df, fix_maternal=False, return_only_inj=False,
                     apply_age_sex_restrictions=True, want_to_drop_data=True):
    """
    Function that accepts a hospital dataframe, adds the envelope to it, and
    adds MEIDs and bundle_ids to it, duplicating baby sequelae in the process
    (as expected).

    Parameters:
        df: DataFrame
            contains hospital inpatient primary diagnosis that has been
            mapped to baby sequelae
        fix_maternal: Boolean
            Should always be on; it's just an option for testing purposes to
            save time.  Switch that runs fix_maternal_denominators.  That
            function transforms hospital data from 'population' to
            'live births' using Age Specific Fertility Rates.
        return_only_inj: Boolean
            Switch that makes this function return only injuries data.
            Should always be off; it's just there for testing.
        apply_age_sex_restrictions: Boolean
            Switch that, if True, will apply age and sex restrictions as
            determined by cause_set_id=9.  Defaults to True.  Useful if you
            want to look at all the data that was present in the source.
    """

    ###############################################
    # DROP DATA
    ###############################################

    if want_to_drop_data:
        df = hosp_prep.drop_data(df, verbose=False)

    ###############################################
    # MAP AGE_GROUP_ID ONTO DATA
    ###############################################
    # These are our weird hospital age_group_ids: we use 235 for 95-99, and we
    # don't use age_group_ids 2, 3, 4; we use 28 in their place.

    # df that has age_group_id, age_start, and age_end
    ages = hosp_prep.get_hospital_age_groups()

    df = df.merge(ages, how='left', on=['age_start', 'age_end'])

    # comment out to test the new sources in the process
    if not df.age_group_id.notnull().all():
        warnings.warn("""Shouldn't be any null age_group_id, there could be
        unsplit ages in the data, or maybe an age_start/age_end changed""")

    # maps = pd.read_csv("FILEPATH")
    # maps = maps[['nonfatal_cause_name', 'cause_id', 'level']].copy()
    # maps = maps[maps.cause_id.notnull()]
    # maps = maps[maps.level == 1]
    # maps.drop("level", axis=1, inplace=True)  # no need to merge level onto data
    # maps.drop_duplicates(inplace=True)
    #
    # pre = df.shape[0]
    # df = df.merge(maps, how='left', on='nonfatal_cause_name')
    # assert df.shape[0] == pre, "df shape changed during merge"

    #########################################################
    # Get denominator for sources where we don't want to use the envelope
    #########################################################
    # Make a list of sources that we don't want to use the envelope with
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    if pd.Series(full_coverage_sources).isin(df.source.unique()).any():
        # make a dataframe with the fully covered sources
        full_coverage_df = df[df.source.isin(full_coverage_sources)].copy()

        # get denominator (aka population)
        full_coverage_df = gbd_hosp_prep.get_sample_size(full_coverage_df)

        full_coverage_df.rename(columns={'population': 'sample_size'},
                                inplace=True)

        # NOTE now there's going to be a column "sample_size" that is null
        # for every source except the fully covered ones

        # make product
        full_coverage_df['product'] =\
            full_coverage_df.val / full_coverage_df.sample_size

    else:
        # make an empty dataframe so that it exists and won't break anything
        full_coverage_df = pd.DataFrame(columns=list(df.columns) + ['product'])

    # drop these sources from the main dataframe for now
    df = df[~df.source.isin(full_coverage_sources)].copy()

    #########################################
    # CREATE CAUSE FRACTIONS
    #########################################
    df = hosp_prep.create_cause_fraction(df)

    ###############################################
    # APPLY CAUSE RESTRICTIONS
    ###############################################
    # NOTE it is okay to apply restrictions after cause fractions, because
    # restrictions were also applied before the cause fractions were made
    # (right before splitting)
    if apply_age_sex_restrictions:
        df = hosp_prep.apply_restrictions(df, 'cause_fraction')
        full_coverage_df = hosp_prep.apply_restrictions(full_coverage_df,
                                                        'product')
        # drop the restricted values
        df = df[df['cause_fraction'].notnull()]
        full_coverage_df =\
            full_coverage_df[full_coverage_df['product'].notnull()]

    #########################################
    # APPLY ENVELOPE
    #########################################
    # read the envelope
    env_df = pd.read_csv("FILEPATH")

    locs = df.location_id.unique()  # all locations in our data
    # find locations that are missing age_end 99
    missing_99 = list(
        set(locs) -
        set(env_df.loc[(env_df.location_id.isin(locs)) &
                       (env_df.age_end == 99), 'location_id'].unique()))

    # change their age_ends to be 99
    env_df.loc[(env_df.location_id.isin(missing_99)) &
               (env_df.age_start == 95), 'age_end'] = 99

    # since we do NOT want to merge on by age_group_id
    env_df.drop('age_group_id', axis=1, inplace=True)

    demography = ['location_id', 'year_start', 'year_end', 'age_start',
                  'age_end', 'sex_id']

    # merge the envelope onto the data
    pre_shape = df.shape[0]
    df = df.merge(env_df, how='left', on=demography)
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # compute the nonfatal_cause_name hospitalization rate,
    # aka "apply the envelope"
    df['product'] = df['cause_fraction'] * df['mean']
    df['upper_product'] = df['cause_fraction'] * df['upper']
    df['lower_product'] = df['cause_fraction'] * df['lower']

    ############################################
    # RE-ATTACH data that has sample size instead of the envelope
    ############################################

    df = pd.concat([df, full_coverage_df]).reset_index(drop=True)
    # NOTE now there's going to be a column "sample_size" and "cases" that
    # is null for every source except the fully covered ones

    # if we just want injuries:
    if return_only_inj:
        inj_me = pd.read_csv("FILEPATH")
        inj_me = list(inj_me['level1_meid'])
        df = df[df['modelable_entity_id'].isin(inj_me)]

    # drop columns we don't need anymore:
    #   cause_fraction: finished using it, it was used to make product
    #   mean: finished using it, it was used to make product
    #   upper: finished using it, it was used to make upper_product
    #   lower: finished using it, it was used to make lower_product
    #   val, numerator, denominator: no longer in count space
    # NONFATAL_CAUSE_NAME IS LEFT TO SIGNIFY THAT THIS DATA IS STILL AT THE
    # NONFATAL_CAUSE_NAME LEVEL
    df.drop(['cause_fraction', 'mean', 'upper', 'lower',
             'val', 'numerator', 'denominator'], axis=1, inplace=True)

    if "bundle_id" in df.columns:
        level_of_analysis = "bundle_id"
    if "nonfatal_cause_name" in df.columns:
        level_of_analysis = "nonfatal_cause_name"

    # write some summary stats of the envelope being applied;
    # this is to review against our aggregated data to find large differences
    today = datetime.datetime.today().strftime("%Y_%m_%d").replace(":", "_")

    # zeros can really skew the summary stats, so drop them
    env_stats = df[df['product'] != 0].\
        groupby(['location_id', level_of_analysis])['product'].\
        describe().reset_index()
    env_stats.rename(columns={'level_2': 'stats'}, inplace=True)

    env_stats.to_csv("FILEPATH", index=False)

    return df
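# Toy illustration (not called anywhere) of what a cause fraction is before
# the envelope gets applied in prepare_envelope: within each demographic
# group, the admissions for one baby sequela divided by all admissions in that
# group.  hosp_prep.create_cause_fraction handles this for real data; the
# 'nfc_a'/'nfc_b' names and values below are made up.
def _example_cause_fraction():
    import pandas as pd

    toy = pd.DataFrame({
        'location_id': [101, 101, 101, 102],
        'nonfatal_cause_name': ['nfc_a', 'nfc_b', 'nfc_c', 'nfc_a'],
        'val': [20.0, 5.0, 75.0, 10.0],   # admission counts
    })

    # denominator: all admissions in the demographic group
    toy['denominator'] = toy.groupby('location_id')['val'].transform('sum')
    toy['cause_fraction'] = toy['val'] / toy['denominator']
    return toy  # location 101 fractions: 0.20, 0.05, 0.75; location 102: 1.0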