def apply_restricts(df, full_coverage_df):
    """
    Apply the age-sex restrictions after expanding data to the bundle level
    """
    df = hosp_prep.apply_bundle_restrictions(df, 'cause_fraction')
    full_coverage_df = hosp_prep.apply_bundle_restrictions(
        full_coverage_df, 'mean')

    # drop the restricted values
    df = df[df['cause_fraction'].notnull()]
    full_coverage_df =\
        full_coverage_df[full_coverage_df['mean'].notnull()]

    return df, full_coverage_df
def make_square(df):
    """
    Function that inserts zeros for demographic and etiology groups that
    were not present in the source. A zero is inserted for every
    age/sex/etiology combination for each location and for only the years
    available for that location. If a location has data from 2004-2005 we
    will create explicit zeroes for those years but not for any others.
    Age and sex restrictions are applied after the data is made square.

    Parameters:
        df: Pandas DataFrame
            Must be aggregated and collapsed to the bundle level.
    """
    start_square = time.time()
    assert "bundle_id" in df.columns, "'bundle_id' must exist"
    assert "nonfatal_cause_name" not in df.columns, (
        "Data must not be at the"
        " nonfatal_cause_name level")

    # create a series of sorted, non-zero mean values to make sure
    # the func doesn't alter anything
    check_mean = df.loc[df['mean_raw'] > 0, 'mean_raw'].sort_values().\
        reset_index(drop=True)

    # square the dataset
    df = hosp_prep.make_zeros(
        df, etiology='bundle_id',
        cols_to_square=['mean_raw'],  # , 'upper_raw', 'lower_raw'],
        icd_len=5)

    # assert the sorted means are identical
    assert (check_mean == df.loc[df['mean_raw'] > 0, 'mean_raw'].sort_values().
            reset_index(drop=True)).all()

    for col in ['mean_raw']:  # , 'upper_raw', 'lower_raw']:
        df = hosp_prep.apply_bundle_restrictions(df, col)

    # assign upper and lower to 0 where mean is 0
    df.loc[df['mean_raw'] == 0, ['upper_raw', 'lower_raw']] = 0

    print("Data made square and age/sex restrictions applied in {} min".
          format((time.time() - start_square) / 60))
    return df
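# The helper below is an illustrative sketch only and is not called by the
# pipeline: it shows what "making the data square" means using a plain pandas
# cross-product, under assumed toy column names (age_group_id, sex_id,
# bundle_id). It does not reproduce the real implementation,
# hosp_prep.make_zeros, and it relies on the module-level pandas import (pd).
def _square_sketch(df, value_col='mean_raw'):
    """Minimal, hypothetical illustration of zero-filling a dataset."""
    import itertools

    filled = []
    # only square within the location/years actually present in the source
    for (loc, year), chunk in df.groupby(['location_id', 'year_start']):
        # every age/sex/bundle combination for this location-year
        full_index = pd.DataFrame(
            list(itertools.product(df.age_group_id.unique(),
                                   df.sex_id.unique(),
                                   df.bundle_id.unique())),
            columns=['age_group_id', 'sex_id', 'bundle_id'])
        full_index['location_id'] = loc
        full_index['year_start'] = year

        merged = full_index.merge(
            chunk, how='left',
            on=['age_group_id', 'sex_id', 'bundle_id',
                'location_id', 'year_start'])
        # combinations absent from the source become explicit zeros
        merged[value_col] = merged[value_col].fillna(0)
        filled.append(merged)
    return pd.concat(filled, ignore_index=True)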
def apply_env_main(df, full_coverage_df, env_path, run_tmp_unc=False,
                   write=False, read_cores=15):
    back = df.copy()
    starting_bundles = df.bundle_id.unique()

    # delete existing env*CF files and re-run them again
    if run_tmp_unc:
        delete_uncertainty_tmp()
        make_uncertainty_jobs(df)
        job_holder()
        fix_failed_jobs()

    # hold the script up until all the uncertainty jobs are done
    job_holder()

    # rough check to see if the length of new files is what we'd expect
    if run_tmp_unc:
        check_len = df.age_group_id.unique().size *\
            df.sex_id.unique().size *\
            df.year_start.unique().size
        actual_len = len(glob.glob(r"FILEPATH/*.H5"))
        assert actual_len == check_len,\
            ("Check the error logs, it looks like something went wrong "
             "while re-writing files")

    print("Merging on the draw quantiles from the envelope * corrections")
    df = env_merger(df, read_cores=read_cores)

    # apply the hospital envelope
    print("Applying the uncertainty to cause fractions")
    df = apply_env(df)

    df = reattach_covered_data(df, full_coverage_df)
    df = drop_cols(df)

    # reapply age/sex restrictions
    df = hosp_prep.apply_bundle_restrictions(df, 'mean')

    df['bundle_id'] = df['bundle_id'].astype(float)
    int_cols = ['sex_id', 'location_id', 'year_start', 'year_end']
    for col in int_cols:
        df[col] = df[col].astype(int)

    end_bundles = df.bundle_id.unique()
    diff = set(starting_bundles).symmetric_difference(set(end_bundles))
    if len(diff) > 0:
        print("The difference in bundles is {}. Reprocessing these without"
              " correction factor uncertainty.".format(diff))
        redo_df = back[back.bundle_id.isin(diff)]
        del back
        redo_df = aeo.apply_envelope_only(
            df=redo_df, env_path=env_path,
            return_only_inj=False,
            apply_age_sex_restrictions=False,
            want_to_drop_data=False,
            create_hosp_denom=False,
            apply_env_subroutine=True,
            fix_norway_subnat=False)

        pre_cols = df.columns
        df = pd.concat([df, redo_df], ignore_index=True)

        # make sure columns keep the right dtype
        for col in ['sex_id', 'age_group_id', 'bundle_id', 'location_id',
                    'year_start', 'year_end', 'nid', 'representative_id']:
            df[col] = pd.to_numeric(df[col], errors='raise')
        post_cols = df.columns
        print("{} change in columns".format(
            set(pre_cols).symmetric_difference(post_cols)))

    if write:
        print("Writing the data to FILEPATH...")
        file_path = "FILEPATH/apply_env.H5"
        hosp_prep.write_hosp_file(df, file_path, backup=True,
                                  include_version_info=True)
    return df
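# Hypothetical usage sketch (kept as comments so nothing runs on import):
# the envelope path below is a placeholder, and the input frames come from
# the earlier restriction step; the real call sites live elsewhere in the
# pipeline.
#
#     df, full_coverage_df = apply_restricts(df, full_coverage_df)
#     df = apply_env_main(df, full_coverage_df,
#                         env_path="FILEPATH/envelope.H5",
#                         run_tmp_unc=True, write=False)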
def aggregate_to_bundle(df, write_maternal_denom=False,
                        adjust_maternal_denom=False):
    """
    Takes a dataframe aggregated to baby sequela level and returns the df
    aggregated to 5 year bands at the bundle level
    """
    ##############################
    # PULL POPULATION
    #
    # This is preparation for aggregating to 5 year bands
    ##############################
    # get rid of all corrected values and correction factors. we want to do
    # this because we are going to aggregate to 5 year bands and the bundle_id
    # level of analysis. We intend to make cells where the correction factor
    # is over 50 NULL. It doesn't make sense to do that before aggregating,
    # because any rows with nulls in them would be lost entirely. It doesn't
    # make sense to apply corrections and NOT delete cells that break the
    # over-50 rule before aggregation either, because we would lose the
    # correction factors during the groupby. Also, it would not be robust to
    # aggregate 16 columns. Therefore, it's best to apply corrections after
    # aggregating. Technically, the groupby would get rid of these for us.
    # Honestly, this just makes things easier to work with up to that point.
    df.drop(['mean_1', 'mean_2', 'mean_3', 'mean_inj',
             'upper_1', 'upper_2', 'upper_3', 'upper_inj',
             'lower_1', 'lower_2', 'lower_3', 'lower_inj',
             'correction_factor_1', 'correction_factor_2',
             'correction_factor_3', 'correction_factor_inj'],
            axis=1, inplace=True)

    if write_maternal_denom:
        # drop the non maternal data pronto
        df = df[df['bundle_id'] == 1010]

    if adjust_maternal_denom:
        # GET MATERNAL CAUSES
        causes = get_cause_metadata(cause_set_id=9)
        # 366 happens to always be in the third level
        condition = causes.path_to_top_parent.str.contains("366")

        # subset just causes that meet the condition
        maternal_causes = causes[condition]

        # make list of maternal causes
        maternal_list = list(maternal_causes['cause_id'].unique())

        # get bundle to cause map
        bundle_cause = query("QUERY")

        # merge cause_id onto data
        df = df.merge(bundle_cause, how='left', on='bundle_id')

        # keep only maternal causes
        df = df[df['cause_id'].isin(maternal_list)]

        # drop cause_id
        df.drop('cause_id', axis=1, inplace=True)

        # drop the denominator bundle
        df = df[df['bundle_id'] != 1010]

    ##############################
    # GET POPULATION
    ##############################
    # pull age_group to age_start/age_end map
    age_group = query("QUERY")

    # correct age groups
    age_group.loc[age_group['age_group_years_end'] > 1,
                  'age_group_years_end'] = age_group.\
        loc[age_group['age_group_years_end'] > 1, 'age_group_years_end'] - 1

    # df of unique ages from hospital data to merge onto age_group map
    df_ages = pd.DataFrame([df.age_start.unique(),
                            df.age_end.unique()]).transpose()
    df_ages.columns = ['age_group_years_start', 'age_group_years_end']
    df_ages = df_ages.merge(age_group, how='left',
                            on=['age_group_years_start',
                                'age_group_years_end'])

    # this is the correct terminal age group (even though we use max age = 99)
    df_ages.loc[df_ages['age_group_years_start'] == 95, 'age_group_id'] = 235

    # there are two age_group_ids for age_start=0 and age_start=1
    df_ages = df_ages[df_ages.age_group_id != 161]

    # create age/year/location lists to use for pulling population
    age_list = list(df_ages.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())

    # pull population and merge on age_start and age_end
    pop = get_population(age_group_id=age_list, location_id=loc_list,
                         sex_id=[1, 2], year_id=year_list)

    # attach age_start and age_end to population information
    pop = pop.merge(age_group, how='left', on='age_group_id')
    pop.drop(['process_version_map_id', 'age_group_id'], axis=1, inplace=True)

    # rename pop columns to match hospital data columns
    pop.rename(columns={'age_group_years_start': 'age_start',
                        'age_group_years_end': 'age_end',
                        'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']

    # correct terminal age group to match our data
    pop.loc[pop['age_end'] == 124, 'age_end'] = 99

    demography = ['location_id', 'year_start', 'year_end',
                  'age_start', 'age_end', 'sex_id']

    ##############################
    # MAKE DATA SQUARE
    ##############################
    # create a series of sorted, non-zero mean values to make sure
    # the func doesn't alter anything
    check_mean = df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().\
        reset_index(drop=True)

    print("Starting number of rows: {}".format(df.shape[0]))

    # square the dataset
    df = hosp_prep.make_zeroes(df, level_of_analysis='bundle_id',
                               cols_to_square=['mean_0', 'upper_0',
                                               'lower_0'],
                               icd_len=5)

    # assert the sorted means are identical
    assert (check_mean == df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().
            reset_index(drop=True)).all()

    # delete rows where restrictions should be applied
    # create df where baby sequela are missing
    missing_nfc = df[df.nonfatal_cause_name.isnull()].copy()
    df = df[df.nonfatal_cause_name.notnull()]
    for col in ['mean_0', 'upper_0', 'lower_0']:
        df = hosp_prep.apply_restrictions(df, col)
        missing_nfc = hosp_prep.apply_bundle_restrictions(missing_nfc, col)
        df = df[df[col].notnull()]
    df = pd.concat([df, missing_nfc])

    print("Square number of rows: {}".format(df.shape[0]))

    # don't create the parent injury dupes until after the data is totally
    # square so that the denominators will match
    # df = get_parent_injuries(df)
    # check_parent_injuries(df, 'mean_0')

    pre_shape = df.shape[0]  # store for before comparison
    # then merge population onto the hospital data
    df = df.merge(pop, how='left', on=demography)  # attach pop info to hosp
    assert pre_shape == df.shape[0], "number of rows don't match after merge"

    ##############################
    # RATE SPACE TO COUNTS
    ##############################
    # go from rate space to additive counts
    # because of careful merging we just need to multiply across columns
    df['hosp_count'] = df['mean_0'] * df['population']
    df['upper_hosp_count'] = df['upper_0'] * df['population']
    df['lower_hosp_count'] = df['lower_0'] * df['population']

    # merge on "denominator" from file that was made back in
    # create_cause_fractions. This adds the denominator column, which is the
    # number of admissions in a demographic group
    df = df.merge(pd.read_csv("FILEPATH"), how='left',
                  on=["age_start", "age_end", "sex_id", "year_start",
                      "year_end", "location_id"])

    # 5 year bins
    df = hosp_prep.year_binner(df)

    # add 5 year NIDs onto data
    df = hosp_prep.five_year_nids(df)

    ##################################################################
    ##################################################################
    # THE COLLAPSE
    ##################################################################
    ##################################################################
    # The final collapse to 5 year bands, 5 year NIDs and bundle_id.
    # This is doing a few things at once. One is that we need to aggregate
    # to 5 year bands. Another is aggregating to the bundle_id level of
    # analysis. Up to this point we were at the nonfatal_cause_name AKA
    # baby sequelae level of analysis.
    # WE NEED TWO COLLAPSES. One for data that doesn't have a sample_size
    # value, and another for the data that does.
    # This is because: https://goo.gl/e66OZ4 and https://goo.gl/Fb78xi

    # make df of data where there is full coverage (i.e., the UK)
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    # make condition mask that indicates rows that have full coverage
    has_full_coverage = df.source.isin(full_coverage_sources)

    covered_df = df[has_full_coverage]

    # drop "denominator" from covered_df
    covered_df = covered_df.drop("denominator", axis=1)

    # drop this data from the main dataframe
    df = df[~has_full_coverage]

    assert (df.loc[df.denominator.isnull(), 'mean_0'] == 0).all(),\
        "mean_0 should be 0"
    assert (df.loc[df.denominator.isnull(), 'lower_0'] == 0).all(),\
        "lower_0 should be 0"
    assert (df.loc[df.denominator.isnull(), 'upper_0'] == 0).all(),\
        "upper_0 should be 0"

    df = df[df.denominator.notnull()]

    # df already has sample size
    df.drop("sample_size", axis=1, inplace=True)

    # rename "denominator" to "sample_size" in df (not covered_df)
    df.rename(columns={"denominator": "sample_size"}, inplace=True)

    # check if cases are lost in the groupby
    pre_cases = df['hosp_count'].sum()

    # can use the same group columns for both dataframes
    groups = ['location_id', 'year_start', 'year_end', 'age_start', 'age_end',
              'sex_id', 'nid', 'representative_id', 'bundle_id']

    # sample_size has some null values from being made square, but it was just
    # population, so we're using pop instead. so remember,
    # population == sample_size for covered_df
    covered_df = covered_df.groupby(groups)\
        .agg({'hosp_count': 'sum', 'population': 'sum'}).reset_index()

    # add "sample_size" to the aggregate function
    df = df.groupby(groups).agg({'hosp_count': 'sum',
                                 'upper_hosp_count': 'sum',
                                 'lower_hosp_count': 'sum',
                                 'population': 'sum',
                                 'sample_size': 'sum'}).reset_index()

    assert round(pre_cases, 0) == round(df['hosp_count'].sum(), 0),\
        ("some cases were lost. "
         "From {} to {}".format(pre_cases, df['hosp_count'].sum()))

    # set sample size to np.nan when mean/upper/lower are greater than 0
    df.loc[(df['hosp_count'] > 0) &
           (df['lower_hosp_count'] > 0) &
           (df['upper_hosp_count'] > 0), 'sample_size'] = np.nan

    ##############################
    # COUNTS TO RATE SPACE
    ##############################
    # REMAKE mean and uncertainty
    # for the main df:
    df['mean_0'] = df['hosp_count'] / df['population']
    df['lower_0'] = df['lower_hosp_count'] / df['population']
    df['upper_0'] = df['upper_hosp_count'] / df['population']
    df.drop(['hosp_count', 'lower_hosp_count', 'upper_hosp_count'],
            axis=1, inplace=True)

    # add parent injuries
    # NOTE get_parent_injuries is run before covered_df is concatenated with
    # df. It happens to not make a difference at the moment, because there
    # are no injuries in covered_df, but it could in the future.
    df = get_parent_injuries(df)
    for col in ['mean_0', 'lower_0', 'upper_0']:
        hosp_prep.check_parent_injuries(df, col_to_sum=col)

    # this drops the population that was merged on for converting to counts.
    df.drop('population', axis=1, inplace=True)  # don't need pop anymore

    # for the covered df:
    covered_df['mean_0'] = covered_df['hosp_count'] / covered_df['population']
    covered_df.rename(columns={"population": "sample_size"}, inplace=True)

    # drop columns
    covered_df.drop(['hosp_count'], axis=1, inplace=True)

    ###############################
    # RE-ATTACH
    ###############################
    # bring covered_df and df together.
    # where we have full coverage, lower and upper should be null
    # mean_0 will never be null
    df = pd.concat([df, covered_df], ignore_index=True)

    # assert what we just said will be true in the comments above:
    assert df.loc[has_full_coverage, 'lower_0'].isnull().all(),\
        "where we have full coverage, lower_0 should be null"
    assert df.loc[has_full_coverage, 'upper_0'].isnull().all(),\
        "where we have full coverage, upper_0 should be null"
    assert df.mean_0.notnull().all(), "mean_0 should never be null"

    # NOTE, remember, sample size will still have null values, and that's okay
    # we need to keep sample size from here on out.

    if "population" in df.columns:
        print("population was still in columns")
        df.drop("population", axis=1, inplace=True)

    ########################################
    # map measure onto data, just need this for ELMO reqs.
    clean_maps = pd.read_csv("FILEPATH")
    clean_maps = clean_maps[['bundle_id', 'bid_measure']]
    clean_maps.drop_duplicates(inplace=True)
    clean_maps.rename(columns={'bid_measure': 'measure'}, inplace=True)

    # remove null bundles from map
    clean_maps = clean_maps[clean_maps.bundle_id.notnull()]

    pre_shape = df.shape[0]  # store for comparison after merge

    # merge measure onto hosp data using bundle_id
    df = df.merge(clean_maps, how='left', on='bundle_id')
    assert pre_shape == df.shape[0], "number of rows don't match after merge."

    # get injuries bids so we can check for missing measures
    pc_injuries = pd.read_csv("FILEPATH")
    inj_bids = pc_injuries['Level1-Bundle ID'].unique()

    # some injuries bids didn't get measures!
    assert set(df[df.measure.isnull()].bundle_id).issubset(set(inj_bids)),\
        ("We expect that all null measures belong to injuries, but that is"
         " not the case. Something went wrong!")

    # fix any injuries that are missing measure, all inj are inc:
    df.loc[(df.measure.isnull()) & (df.bundle_id.isin(inj_bids)),
           'measure'] = 'inc'

    assert df.measure.isnull().sum() == 0, ("There are null values and we "
                                            "expect none")

    # read in correction factors (again)
    correction_factors = pd.read_csv("FILEPATH")
    correction_factors.drop("outpatient", axis=1, inplace=True)

    # rename columns to match df
    correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)

    # merge corr factors onto data
    df = df.merge(correction_factors, how='left',
                  on=['age_start', 'sex_id', 'bundle_id'])
    assert pre_shape == df.shape[0], ("You unexpectedly added rows while "
                                      "merging on the correction factors. "
                                      "Don't do that!")

    # if a Bundle ID doesn't have a corr factor from marketscan use 1
    # df.update(df[['a','b','c']].fillna(0))  # test code
    # http://stackoverflow.com/questions/36556256/how-do-i-fill-na-values-in-multiple-columns-in-pandas
    df.update(df[['indv_cf', 'incidence', 'prevalence', 'injury_cf']].fillna(1))

    # rename correction factors to match what we told people they would be
    df.rename(columns={'indv_cf': 'correction_factor_1',
                       'incidence': 'correction_factor_2',
                       'prevalence': 'correction_factor_3',
                       'injury_cf': 'correction_factor_inj'}, inplace=True)

    # NOTE we apply every correction factor to all data, even if it is not
    # relevant. E.g., not all data is injuries, so not all data needs
    # correction_factor_inj. It is simply easier to apply all of them, and
    # then, while writing to modelers' folders, drop the irrelevant columns.
    # make mean_1, lower_1, upper_1
    df['mean_1'] = df.correction_factor_1 * df.mean_0
    df['lower_1'] = df.correction_factor_1 * df.lower_0
    df['upper_1'] = df.correction_factor_1 * df.upper_0

    # make mean_2, lower_2, upper_2
    df['mean_2'] = df.correction_factor_2 * df.mean_0
    df['lower_2'] = df.correction_factor_2 * df.lower_0
    df['upper_2'] = df.correction_factor_2 * df.upper_0

    # make mean_3, lower_3, upper_3
    df['mean_3'] = df.correction_factor_3 * df.mean_0
    df['lower_3'] = df.correction_factor_3 * df.lower_0
    df['upper_3'] = df.correction_factor_3 * df.upper_0

    # make injury mean, lower, upper
    df['mean_inj'] = df.correction_factor_inj * df.mean_0
    df['lower_inj'] = df.correction_factor_inj * df.lower_0
    df['upper_inj'] = df.correction_factor_inj * df.upper_0

    # assert what we just said will be true in the comments above:
    levels = ["1", "2", "3", "inj"]
    for level in levels:
        assert df.loc[has_full_coverage,
                      'lower_{}'.format(level)].isnull().all(),\
            "broke on level {}".format(level)
        assert df.loc[has_full_coverage,
                      'upper_{}'.format(level)].isnull().all(),\
            "broke on level {}".format(level)
        assert df["mean_{}".format(level)].notnull().all(),\
            "broke on level {}".format(level)

    def factor_applier(df, levels=["1", "2", "3", "inj"]):
        for level in levels:
            # initialize cols
            df['test_mean_' + level] = np.nan
            df['test_lower_' + level] = np.nan
            df['test_upper_' + level] = np.nan

            # apply corr factors
            df['test_mean_' + level] =\
                df["correction_factor_" + level] * df['mean_0']
            df['test_lower_' + level] =\
                df["correction_factor_" + level] * df['lower_0']
            df['test_upper_' + level] =\
                df["correction_factor_" + level] * df['upper_0']
        return df

    df = factor_applier(df)

    levels = ["1", "2", "3", "inj"]
    for level in levels:
        assert (df.loc[df["mean_" + level].notnull(), "mean_" + level] ==
                df.loc[df["test_mean_" + level].notnull(),
                       "test_mean_" + level]).all(),\
            "different on level {}".format(level)
        assert (df.loc[df["upper_" + level].notnull(), "upper_" + level] ==
                df.loc[df["test_upper_" + level].notnull(),
                       "test_upper_" + level]).all(),\
            "different on level {}".format(level)
        assert (df.loc[df["lower_" + level].notnull(), "lower_" + level] ==
                df.loc[df["test_lower_" + level].notnull(),
                       "test_lower_" + level]).all(),\
            "different on level {}".format(level)

    # drop test cols for now until we run this for awhile without
    # tripping the assert
    test_cols = df.columns[df.columns.str.startswith("test_")]
    df.drop(test_cols, axis=1, inplace=True)

    # RULE = if correction factor is greater than 50, make the data null
    # EXCEPTIONS are made for these bundles, which are capped at 100:
    # Preterm: 80, 81, 82, 500
    # Encephalopathy: 338
    # Sepsis: 92
    # Hemolytic: 458
    # PAD/PUD: 345
    # Cirrhosis: 131
    # list of bundles which can have correction factors above 50
    cf_exceptions = [345, 80, 81, 82, 500, 338, 92, 458, 131]

    # NOTE when checking the number of nulls, consider the nulls that are
    # caused by the sample_size split
    def mean_capper(df, exceptions, levels=["1", "2", "3"]):
        exception_condition = df.bundle_id.isin(exceptions)
        for level in levels:
            df.loc[(~exception_condition) &
                   (df['correction_factor_' + level] > 50),
                   ['mean_' + level, 'lower_' + level,
                    'upper_' + level]] = np.nan
            df.loc[(exception_condition) &
                   (df['correction_factor_' + level] > 100),
                   ['mean_' + level, 'lower_' + level,
                    'upper_' + level]] = np.nan
        return df

    # create df to test the function; pass a copy so the original df isn't
    # capped before the manual capping below
    df_test_capper = mean_capper(df.copy(), cf_exceptions)

    # make boolean mask that says if a bundle is in the exceptions list
    exception_condition = df.bundle_id.isin(cf_exceptions)
    df.loc[(~exception_condition) & (df.correction_factor_1 > 50),
           ['mean_1', 'lower_1', 'upper_1']] = [np.nan, np.nan, np.nan]
    # DOESN'T DO ANYTHING, don't really need to apply here

    df.loc[(~exception_condition) & (df.correction_factor_2 > 50),
           ['mean_2', 'lower_2', 'upper_2']] = [np.nan, np.nan, np.nan]
    # DID DO SOMETHING, affects 6% of prev rows, 0.01% of inc rows

    df.loc[(~exception_condition) & (df.correction_factor_3 > 50),
           ['mean_3', 'lower_3', 'upper_3']] = [np.nan, np.nan, np.nan]
    # DOES A LOT, affects 57% of prevalence rows

    df.loc[(exception_condition) & (df.correction_factor_1 > 100),
           ['mean_1', 'lower_1', 'upper_1']] = [np.nan, np.nan, np.nan]
    df.loc[(exception_condition) & (df.correction_factor_2 > 100),
           ['mean_2', 'lower_2', 'upper_2']] = [np.nan, np.nan, np.nan]
    df.loc[(exception_condition) & (df.correction_factor_3 > 100),
           ['mean_3', 'lower_3', 'upper_3']] = [np.nan, np.nan, np.nan]

    #####################################################
    # CHECK that lower < mean < upper
    #####################################################
    # loop over every level of correction
    # can't compare null values, null comparisons always eval to False
    for i in ["0", "1", "2", "3", 'inj']:
        # lower <= mean
        assert (df.loc[df['lower_' + i].notnull(), 'lower_' + i] <=
                df.loc[df["lower_" + i].notnull(), 'mean_' + i]).all(),\
            "lower_{} should be less than mean_{}".format(i, i)
        # mean <= upper
        assert (df.loc[df["upper_" + i].notnull(), 'mean_' + i] <=
                df.loc[df["upper_" + i].notnull(), 'upper_' + i]).all(),\
            "mean_{} should be less than upper_{}".format(i, i)

    # compare the results between test df and proper df
    for uncertainty in ["mean", "upper", "lower"]:
        for level in ["1", "2", "3"]:
            # compare the sum of null rows between dfs
            assert df[uncertainty + "_" + level].isnull().sum() ==\
                df_test_capper[uncertainty + "_" + level].isnull().sum(),\
                "The new capping function is producing different results"

    # write the maternal denominator data, this is for the future when we
    # work in parallel
    if write_maternal_denom:
        def write_maternal_denom(df):
            mat_df = df[df.bundle_id == 1010].copy()
            mat_df = mat_df.query(
                "sex_id == 2 & age_start >= 10 & age_end <= 54")
            if mat_df.shape[0] == 0:
                return
            # NOTE sample size is dropped here, and we make a new one in the
            # following code
            mat_df = mat_df[['location_id', 'year_start', 'year_end',
                             'age_start', 'age_end', 'sex_id',
                             'mean_0', 'mean_1', 'mean_2', 'mean_3']].copy()

            bounds = ['upper_0', 'upper_1', 'upper_2', 'upper_3',
                      'lower_0', 'lower_1', 'lower_2', 'lower_3']
            for uncertainty in bounds:
                mat_df[uncertainty] = np.nan

            # PREP FOR POP ###################################################
            # we don't have years that we can merge on pop to yet, because
            # we aggregated to year bands; another problem with this method
            mat_df['year_id'] = mat_df.year_start + 2  # makes 2000,2005,2010

            # bunch of age things so we can use age_group_id to get pop
            age_group = query("QUERY")

            # correct age groups
            age_group.loc[age_group['age_group_years_end'] > 1,
                          'age_group_years_end'] =\
                age_group.loc[age_group['age_group_years_end'] > 1,
                              'age_group_years_end'] - 1

            # df of unique ages from hospital data to merge onto age_group map
            mat_df_ages = pd.DataFrame([mat_df.age_start.unique(),
                                        mat_df.age_end.unique()]).transpose()
            mat_df_ages.columns = ['age_group_years_start',
                                   'age_group_years_end']
            mat_df_ages = mat_df_ages.merge(age_group, how='left',
                                            on=['age_group_years_start',
                                                'age_group_years_end'])

            # this is the correct terminal age group (even though we use max
            # age = 99)
            mat_df_ages.loc[mat_df_ages['age_group_years_start'] == 95,
                            'age_group_id'] = 235

            # there are two age_group_ids for age_start=0 and age_start=1
            mat_df_ages = mat_df_ages[mat_df_ages.age_group_id != 161]

            # create age/year/location lists to use for pulling population
            age_list = list(mat_df_ages.age_group_id.unique())
            loc_list = list(mat_df.location_id.unique())
            year_list = list(mat_df.year_id.unique())

            # GET POP ########################################################
            # pull population and merge on age_start and age_end
            pop = get_population(age_group_id=age_list, location_id=loc_list,
                                 sex_id=[1, 2], year_id=year_list)

            # FORMAT POP #####################################################
            # attach age_start and age_end to population information
            pop = pop.merge(age_group, how='left', on='age_group_id')
            pop.drop(['process_version_map_id', 'age_group_id'], axis=1,
                     inplace=True)

            # rename pop columns to match hospital data columns
            pop.rename(columns={'age_group_years_start': 'age_start',
                                'age_group_years_end': 'age_end'},
                       inplace=True)

            # correct terminal age group to match our data
            pop.loc[pop['age_end'] == 124, 'age_end'] = 99

            # MERGE POP ######################################################
            demography = ['location_id', 'year_id', 'age_start', 'age_end',
                          'sex_id']

            pre_shape = mat_df.shape[0]  # store for before comparison
            # then merge population onto the hospital data
            # attach pop info to hosp
            mat_df = mat_df.merge(pop, how='left', on=demography)
            assert pre_shape == mat_df.shape[0], ("number of rows don't "
                                                  "match after merge")

            # MAKE SAMPLE SIZE ###############################################
            mat_df['sample_size'] = mat_df.population * mat_df.mean_0

            # DROP intermediate columns
            mat_df.drop(['population', 'year_id'], axis=1, inplace=True)

            mat_df.to_hdf("FILEPATH", key='df', mode="w")
            # backup copy to _archive
            mat_df.to_hdf("FILEPATH", key='df', mode='w')

        write_maternal_denom(df)

    if adjust_maternal_denom:
        def adjust_maternal_denom(df):
            # drop sample_size, UTLAs already had it, but we need it for
            # everything, so we have to drop it.
            df.drop('sample_size', axis=1, inplace=True)

            df = df.query("sex_id == 2 & age_start >= 10 & age_end <= 54")

            # read in maternal denoms, this is needed when our process is
            # parallelized
            denom = pd.read_hdf("FILEPATH", key="df")
            # denom.drop('bundle_id', axis=1, inplace=True)
            denom_cols = sorted(denom.filter(regex="[0-9]$").columns)
            for col in denom_cols:
                denom.rename(columns={col: col + "_denominator"},
                             inplace=True)

            pre = df.shape[0]
            df = df.merge(denom, how='left',
                          on=['location_id', 'year_start', 'year_end',
                              'age_start', 'age_end', 'sex_id'])
            assert pre == df.shape[0], ("shape should not have changed "
                                        "during merge")

            # print(df[df.mean_0_denominator.isnull()].shape)
            # print(df[df.mean_0_denominator.isnull()])
            df = df[(df['mean_0'] > 0) | (df['mean_0_denominator'].notnull())]
            assert df.mean_0_denominator.isnull().sum() == 0,\
                "shouldn't be any null values in this column"

            # regex to find the columns that start with l, m or u and end
            # with a digit
            num_cols = sorted(df.filter(regex="^[lmu].*[0-9]$").columns)
            denom_cols =\
                sorted(df.columns[df.columns.str.endswith("denominator")])

            # divide each bundle value by bundle 1010 to get the adjusted rate
            for i in np.arange(0, 12, 1):
                df[num_cols[i]] = df[num_cols[i]] / df[denom_cols[i]]

            # drop the denominator columns
            df.drop(denom_cols, axis=1, inplace=True)

            # can't divide by zero
            df = df[df['sample_size'] != 0]

            # RETURN ONLY THE MATERNAL DATA
            return df

        df = adjust_maternal_denom(df)

    return df
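# The following sketch is illustrative only and is not called by the
# pipeline; the numbers and the helper name are hypothetical. It shows the
# arithmetic performed at the end of adjust_maternal_denom: each maternal
# bundle rate is divided by the matching rate of the denominator bundle
# (1010) for the same demographic group.
def _maternal_adjustment_sketch():
    bundle_rate = 0.0004  # e.g. mean_0 for a maternal bundle (hypothetical)
    denom_rate = 0.02     # e.g. mean_0_denominator from bundle 1010 (hypothetical)
    # the adjusted value re-expresses the bundle per unit of the maternal
    # denominator rather than per total population: 0.0004 / 0.02 = 0.02
    adjusted = bundle_rate / denom_rate
    return adjusted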