def save_birth_count_estimates(
        gbd_round_id: int,
        decomp_step: str,
        cov_estimate_filepath: pathlib.PosixPath,
        location_set_id: int,
        most_detailed_locs: Set[int]) -> None:
    """Pull live-birth covariate estimates, collapse sexes and write a CSV.

    Args:
        gbd_round_id: GBD round passed through to the covariate query.
        decomp_step: decomp step passed through to the covariate query.
        cov_estimate_filepath: destination of the CSV that is written.
        location_set_id: location set passed through to the covariate query.
        most_detailed_locs: locations handed to
            ``_filter_to_most_detailed_locs`` before the file is written.
    """
    id_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    value_col = mmr_constants.Columns.LIVE_BIRTH_VALUE_COL

    births = get_covariate_estimates(
        mmr_constants.LIVE_BIRTHS_COVARIATE_ID,
        decomp_step=decomp_step,
        gbd_round_id=gbd_round_id,
        year_id=mmr_constants.OUTPUT_YEARS,
        age_group_id=mmr_constants.ALL_MOST_DETAILED_AGE_GROUP_IDS,
        location_set_id=location_set_id)

    # The covariate is tagged to the sex of the baby, but the MMR numerator
    # is females only: relabel every row as female (sex_id 2) so that the
    # groupby below adds the sexes together.
    births.loc[:, 'sex_id'] = 2
    summed = births.loc[:, id_cols + [value_col]].groupby(id_cols).sum()
    summed = summed.reset_index()

    detailed = _filter_to_most_detailed_locs(summed, most_detailed_locs)
    detailed.to_csv(cov_estimate_filepath, index=False)
def _get_covariates(self):
    """Return covariate 881 per location with its mean stored as ``sdi``.

    The query is restricted to this object's GBD round, locations and
    years. Only ``location_id`` and ``sdi`` are kept, on a fresh index.
    """
    estimates = get_covariate_estimates(
        covariate_id=881,
        gbd_round_id=self._gbd_round_id,
        location_id=self.location_ids,
        year_id=self.year_id)
    estimates = estimates.rename(columns={'mean_value': 'sdi'})
    sdi = estimates[['location_id', 'sdi']]
    return sdi.reset_index(drop=True)
def get_live_births_summaries(location_ids, year_ids):
    """Return live births by sex plus each row's share of both-sex births.

    Args:
        location_ids: locations passed to the covariate query.
        year_ids: years passed to the covariate query.

    Returns:
        DataFrame with ``location_id``, ``year_id``, ``sex_id``,
        ``mean_value`` (births), ``mean_value_both`` (male + female births
        for that location-year), ``birth_prop`` (their ratio) and the
        columns of ``get_ids('sex')`` merged on ``sex_id``.

    Raises:
        AssertionError: if some row has no both-sex total to merge with.
    """
    # best model_version_id at time of upload - 24083
    lvbrth_cov_id = 1106  # live births by sex covariate
    key_cols = ['location_id', 'year_id']

    births = get_covariate_estimates(covariate_id=lvbrth_cov_id,
                                     gbd_round_id=5,
                                     location_id=location_ids,
                                     year_id=year_ids,
                                     sex_id=[3, 2, 1])
    births = births[key_cols + ['sex_id', 'mean_value']]

    # The covariate currently returns the most detailed sexes only, so the
    # both-sex total has to be built here. sex_id 3 is part of the request,
    # so build the total from the sex-specific rows alone: if both-sex rows
    # are ever returned they must not be added in a second time.
    sex_specific = births[births['sex_id'].isin([1, 2])]
    both_sexes = (sex_specific.groupby(key_cols)['mean_value']
                  .sum()
                  .reset_index())

    births = pd.merge(births,
                      both_sexes,
                      how='left',
                      on=key_cols,
                      suffixes=('', '_both'),
                      indicator=True)
    assert (births._merge == 'both').all()
    births = births.drop('_merge', axis=1)
    births['birth_prop'] = births['mean_value'] / births['mean_value_both']

    # merge in 'sex' column because data in eurocat spreadsheet does not
    # have sex_ids
    sex_meta = get_ids('sex')
    return pd.merge(births, sex_meta, on='sex_id')
def add_covariate(df, covariate_id, covariate_column_name, by_sex=True):
    """Left-merge one covariate onto ``df`` as ``covariate_column_name``.

    Args:
        df: frame keyed by ``country`` (it must not have ``location_id``).
        covariate_id: negative ids are read through
            ``get_flat_covariate_estimates``; all other ids come from
            ``db_queries.get_covariate_estimates``.
        covariate_column_name: name of the covariate column in the result.
        by_sex: when false, ``sex_id`` is left out of the merge keys.

    Returns:
        ``df`` with the covariate column attached.
    """
    assert 'location_id' not in df.columns, "Unexpected df structure: has location_id"
    assert 'country' in df.columns, "Unexpected df structure: lacks country"

    if by_sex:
        merge_cols = ['country', 'year_id', 'sex_id']
    else:
        merge_cols = ['country', 'year_id']

    if covariate_id < 0:
        # Covariates delivered as flat files (.csv) are given negative ids
        # by hand; the flat-file reader also decides the merge keys.
        cov_df, merge_cols = get_flat_covariate_estimates(covariate_id)
    else:
        db_estimates = db_queries.get_covariate_estimates(
            covariate_id, decomp_step="step1")
        cov_df = db_estimates.rename(columns={
            'location_id': 'country',
            'mean_value': covariate_column_name,
        })

    cov_df = cov_df[merge_cols + [covariate_column_name]]
    report_duplicates(cov_df, merge_cols)
    merged = df.merge(cov_df, on=merge_cols, how='left')

    # As of 2/7/19 the flat files cover 1990 - 2017, so the missing 1980s
    # would be reported as a merge failure; that gap is handled in the
    # regression setup, so only positive ids are checked.
    if covariate_id > 0:
        report_if_merge_fail(merged, covariate_column_name, merge_cols)
    return merged
def pull_asfr(self):
    """Return covariate 13 (ASFR) limited to this object's demographics.

    All estimates for the covariate are pulled, then rows are kept only
    when their age group, year and sex are in ``self.age_group_ids``,
    ``self.year_id`` and ``self.sex_id``.

    Returns:
        DataFrame with ``self.index_cols`` plus ``mean_value``.
    """
    logger.info("Pulling ASFR...")
    asfr_df = get_covariate_estimates(covariate_id=13)
    keep = (asfr_df.age_group_id.isin(self.age_group_ids)
            & asfr_df.year_id.isin(self.year_id)
            & asfr_df.sex_id.isin(self.sex_id))
    # .loc replaces the former .ix indexer, which pandas 1.0 removed; for a
    # boolean mask the two select the same rows.
    asfr_df = asfr_df.loc[keep]
    return asfr_df[self.index_cols + ['mean_value']]
def pull_reshape_tfr(gbd_round_id, tfr_version, location_ids):
    """Combine last-past-year and forecast TFR into one table-ready frame.

    Covariate 149 is pulled for ``YEARS.past_end`` (mean, lower, upper) and
    turned into a data array, the forecast TFR file is opened and
    subset, and the two are concatenated along ``year_id``. The result is
    flattened, made wide by quantile, passed through ``combine_mean_ui`` and
    finally reduced to the ``YEARS.past_end`` and ``YEARS.forecast_end``
    values, which are merged.

    Args:
        gbd_round_id (int): GBD round.
        tfr_version (str): Forecast TFR version.
        location_ids (list): List of location IDs to pull from both past
            and future data.

    Returns:
        tfr_final_df (pandas dataframe): Dataframe with the past-end and
            forecast-end TFR values, reshaped for downstream table
            production.
    """
    p_end = YEARS.past_end
    f_end = YEARS.forecast_end

    # Past: best estimates for the last past year
    past_estimates = get_covariate_estimates(covariate_id=149,
                                             gbd_round_id=gbd_round_id,
                                             location_id=location_ids,
                                             year_id=p_end,
                                             status="best")
    past_cols = ["year_id", "location_id", "mean_value", "lower_value",
                 "upper_value"]
    tfr_past = past_estimates[past_cols].rename(columns={
        "mean_value": "mean",
        "lower_value": "lower",
        "upper_value": "upper",
    })
    tfr_past_da = melt_to_xarray(tfr_past)

    # Future: forecast file, limited to the requested locations, the
    # configured scenarios and the forecast years
    forecast_path = (f"{gbd_round_id}/future/tfr/"
                     f"{tfr_version}/tfr_combined.nc")
    tfr_future = open_xr(forecast_path).data
    tfr_future = tfr_future.sel(location_id=location_ids,
                                scenario=SCENARIOS,
                                year_id=YEARS.forecast_years)

    # Concatenate, flatten and make the quantiles wide
    combined_da = xr.concat([tfr_past_da, tfr_future], dim="year_id")
    long_df = combined_da.to_dataframe().reset_index()
    wide_df = long_df.pivot_table(
        values="value",
        index=["location_id", "year_id", "scenario"],
        columns="quantile").reset_index()

    # Combine value and UI into one column
    wide_df = combine_mean_ui(wide_df, df_type="tfr")

    # Past-end values use scenario 0 only; forecast-end keeps all scenarios
    past_rows = wide_df.query(f"year_id == {p_end} and scenario==0")
    future_rows = wide_df.query(f"year_id == {f_end}")
    past_table = pivot_scenarios(past_rows, f"{p_end}", SCENARIO_MAP,
                                 df_type="tfr")
    future_table = pivot_scenarios(future_rows, f"{f_end}", SCENARIO_MAP,
                                   df_type="tfr")

    tfr_final_df = past_table.merge(future_table)
    return tfr_final_df
def get_5_year_haqi_cf(gbd_round_id, decomp_step, min_treat=0.1,
                       max_treat=0.75):
    """Build 5-year-band HAQi correction factors from covariate 1099.

    The covariate mean is capped at ``max_treat`` and rescaled linearly so
    that the smallest value in the data maps to ``min_treat`` and the cap
    maps to 1. Years are limited to 1988-2017, binned with
    ``hosp_prep.year_binner`` and averaged per location and band. The
    factor is meant to be divided into mean_raw to adjust estimates up.

    Parameters:
        gbd_round_id: GBD round passed to the covariate query.
        decomp_step: decomp step passed to the covariate query.
        min_treat: float
            minimum access. Floor of the CF; with 0.1 the lowest possible
            CF is 0.1, in practice a 10x increase in the estimate.
        max_treat: float or int
            maximum access. Cap of the CF; any loc/year with a covariate
            above it gets a CF of 1 and its data is unchanged. When the
            covariate mean is above 1 and this is below 1 it is multiplied
            by 100 (with a warning).

    Returns:
        DataFrame with ``location_id``, ``year_start``, ``year_end`` and
        ``haqi_cf``.
    """
    haqi = get_covariate_estimates(covariate_id=1099,
                                   gbd_round_id=gbd_round_id,
                                   decomp_step=decomp_step)
    haqi = haqi.rename(columns={'year_id': 'year_start'})

    # A column mean above 1 is taken to mean the covariate runs 0-100, so
    # a fractional max_treat is moved onto that scale.
    if haqi.mean_value.mean() > 1 and max_treat < 1:
        template = ("Increasing max_treat variable 100X. Mean of the HAQi "
                    "column is larger than 1. We assume this means the "
                    "range is from 0 to 100. Summary stats for the "
                    "mean_value column in the haqi covar are \n {}")
        warnings.warn(template.format(haqi.mean_value.describe()))
        max_treat = max_treat * 100

    # Cap the covariate, then find the smallest value present in the data
    # (open question kept from the original: should this just be 0.1?)
    haqi.loc[haqi.mean_value > max_treat, 'mean_value'] = max_treat
    lowest = haqi.mean_value.min()

    haqi['haqi_cf'] = \
        min_treat + (1 - min_treat) * ((haqi['mean_value'] - lowest) /
                                       (max_treat - lowest))

    # Drop the years outside of hosp_data so year binner doesn't break
    haqi['year_end'] = haqi['year_start']
    warnings.warn("Currently dropping HAQi values before 1988 and after 2017")
    in_window = (haqi.year_start > 1987) & (haqi.year_start < 2018)
    haqi = hosp_prep.year_binner(haqi[in_window].copy())

    # Average within each location and 5 year band
    band_cols = ['location_id', 'year_start', 'year_end']
    haqi = haqi.groupby(band_cols).agg({'haqi_cf': 'mean'}).reset_index()

    assert haqi.haqi_cf.max() <= 1, "The largest haqi CF is too big"
    assert haqi.haqi_cf.min() >= min_treat, "The smallest haqi CF is too small"
    return haqi
def get_births(cfr_df):
    """Return live births (covariate 1106, sexes 1 and 2) for CFR locations.

    Descriptive covariate columns are dropped, ``year_id``, ``mean_value``
    and ``sex_id`` become ``year``, ``births`` and ``sex``, and rows are
    limited to the years 1990, 1995, 2000, 2005, 2010 and 2017 and to the
    locations present in ``cfr_df``.

    Args:
        cfr_df: frame whose ``location_id`` values select the locations.

    Returns:
        DataFrame indexed by ``col_list`` and sorted on that index.
    """
    births_df = get_covariate_estimates(covariate_id=1106,
                                        sex_id=[1, 2],
                                        gbd_round_id=5)
    births_df = births_df.drop(['model_version_id', 'covariate_id',
                                'covariate_name_short', 'location_name',
                                'age_group_id', 'age_group_name',
                                'lower_value', 'upper_value'], axis=1)
    # The former rename(index=str) is dropped: set_index below discards
    # the index it converted, so it had no effect on the result.
    births_df = births_df.rename(columns={"year_id": "year",
                                          "mean_value": "births",
                                          "sex_id": "sex"})

    wanted_years = [1990, 1995, 2000, 2005, 2010, 2017]
    births_df = births_df.loc[births_df['year'].isin(wanted_years)]
    births_df = births_df.loc[
        births_df['location_id'].isin(cfr_df['location_id'].unique())]

    # NOTE(review): col_list is not defined in this function; presumably
    # a module-level list of index columns -- confirm it exists.
    births_df = births_df.set_index(col_list)
    # sort_index() replaces sortlevel(), which pandas 1.0 removed; both
    # order the rows by the index levels.
    return births_df.sort_index()
def adjust_pms(self):
    '''Adjust PMS (Pre-menstrual Syndrome) draws for pregnancy.

    Covariate 13 (stored as ``asfr_mean``) and covariate 2267 (stored as
    ``sbr_mean``) are combined into

        prop = (asfr_mean + asfr_mean * sbr_mean) * 46 / 52

    and every draw column is multiplied by ``1 - prop``. The input draws are
    read from ``self.me_dict`` under the "tot" source key and the adjusted
    copy is stored under the "adj" target key; nothing is returned.
    '''
    pms_key = self.me_map["pms"]["srcs"]["tot"]
    adj_key = self.me_map["pms"]["trgs"]["adj"]
    covariate_index_dimensions = [
        'age_group_id', 'location_id', 'sex_id', 'year_id'
    ]
    pms_df = self.me_dict[pms_key]

    asfr_df = get_covariate_estimates(13, decomp_step=self.decomp_step)
    sbr_df = get_covariate_estimates(2267, decomp_step=self.decomp_step)

    asfr_df = asfr_df.filter(covariate_index_dimensions + ['mean_value'])
    asfr_df = asfr_df.rename(columns={'mean_value': 'asfr_mean'})
    # Assumes the still birth rates covariate is reported for all_ages
    # and both sexes. Hence we ignore that and merge only on loc and year
    sbr_df = sbr_df.filter(['location_id', 'year_id', 'mean_value'])
    sbr_df = sbr_df.rename(columns={'mean_value': 'sbr_mean'})

    prop_df = asfr_df.merge(sbr_df, how='inner',
                            on=['location_id', 'year_id'])
    # Zero the still birth rate for sex_id 1. This has to be a single .loc
    # assignment: the earlier ``prop_df[mask].sbr_mean = 0`` wrote to a
    # temporary copy and left prop_df unchanged.
    prop_df.loc[prop_df.sex_id == 1, 'sbr_mean'] = 0
    # NOTE(review): 46 / 52 is presumably a weeks-per-year fraction --
    # confirm with the modeller.
    prop_df['prop'] = (prop_df.asfr_mean +
                       (prop_df.asfr_mean * prop_df.sbr_mean)) * 46 / 52
    prop_df.set_index(covariate_index_dimensions, inplace=True)

    adj_df = pms_df.copy()
    # measure_id leaves the index so the index-on-index merge lines up
    # with the covariate dimensions
    adj_df.reset_index(level='measure_id', inplace=True)
    adj_df = adj_df.merge(prop_df.prop, how='inner',
                          left_index=True, right_index=True)
    for col in self.draw_cols:
        adj_df[col] = adj_df[col] * (1 - adj_df.prop)
    adj_df.drop(columns='prop', inplace=True)
    self.me_dict[adj_key] = adj_df
def get_live_births_summaries(self):
    """Store live births by sex (covariate 1106) on ``self.lv_bir_frame``.

    The query uses ``self.gbdr`` and the location, year and sex ids held in
    ``self.dim_births``. The stored frame has ``location_id``, ``year_id``,
    ``sex_id`` and ``births`` (the covariate mean). Nothing is returned.
    """
    lvbrth_cov_id = 1106  # live births by sex
    dims = self.dim_births
    estimates = get_covariate_estimates(
        covariate_id=lvbrth_cov_id,
        gbd_round_id=self.gbdr,
        location_id=dims['location_id'],
        year_id=dims['year_id'],
        sex_id=dims['sex_id'])
    kept = estimates[['location_id', 'year_id', 'sex_id', 'mean_value']]
    self.lv_bir_frame = kept.rename(columns={'mean_value': 'births'})
def run_shared_funcs(mat):
    """Pull the central inputs: population and covariates 13 and 51.

    Args:
        mat: frame whose unique ``location_id`` and ``age_group_id`` values
            define the locations and ages that are requested.

    Returns:
        Tuple ``(pop, asfr, ifd)``: population for sex_id 2 over 1988-2017,
        covariate 13 (ASFR) for the same locations, ages and years, and
        covariate 51 (IFD) with no demographic restriction.
    """
    year_ids = list(np.arange(1988, 2018, 1))
    location_ids = mat.location_id.unique().tolist()
    age_group_ids = mat.age_group_id.unique().tolist()

    population = get_population(age_group_id=age_group_ids,
                                location_id=location_ids,
                                year_id=year_ids,
                                sex_id=[2])

    # ASFR is requested by age/location/year; IFD is pulled whole
    asfr = get_covariate_estimates(covariate_id=13,
                                   location_id=location_ids,
                                   age_group_id=age_group_ids,
                                   year_id=year_ids)
    ifd = get_covariate_estimates(covariate_id=51)

    return population, asfr, ifd
def get_gbd_covariate_estimates(
        covariate_id: int,
        covariate_name_short: str,
        location_set_id: int,
        year_ids: List[int],
        gbd_round_id: int,
        decomp_step: str
) -> pd.DataFrame:
    """Pull the estimates of one GBD covariate.

    Args:
        covariate_id: ID of the covariate to pull.
        covariate_name_short: short name of the covariate; the mean value
            column is renamed to it.
        location_set_id: location set to pull estimates for.
        year_ids: years to pull estimates for.
        gbd_round_id: GBD round to pull estimates for.
        decomp_step: decomp step to pull estimates for; it is only sent
            with the query for GBD round 6 and later.

    Returns:
        Dataframe with the demographic columns and one column of covariate
        estimates named ``covariate_name_short``.

    Raises:
        ValueError: if the query returns no rows, i.e. the covariate has no
            best values for the given GBD round and decomp step.
    """
    logging.info(f'Pulling covariate {covariate_name_short}')

    step_for_query = decomp_step if gbd_round_id >= 6 else None
    raw_estimates = db_queries.get_covariate_estimates(
        covariate_id,
        location_set_id=location_set_id,
        year_id=year_ids,
        gbd_round_id=gbd_round_id,
        decomp_step=step_for_query)
    renamed = raw_estimates.rename(
        columns={columns.MEAN_VALUE: covariate_name_short})
    covariate_df = renamed[columns.DEMOGRAPHICS + [covariate_name_short]]

    if not covariate_df.empty:
        return covariate_df
    raise ValueError(
        f'No best values for covariate {covariate_name_short} for '
        f'gbd round ID {gbd_round_id}, decomp step {decomp_step}'
    )
def smoking(hierarchy: pd.DataFrame) -> pd.DataFrame:
    """Return 2019 smoking prevalence by location, filled down a hierarchy.

    Covariate 282 (age-standardized, both sexes) is pulled for GBD round 6,
    decomp step 'iterative', indexed by ``location_id`` under the column
    name ``smoking`` and passed through ``parent_inheritance``.

    Args:
        hierarchy: location hierarchy handed to ``parent_inheritance``.

    Returns:
        The inherited data after ``squeeze()``.
    """
    estimates = db_queries.get_covariate_estimates(
        covariate_id=282,  # Smoking Prevalence (Age-standardized, both sexes)
        year_id=2019,
        gbd_round_id=6,
        decomp_step='iterative',
    )
    by_location = estimates.loc[:, ['location_id', 'mean_value']]
    by_location = by_location.rename(columns={'mean_value': 'smoking'})
    by_location = by_location.set_index('location_id').sort_index()

    # pass down hierarchy
    inherited = parent_inheritance(by_location, hierarchy)
    return inherited.squeeze()
def uhc(hierarchy: pd.DataFrame) -> pd.DataFrame:
    """Return 2019 universal health coverage by location, filled down.

    Covariate 1097 is pulled for GBD round 6, decomp step 'iterative',
    indexed by ``location_id`` under the column name ``uhc`` and passed
    through ``parent_inheritance``.

    Args:
        hierarchy: location hierarchy handed to ``parent_inheritance``.

    Returns:
        The inherited data after ``squeeze()``.
    """
    estimates = db_queries.get_covariate_estimates(
        covariate_id=1097,  # Universal health coverage
        year_id=2019,
        gbd_round_id=6,
        decomp_step='iterative',
    )
    by_location = estimates.loc[:, ['location_id', 'mean_value']]
    by_location = by_location.rename(columns={'mean_value': 'uhc'})
    by_location = by_location.set_index('location_id').sort_index()

    # pass down hierarchy
    inherited = parent_inheritance(by_location, hierarchy)
    return inherited.squeeze()
def haq(hierarchy: pd.DataFrame) -> pd.DataFrame:
    """Return the 2019 healthcare access and quality index by location.

    Covariate 1099 is pulled for GBD round 6, decomp step 'iterative',
    indexed by ``location_id`` under the column name ``haq`` and passed
    through ``parent_inheritance``.

    Args:
        hierarchy: location hierarchy handed to ``parent_inheritance``.

    Returns:
        The inherited data after ``squeeze()``.
    """
    estimates = db_queries.get_covariate_estimates(
        covariate_id=1099,  # Healthcare access and quality index
        year_id=2019,
        gbd_round_id=6,
        decomp_step='iterative',
    )
    by_location = estimates.loc[:, ['location_id', 'mean_value']]
    by_location = by_location.rename(columns={'mean_value': 'haq'})
    by_location = by_location.set_index('location_id').sort_index()

    # pass down hierarchy
    inherited = parent_inheritance(by_location, hierarchy)
    return inherited.squeeze()
def obesity(hierarchy: pd.DataFrame) -> pd.DataFrame:
    """Return 2019 obesity prevalence by location, filled down a hierarchy.

    Covariate 455 (age-standardized) is pulled for GBD round 6, decomp step
    'iterative'. All rows of a location are averaged into a single
    ``obesity`` value (a plain mean across sexes) before the frame is
    passed through ``parent_inheritance``.

    Args:
        hierarchy: location hierarchy handed to ``parent_inheritance``.

    Returns:
        The inherited data after ``squeeze()``.
    """
    estimates = db_queries.get_covariate_estimates(
        covariate_id=455,  # Prevalence of obesity (age-standardized)
        year_id=2019,
        gbd_round_id=6,
        decomp_step='iterative',
    )

    # just averaging sexes here...
    location_means = estimates.groupby('location_id')['mean_value'].mean()
    by_location = location_means.rename('obesity').to_frame()

    # pass down hierarchy
    inherited = parent_inheritance(by_location, hierarchy)
    return inherited.squeeze()
def get_asfr(self):
    '''Pull the age-specific fertility rate used in the live birth
    calculation.

    Best estimates of covariate 13 are requested for sex_id 2 and this
    object's most detailed locations, most detailed ages and years. The
    returned frame has location_id, year_id, age_group_id, sex_id and asfr.
    '''
    covariate = get_covariate_estimates(
        covariate_id=13,
        location_id=self.most_detailed_locs,
        sex_id=2,
        age_group_id=self.most_detailed_ages,
        year_id=self.year_id,
        status='best')
    columns_to_keep = ['location_id', 'year_id', 'age_group_id', 'sex_id',
                       'asfr']
    asfr = covariate.rename(columns={'mean_value': 'asfr'})[columns_to_keep]

    # The most detailed age groups were pulled; age groups outside of our
    # maternal age range (age_group_ids other than 7..15) get an asfr of 0.
    outside_maternal_ages = ~asfr.age_group_id.isin(range(7, 16))
    asfr.loc[outside_maternal_ages, 'asfr'] = 0.
    return asfr
def get_asfr(self):
    '''Pull the age-specific fertility rate used in the live birth
    calculation.

    Best estimates of covariate 13 are requested for sex_id 2, this
    object's decomp step and its most detailed locations, most detailed
    ages and years. The returned frame has location_id, year_id,
    age_group_id, sex_id and asfr.
    '''
    covariate = get_covariate_estimates(
        covariate_id=13,
        location_id=self.most_detailed_locs,
        sex_id=2,
        age_group_id=self.most_detailed_ages,
        year_id=self.year_id,
        status='best',
        decomp_step=self.decomp_step)
    columns_to_keep = ['location_id', 'year_id', 'age_group_id', 'sex_id',
                       'asfr']
    asfr = covariate.rename(columns={'mean_value': 'asfr'})[columns_to_keep]

    # Maternal age range is 10 to 54 years old, which corresponds to
    # age_group_ids 7 to 15 (inclusive). Every age group outside of that
    # range is set to zero.
    outside_maternal_ages = ~asfr.age_group_id.isin(list(range(7, 16)))
    asfr.loc[outside_maternal_ages, 'asfr'] = 0.
    return asfr
def get_asfr(df, decomp_step):
    """Pull ASFR (covariate 13, sex_id 2) for the demographics in ``df``.

    The Stata script which applied the maternal adjustment is no longer
    used, so the adjustment is done in Python: sample size * asfr, then
    cases / sample size. This function only supplies the asfr values.

    Args:
        df: frame whose unique ``location_id``, ``age_group_id`` and
            ``year_start`` values define what is requested.
        decomp_step: decomp step passed to the covariate query.

    Returns:
        DataFrame with ``location_id``, ``year_start``, ``age_group_id``,
        ``sex_id`` and ``mean_value``.
    """
    requested = {
        'location_id': df['location_id'].unique().tolist(),
        'age_group_id': df['age_group_id'].unique().tolist(),
        'year_id': df['year_start'].unique().tolist(),
    }
    estimates = get_covariate_estimates(covariate_id=13,
                                        decomp_step=decomp_step,
                                        sex_id=2,
                                        **requested)

    keep = ['location_id', 'year_id', 'age_group_id', 'sex_id', 'mean_value']
    return estimates[keep].rename(columns={'year_id': 'year_start'})
def get_5_year_haqi_cf(min_treat=10, max_treat=75):
    """Build 5-year-band HAQi correction factors from covariate 1099.

    The covariate mean is capped at ``max_treat`` and rescaled linearly so
    that the smallest value in the data maps to the floor and the cap maps
    to 1. Years after 1987 are binned with ``hosp_prep.year_binner`` and
    averaged per location and band. The factor is meant to be divided into
    mean_raw to adjust estimates.

    Parameters:
        min_treat: float or int
            minimum access, i.e. the floor of the CF. A value above 1 is
            read as a percentage, so the default 10 gives a lowest
            possible CF of 0.1. Values up to 1 are used as given.
        max_treat: float or int
            maximum access. Cap of the CF; with 75, any loc/year with a
            covariate above 75 gets a CF of 1 and its data is unchanged.

    Returns:
        DataFrame with ``location_id``, ``year_start``, ``year_end`` and
        ``haqi_cf``.

    Raises:
        AssertionError: if a band average falls outside [floor, 1].
    """
    # get a dataframe of haqi covariate estimates
    df = get_covariate_estimates(covariate_id=1099)
    df = df.rename(columns={'year_id': 'year_start'})

    # The CF is a fraction with an upper end of 1. Used unscaled, the
    # default min_treat=10 made it run from 10 down to 1, which the
    # assertions at the end reject, so put the floor on the 0-1 scale.
    cf_floor = min_treat / 100.0 if min_treat > 1 else min_treat

    # set the max value
    df.loc[df.mean_value > max_treat, 'mean_value'] = max_treat
    # get min df present in the data
    min_df = df.mean_value.min()

    # make the correction
    df['haqi_cf'] = \
        cf_floor + (1 - cf_floor) * ((df['mean_value'] - min_df) /
                                     (max_treat - min_df))

    # drop the early years so year binner doesn't break
    df['year_end'] = df['year_start']
    df = df[df.year_start > 1987].copy()
    df = hosp_prep.year_binner(df)

    # Take the average of each 5 year band
    df = df.groupby(['location_id', 'year_start', 'year_end']).agg({
        'haqi_cf': 'mean'
    }).reset_index()

    assert df.haqi_cf.max() <= 1, "The largest haqi CF is too big"
    assert df.haqi_cf.min() >= cf_floor, "The smallest haqi CF is too small"
    return df
def get_covariates(covariate_id, covariate_model_id, location_set_version_id,
                   gbd_round_id, decomp_step_id, db_connection,
                   standard_location_set_version_id):
    """Return one covariate model's estimates, named after the covariate.

    Estimates are pulled for the given covariate, model version, location
    set version, GBD round and decomp step, then limited to the locations
    returned by ``get_location_info``.

    Returns:
        DataFrame with, in order, the estimate column (named with the
        covariate's short name), ``age``, ``sex``, ``year``,
        ``location_id`` and ``name``.
    """
    logger.info('Getting covariate estimates for covariate_id {} and '
                'covariate_model_id {} and decomp step {}.'.format(
                    covariate_id, covariate_model_id, decomp_step_id))

    location_info = get_location_info(
        location_set_version_id=location_set_version_id,
        standard_location_set_version_id=standard_location_set_version_id,
        db_connection=db_connection)
    valid_locations = location_info.location_id.values.tolist()

    decomp_step = decomp_step_from_decomp_step_id(decomp_step_id)
    estimates = get_covariate_estimates(
        covariate_id=covariate_id,
        model_version_id=covariate_model_id,
        location_set_version_id=location_set_version_id,
        decomp_step=decomp_step,
        gbd_round_id=gbd_round_id)
    estimates = estimates.loc[estimates.location_id.isin(valid_locations)]

    estimates = estimates[['mean_value', 'age_group_id', 'sex_id', 'year_id',
                           'location_id', 'covariate_name_short']]
    # The estimate column takes the covariate's own short name, read from
    # the first row.
    value_name = estimates['covariate_name_short'].values[0]
    return estimates.rename(columns={
        'mean_value': value_name,
        'age_group_id': 'age',
        'sex_id': 'sex',
        'year_id': 'year',
        'covariate_name_short': 'name',
    })
def get_live_births():
    """Return live births (covariate 1106) plus a combined Andhra Pradesh.

    The rows of locations 4841 and 4871 (Telangana and Andhra Pradesh) are
    summed into an extra location 44849, "Old Andhra Pradesh", and appended.
    The result is sorted by location, year and sex and gains three
    convenience columns: ``sex`` (a label for sex_id 1, 2 or 3), ``year``
    and ``births``.
    """
    births = get_covariate_estimates(covariate_id=1106)

    id_cols = ['model_version_id', 'covariate_id', 'covariate_name_short',
               'location_id', 'location_name', 'year_id', 'age_group_id',
               'age_group_name', 'sex_id']
    value_cols = ['mean_value', 'lower_value', 'upper_value']

    # Give both states the combined location's identity so that grouping
    # on the id columns adds their values together.
    combined = births.loc[births['location_id'].isin([4841, 4871])].copy(deep=True)
    combined['location_id'] = 44849
    combined['location_name'] = "Old Andhra Pradesh"
    combined = combined.groupby(id_cols)[value_cols].sum().reset_index()

    births = pd.concat([births, combined]).reset_index(drop=True)
    births = births.sort_values(['location_id', 'year_id', 'sex_id'])

    for sex_id, label in ((1, "male"), (2, "female"), (3, "both")):
        births.loc[births['sex_id'] == sex_id, 'sex'] = label
    births['year'] = births['year_id']
    births['births'] = births['mean_value']
    return births
def generate_covariate_data():
    """Return covariate 881 for age group 22 in 2016, one row per estimate.

    Returns:
        An independent DataFrame with ``location_id`` and ``sdi_value``
        (the covariate mean).
    """
    estimates = get_covariate_estimates(covariate_id=881)
    wanted = (estimates.age_group_id == 22) & (estimates.year_id == 2016)
    subset = estimates[wanted].rename(columns={'mean_value': 'sdi_value'})
    return subset[['location_id', 'sdi_value']].copy(deep=True)
def execute(self):
    """Submit the per-location adjustment jobs and the final save job.

    Steps, in order:
      1. (Re)create the output directory ``<base>/<trg_meid>``; an existing
         directory is deleted first.
      2. Write location metadata and covariate 46 estimates to CSV files
         in ``base`` so the individual jobs do not have to query the
         database.
      3. Submit one Stata qsub job per most detailed location, each held
         on ``self.hold``.
      4. Submit a Python save job that is held on all of those jobs.

    Relies on the names ``base``, ``output_path`` and ``error_path``, which
    are not defined in this method.

    Returns:
        The name of the save job.
    """
    # Source modelable entity / model version and target modelable entity
    src_meid = 2929
    src_mvid = 326309
    trg_meid = 18777
    out_dir = os.path.join(base, str(trg_meid))
    # Always start from an empty output directory
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
        os.makedirs(out_dir)
    else:
        os.makedirs(out_dir)
    data_dir = base  # where the loc_meta data will go
    # export data for individual job import to reduce hits to database
    # location_set_id 22 has the same locations as 35
    keeps = [
        'location_id', 'super_region_id', 'super_region_name', 'region_id',
        'region_name', 'ihme_loc_id'
    ]
    loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=5)
    # The job list is taken before the metadata is cut down to `keeps`,
    # because `most_detailed` is not one of the kept columns.
    loc_list = loc_meta.loc[loc_meta.most_detailed == 1,
                            'location_id'].tolist()
    loc_meta = loc_meta[keeps]
    loc_meta.to_csv(os.path.join(data_dir, 'location_metadata.csv'),
                    index=False,
                    encoding='utf8')
    iod_salt_cov = 46
    iod_salt = get_covariate_estimates(covariate_id=iod_salt_cov,
                                       gbd_round_id=5)
    iod_salt.to_csv(os.path.join(data_dir, 'iod_salt_cov.csv'),
                    index=False,
                    encoding='utf8')
    # Comma-joined names of every submitted job; becomes the save job's
    # hold list.
    # NOTE(review): the string starts with a leading comma (",job1,job2");
    # confirm that qsub -hold_jid accepts that.
    job_string = ''
    for loc in loc_list:
        job_name = "adjust_iodID_{}".format(loc)
        job_string = job_string + "," + job_name
        call = ('qsub -hold_jid {hj} -pe multi_slot 2'
                ' -cwd -P proj_custom_models'
                ' -o {o}'
                ' -e {e}'
                ' -N {jn}'
                ' stata_shell.sh'
                ' prep_cretin_dismod_output_for_upload.do'
                ' {arg1} {arg2} {arg3} {arg4}'.format(hj=self.hold,
                                                      o=output_path,
                                                      e=error_path,
                                                      jn=job_name,
                                                      arg1=out_dir,
                                                      arg2=data_dir,
                                                      arg3=loc,
                                                      arg4=src_mvid))
        # The return code of the submission is not checked
        subprocess.call(call, shell=True)
    # Save the results
    save_hold = job_string
    iod_adj_description = "Adjustments made on meid {}, mvid {}".format(
        src_meid, src_mvid)
    # Command-line arguments handed to save.py; the description is wrapped
    # in single quotes so the shell passes it as one argument.
    save_params = [
        str(trg_meid), "--description",
        "\'{}\'".format(iod_adj_description), "--input_dir", out_dir,
        "--best", "--sexes", "1", "2", "--meas_ids", "5", "--file_pattern",
        "{measure_id}_{location_id}_{year_id}_{sex_id}.csv"
    ]
    save_job_name = 'iod_adj_save_{}'.format(trg_meid)
    call = ('qsub -hold_jid {hj} -pe multi_slot 13'
            ' -cwd -P proj_custom_models'
            ' -o {o}'
            ' -e {e}'
            ' -N {jn}'
            ' python_shell.sh'
            ' save.py'
            ' {arg1}'.format(hj=save_hold,
                             o=output_path,
                             e=error_path,
                             jn=save_job_name,
                             arg1=' '.join(save_params)))
    subprocess.call(call, shell=True)
    return save_job_name
def fix_maternal_denominators(df, return_only_maternal=False):
    """Divide the rates of maternal causes by ASFR.

    ASFR is pulled, its age groups and years are reshaped to the
    ``age_start``/``age_end``/``year_start``/``year_end`` layout of ``df``
    and it is merged on. For rows whose cause has 366 in its
    ``path_to_top_parent``, ``product``, ``upper_product`` and
    ``lower_product`` are divided by the ASFR ``mean_value``.

    Args:
        df: hospital data with ``cause_id``, the demographic merge columns
            and the three product columns.
        return_only_maternal (bool): return only the maternal rows instead
            of the full data.

    Returns:
        DataFrame without ``mean_value`` and ``cause_id``: either the
        maternal rows only, or the non-maternal rows followed by the
        adjusted maternal rows.

    Raises:
        AssertionError: if a merge changes the row count, no ASFR value is
            attached, or either the maternal or the non-maternal subset is
            empty.
    """
    asfr = get_covariate_estimates(QUERY)

    # keep age/location/year and the critical mean_value
    asfr = asfr[['location_id', 'year_id', 'age_group_id', 'sex_id',
                 'mean_value']].drop_duplicates()

    # map age_start and age_end onto asfr
    age_group = query("QUERY")
    pre_asfr = asfr.shape[0]
    asfr = asfr.merge(age_group, how='left', on='age_group_id')
    assert pre_asfr == asfr.shape[0],\
        "The merge duplicated rows unexpectedly"
    asfr = asfr.drop('age_group_id', axis=1)
    asfr = asfr.rename(columns={'age_group_years_start': 'age_start',
                                'age_group_years_end': 'age_end'})

    # create year_start and year_end
    asfr['year_start'] = asfr['year_id']
    asfr['year_end'] = asfr['year_id']
    asfr = asfr.drop('year_id', axis=1)

    # all the mean_values in asfr where age_end is less than one are 0, so
    # we can make up an asfr group for age start = 0 and age_end = 1
    asfr.loc[asfr['age_end'] < 1, 'age_end'] = 1
    asfr.loc[asfr['age_start'] < 1, 'age_start'] = 0
    # every other age_end moves down by one to the convention of df
    older = asfr['age_end'] > 1
    asfr.loc[older, 'age_end'] = asfr.loc[older, 'age_end'] - 1
    # one more change, asfr has the max age end as 125 (now 124), and we
    # want it to be 99
    asfr.loc[asfr['age_end'] == 124, 'age_end'] = 99
    # collapsing the under-one groups can create duplicated rows
    asfr = asfr.drop_duplicates()

    # MERGE ASFR ONTO HOSP
    pre_shape = df.shape[0]
    df = df.merge(asfr, how='left', on=['age_start', 'age_end', 'year_start',
                                        'year_end', 'location_id', 'sex_id'])
    assert df.mean_value.isnull().sum() != df.shape[0],\
        "The merge failed to attach any mean_values"
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # GET MATERNAL CAUSES
    causes = get_cause_metadata(QUERY)
    # Match 366 as a whole element of the comma-separated path: a plain
    # substring test would also accept ids that merely contain those
    # digits (e.g. 1366 or 3660).
    condition = causes.path_to_top_parent.str.contains(
        r"(?:^|,)\s*366\s*(?:,|$)")
    maternal_causes = causes[condition]
    maternal_list = list(maternal_causes['cause_id'].unique())

    # Split the data; .copy() makes the maternal part an independent frame
    # so the assignments below never write through a slice of df.
    is_maternal = df['cause_id'].isin(maternal_list)
    maternal_df = df[is_maternal].copy()
    assert maternal_df.shape[0] != 0,\
        "The maternal dataframe is empty"
    df = df[~is_maternal]
    assert df.shape[0] != 0,\
        "The hospital dataframe is empty"

    # Every row of maternal_df has a maternal cause, so one column-wise
    # division replaces the former loop over causes.
    rate_cols = ['product', 'upper_product', 'lower_product']
    for col in rate_cols:
        maternal_df[col] = maternal_df[col] / maternal_df['mean_value']
    # some mean_values were zero, which is effectively an age/sex
    # restriction: null and infinite rates become 0. As before, only
    # `product` is inspected and all three columns are reset.
    maternal_df.loc[maternal_df['product'].isnull(), rate_cols] = 0
    maternal_df.loc[np.isinf(maternal_df['product']), rate_cols] = 0

    if return_only_maternal:
        return maternal_df.drop(['mean_value', 'cause_id'], axis=1)

    df = pd.concat([df, maternal_df])  # bring data back together
    # DROP ASFR info
    return df.drop(['mean_value', 'cause_id'], axis=1)
def fix_maternal_denominators(df, return_only_maternal=False):
    """Divide the rates of maternal causes by ASFR (covariate 13).

    At this point the data has bundle_id and cause_id on it but has not
    been collapsed to those levels. The steps are:

      1. pull ASFR from the database;
      2. attach age_start and age_end to it and create year_start and
         year_end out of year_id;
      3. attach ASFR to the hospital data;
      4. where cause_id is a maternal cause (366 in its
         ``path_to_top_parent``), divide ``product``, ``upper_product`` and
         ``lower_product`` by the ASFR ``mean_value``;
      5. drop the ASFR info, namely ``mean_value``.

    Args:
        df: hospital data with ``cause_id``, the demographic merge columns
            and the three product columns.
        return_only_maternal (bool): return only the maternal rows instead
            of the full data.

    Returns:
        DataFrame without ``mean_value`` and ``cause_id``: either the
        maternal rows only, or the non-maternal rows followed by the
        adjusted maternal rows.

    Raises:
        AssertionError: if a merge changes the row count, no ASFR value is
            attached, or either the maternal or the non-maternal subset is
            empty.
    """
    # GET ASFR
    # has age/location/year
    asfr = get_covariate_estimates(covariate_id=13)

    # keep age/location/year and the critical mean_value
    asfr = asfr[['location_id', 'year_id', 'age_group_id', 'sex_id',
                 'mean_value']].drop_duplicates()

    # map age_start and age_end onto asfr
    age_group = query("QUERY")
    pre_asfr = asfr.shape[0]
    asfr = asfr.merge(age_group, how='left', on='age_group_id')
    assert pre_asfr == asfr.shape[0],\
        "The merge duplicated rows unexpectedly"
    asfr = asfr.drop('age_group_id', axis=1)
    asfr = asfr.rename(columns={'age_group_years_start': 'age_start',
                                'age_group_years_end': 'age_end'})

    # create year_start and year_end
    asfr['year_start'] = asfr['year_id']
    asfr['year_end'] = asfr['year_id']
    asfr = asfr.drop('year_id', axis=1)

    # Subtracting 1 from every age_end would be wrong: asfr has three
    # under-one age groups while our data is just 0-1 years old, and an
    # age_end of 1 would become 0. All the mean_values in asfr where
    # age_end is less than one are 0, so we can make up an asfr group for
    # age start = 0 and age_end = 1 instead.
    asfr.loc[asfr['age_end'] < 1, 'age_end'] = 1
    asfr.loc[asfr['age_start'] < 1, 'age_start'] = 0
    # IMPORTANT: our data has age_end as 14, 19, 24, while asfr has age_end
    # as 15, 20, 25 ..., so every remaining age_end moves down by one.
    older = asfr['age_end'] > 1
    asfr.loc[older, 'age_end'] = asfr.loc[older, 'age_end'] - 1
    # one more change, asfr has the max age end as 125 (now 124), and we
    # want it to be 99
    asfr.loc[asfr['age_end'] == 124, 'age_end'] = 99
    # now asfr age_start and age_end match our hospital data; collapsing
    # the under-one groups can create duplicated rows
    asfr = asfr.drop_duplicates()

    # MERGE ASFR ONTO HOSP
    pre_shape = df.shape[0]
    df = df.merge(asfr, how='left', on=[
        'age_start', 'age_end', 'year_start', 'year_end', 'location_id',
        'sex_id'
    ])
    assert df.mean_value.isnull().sum() != df.shape[0],\
        "The merge failed to attach any mean_values"
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # GET MATERNAL CAUSES
    causes = get_cause_metadata(cause_set_id=9)
    # 366 happens to always be in the third level. Match it as a whole
    # element of the comma-separated path: a plain substring test would
    # also accept ids that merely contain those digits (e.g. 1366 or 3660).
    condition = causes.path_to_top_parent.str.contains(
        r"(?:^|,)\s*366\s*(?:,|$)")
    maternal_causes = causes[condition]
    maternal_list = list(maternal_causes['cause_id'].unique())

    # Split the data; .copy() makes the maternal part an independent frame
    # so the assignments below never write through a slice of df.
    is_maternal = df['cause_id'].isin(maternal_list)
    maternal_df = df[is_maternal].copy()
    assert maternal_df.shape[0] != 0,\
        "The maternal dataframe is empty"
    df = df[~is_maternal]
    assert df.shape[0] != 0,\
        "The hospital dataframe is empty"

    # Every row of maternal_df has a maternal cause, so one column-wise
    # division replaces the former loop over causes.
    rate_cols = ['product', 'upper_product', 'lower_product']
    for col in rate_cols:
        maternal_df[col] = maternal_df[col] / maternal_df['mean_value']
    # some mean_values were zero, which is effectively an age/sex
    # restriction: null and infinite rates become 0. As before, only
    # `product` is inspected and all three columns are reset.
    maternal_df.loc[maternal_df['product'].isnull(), rate_cols] = 0
    maternal_df.loc[np.isinf(maternal_df['product']), rate_cols] = 0

    if return_only_maternal:
        return maternal_df.drop(['mean_value', 'cause_id'], axis=1)

    df = pd.concat([df, maternal_df])  # bring data back together
    # DROP ASFR info
    return df.drop(['mean_value', 'cause_id'], axis=1)