def get_epi_inc(dim, cv, years, codes, ndraws):
    # grab the age ids
    age_ids = pd.read_csv(
        os.path.join(code_dir, "convert_to_new_age_ids.csv")
    ).rename(columns={'age_start': 'age'})

    # get the EN incidence -- not short-term or long-term because all
    # incidence is captured in short-term
    estim_sg = SuperGopher({'file_pattern': 'FILEPATH'},
                           os.path.join('FILEPATH'))
    estim_df = estim_sg.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=years,
        sex_id=dim.index_dim.get_level("sex_id"))

    # merge on age group ids
    estim_df = pd.merge(estim_df, age_ids, on='age')
    estim_df.drop('age', inplace=True, axis=1)

    # keep age group id 2 and triplicate it so that we have 3 sets;
    # this is to redistribute the incidence in under-1 age groups
    # with population fractions for age groups 2, 3 and 4
    todupe1 = estim_df.loc[estim_df['age_group_id'] == 2].copy()
    todupe1['age_group_id'] = 3
    todupe2 = estim_df.loc[estim_df['age_group_id'] == 2].copy()
    todupe2['age_group_id'] = 4
    estim_df = estim_df.append(todupe1)
    estim_df = estim_df.append(todupe2)

    # get the population -- don't query the database every time
    pops = pd.read_stata(os.path.join(root_j_dir, "FILEPATH"))

    # MAKE POPULATION FRACTIONS
    fullpops = pd.merge(pops, age_ids, on='age_group_id')
    fullpops['collapsed_age'] = fullpops['age']
    fullpops.loc[fullpops.age < 1, 'collapsed_age'] = 0
    pops = fullpops.copy()
    pops.loc[pops.age < 1, 'age'] = 0
    pops = pops[['location_id', 'year_id', 'sex_id', 'age', 'population']]
    pops = pops.groupby(
        ['location_id', 'year_id', 'sex_id', 'age']).sum().reset_index()
    pops = pops.rename(columns={'population': 'total_pop',
                                'age': 'collapsed_age'})
    popfracts = pd.merge(
        fullpops, pops,
        on=['location_id', 'year_id', 'collapsed_age', 'sex_id'])
    popfracts['pop_fraction'] = (
        popfracts['population'] / popfracts['total_pop'])
    popfracts = popfracts[['age_group_id', 'location_id', 'year_id',
                           'sex_id', 'pop_fraction']]

    # redistribute inc. under 1 (ALL OF THE OTHER POP FRACTIONS SHOULD BE 1)
    estim_df = pd.merge(
        estim_df, popfracts, how='left',
        on=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    cols = ["draw_{}".format(i) for i in range(1000)]
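# Illustrative sketch only (not part of the pipeline): shows how the under-1
# incidence that get_epi_inc duplicates into age_group_ids 2, 3 and 4 is
# scaled by population fractions, assuming the draws are counts so that the
# three scaled rows sum back to the original under-1 total. The helper name
# and all numbers below are made up; only the column names mirror the real
# data.
def _example_redistribute_under1():
    import pandas as pd

    # one under-1 incidence value, already copied to age groups 2, 3 and 4
    inc = pd.DataFrame({
        "age_group_id": [2, 3, 4],
        "draw_0": [0.09, 0.09, 0.09],
    })
    # toy population fractions: each group's share of the under-1 population
    fracts = pd.DataFrame({
        "age_group_id": [2, 3, 4],
        "pop_fraction": [0.02, 0.05, 0.93],
    })
    inc = inc.merge(fracts, on="age_group_id")
    inc["draw_0"] = inc["draw_0"] * inc["pop_fraction"]
    return inc  # draw_0 now sums to the original 0.09 across groups 2, 3, 4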
def compute_global_ratios(year_id, drawcols):
    eng = ezfuncs.get_engine(conn_def="cod")
    ccv = pd.read_sql(
        """
        SELECT output_version_id
        FROM cod.output_version
        WHERE code_version=4 AND is_best=1""",
        eng).squeeze()
    sg = SuperGopher(
        {'file_pattern': '{measure_id}_{location_id}.h5',
         'h5_tablename': 'draws'},
        'filepath/codcorrect/{ccv}/draws'.format(ccv=ccv))
    ylls = sg.content(location_id=1, year_id=year_id, sex_id=[1, 2],
                      measure_id=4)

    ratios = []
    for resid_cid, yldmap in rkey.groupby('input_cause_id'):
        # get the ylls
        these_ylls = ylls[ylls.cause_id == resid_cid]
        ratio_ylls = ylls[ylls.cause_id.isin(yldmap.ratio_cause_id.unique())]

        # aggregate the inputs to the appropriate level
        group_cols = ['age_group_id', 'year_id']
        these_ylls = these_ylls.groupby(group_cols)
        these_ylls = these_ylls[drawcols].sum().mean(axis=1)
        ratio_ylls = ratio_ylls.groupby(group_cols)
        ratio_ylls = ratio_ylls[drawcols].sum().mean(axis=1)

        # compute the ratio
        ratio = these_ylls / ratio_ylls
        ratio = ratio.reset_index()
        ratio = ratio.replace(np.inf, 0)
        ratio = ratio.replace(np.NaN, 0)
        ratio["cause_id"] = resid_cid
        ratios.append(ratio)

    df = pd.concat(ratios)
    df_male = df.copy()
    df_male["sex_id"] = 1
    df_female = df.copy()
    df_female["sex_id"] = 2
    return df_male.append(df_female)
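# Illustrative sketch only (not part of the pipeline): the core of
# compute_global_ratios is a residual-cause YLL ratio -- summed YLLs for the
# residual cause divided by summed YLLs for its mapped ratio causes, with
# divide-by-zero artifacts zeroed out. The helper name and the frame below
# are made up; drawcols normally holds draw_0 ... draw_999.
def _example_yll_ratio():
    import numpy as np
    import pandas as pd

    drawcols = ["draw_0", "draw_1"]
    ylls = pd.DataFrame({
        "cause_id":     [100, 100, 200, 200],
        "age_group_id": [2,   3,   2,   3],
        "draw_0":       [1.0, 0.0, 4.0, 0.0],
        "draw_1":       [3.0, 0.0, 6.0, 0.0],
    })
    # mean-of-draws for the residual cause (100) and its ratio cause (200)
    num = ylls[ylls.cause_id == 100].groupby(
        "age_group_id")[drawcols].sum().mean(axis=1)
    den = ylls[ylls.cause_id == 200].groupby(
        "age_group_id")[drawcols].sum().mean(axis=1)
    ratio = (num / den).replace([np.inf, np.nan], 0)
    return ratio  # age_group_id 2 -> 0.4; age_group_id 3 -> 0/0, zeroed out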
def get_annual_inc(dim, cv, years, codes, ndraws):
    location_id = dim.index_dim.get_level("location_id")[0]
    sex_id = dim.index_dim.get_level("sex_id")[0]
    year_id = dim.index_dim.get_level("year_id")[0]

    # get inpatient incidence
    annual_inp = SuperGopher({'file_pattern': 'FILEPATH'},
                             os.path.join('FILEPATH'))
    annual_inp_df = annual_inp.content(
        location_id=dim.index_dim.get_level("location_id"),
        sex_id=dim.index_dim.get_level("sex_id"))
    annual_inp_df = annual_inp_df.loc[annual_inp_df.year_id == year_id]

    # get outpatient incidence
    annual_otp = SuperGopher({'file_pattern': 'FILEPATH'},
                             os.path.join('FILEPATH'))
    annual_otp_df = annual_otp.content(
        location_id=dim.index_dim.get_level("location_id"),
        sex_id=dim.index_dim.get_level("sex_id"))
    annual_otp_df = annual_otp_df.loc[annual_otp_df.year_id == year_id]

    # bind the inpatient and outpatient data frames together and drop the
    # inpatient flag so downstream groupbys collapse over it
    annual_df = annual_otp_df.append(annual_inp_df)
    annual_df.drop(['inpatient'], inplace=True, axis=1)

    # injury-ize based on the gbd requirements
    annual_df = injize(annual_df, dim, cv, measure=6, years=years,
                       fixage=True, codes=codes, shock=True, ndraws=ndraws)
    return annual_df
def get_epi_prev(dim, cv, years, codes, ndraws):
    # grab the short-term EN prevalence
    estim_st = SuperGopher({'file_pattern': 'FILEPATH'},
                           os.path.join("FILEPATH"))
    estim_st_df = estim_st.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=years,
        sex_id=dim.index_dim.get_level("sex_id"))
    estim_st_df["term"] = "short-term"

    # grab the long-term EN prevalence
    estim_lt = SuperGopher({'file_pattern': 'FILEPATH'},
                           os.path.join("FILEPATH"))
    estim_lt_df = estim_lt.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=years,
        sex_id=dim.index_dim.get_level("sex_id"))
    estim_lt_df["term"] = "long-term"
    estim_lt_df = estim_lt_df.loc[
        (estim_lt_df.ecode != "inj_war_warterror") &
        (estim_lt_df.ecode != "inj_war_execution") &
        (estim_lt_df.ecode != "inj_disaster")]

    # append short-term and long-term prevalence datasets together
    estim_df = estim_st_df.append(estim_lt_df)

    # these columns are unnecessary
    estim_df.drop(['prob_draw_', 'inpatient'], axis=1, inplace=True)

    # injury-ize based on the gbd requirements
    estim_df = injize(estim_df, dim, cv, measure=5, years=years,
                      fixage=True, codes=codes, shock=False, ndraws=ndraws)
    return estim_df
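# Illustrative sketch only (not part of the pipeline): get_epi_prev stacks the
# short-term and long-term EN prevalence after removing the war/execution/
# disaster e-codes from the long-term frame, which appear to be handled by the
# annual (shock) stream instead. The helper name and toy frames below are made
# up; only the ecode/term values mirror the real data.
def _example_combine_prevalence():
    import pandas as pd

    short_term = pd.DataFrame({"ecode": ["inj_trans_road"],
                               "term": ["short-term"]})
    long_term = pd.DataFrame({
        "ecode": ["inj_trans_road", "inj_war_warterror", "inj_disaster"],
        "term": ["long-term"] * 3,
    })
    shock_ecodes = ["inj_war_warterror", "inj_war_execution", "inj_disaster"]
    long_term = long_term[~long_term.ecode.isin(shock_ecodes)]
    return pd.concat([short_term, long_term], ignore_index=True)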
def get_annual_prev(dim, cv, years, codes, ndraws):
    # set vars
    location_id = dim.index_dim.get_level("location_id")[0]
    sex_id = dim.index_dim.get_level("sex_id")[0]
    year_id = dim.index_dim.get_level("year_id")[0]

    # get short-term annual results
    annual_st = SuperGopher({'file_pattern': 'FILEPATH'},
                            os.path.join('FILEPATH'))
    annual_st_df = annual_st.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=dim.index_dim.get_level("year_id"),
        sex_id=dim.index_dim.get_level("sex_id"))
    annual_st_df["term"] = "short-term"

    # get long-term annual results
    annual_lt = SuperGopher({'file_pattern': 'FILEPATH'},
                            os.path.join('FILEPATH'))
    annual_lt_df = annual_lt.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=dim.index_dim.get_level("year_id"),
        sex_id=dim.index_dim.get_level("sex_id"))
    annual_lt_df["term"] = "long-term"

    # combine them in one data frame
    annual_df = annual_st_df.append(annual_lt_df)

    # drop columns that are not needed downstream
    annual_df.drop(['inpatient', 'prob_draw_', 'term'], axis=1, inplace=True)

    # injury-ize based on the gbd requirements
    annual_df = injize(annual_df, dim, cv, measure=5, years=years,
                       fixage=True, codes=codes, shock=True, ndraws=ndraws)
    return annual_df
def _get_short_term_EN_annual(self, dim):
    # get non interpolated values
    annual_sg = SuperGopher(
        {'file_pattern': '{location_id}/ylds_{year_id}_{sex_id}.dta'},
        os.path.join("filepath", "FILEPATH"))
    annual_df = annual_sg.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=dim.index_dim.get_level("year_id"),
        sex_id=dim.index_dim.get_level("sex_id"))

    # clean data
    annual_df = annual_df.merge(self.como_version.cause_list,
                                left_on="ecode", right_on="acause")
    annual_df = annual_df.merge(self.como_version.ncode_hierarchy,
                                left_on="ncode", right_on="rei")
    annual_df["age"] = annual_df["age"].round(2).astype(str)
    ridiculous_am = {
        '0.0': 2, '0.01': 3, '0.1': 4, '1.0': 5, '5.0': 6, '10.0': 7,
        '15.0': 8, '20.0': 9, '25.0': 10, '30.0': 11, '35.0': 12,
        '40.0': 13, '45.0': 14, '50.0': 15, '55.0': 16, '60.0': 17,
        '65.0': 18, '70.0': 19, '75.0': 20, '80.0': 30, '85.0': 31,
        '90.0': 32, '95.0': 235
    }
    annual_df["age"] = annual_df["age"].replace(ridiculous_am).astype(int)
    annual_df.rename(columns={"age": "age_group_id"}, inplace=True)

    # transform to rate
    annual_df = transform_metric(annual_df, 3, 1)

    # collapse inpatient
    annual_df = annual_df.groupby([
        "location_id", "year_id", "age_group_id", "sex_id", "cause_id",
        "rei_id"
    ]).sum().reset_index()

    # fill demographics
    gbdizer = gbdize.GBDizeDataFrame(dim)
    annual_df = gbdizer.add_missing_index_cols(annual_df)
    annual_df = gbdizer.gbdize_any_by_dim(annual_df, "age_group_id")
    annual_df.fillna(0, inplace=True)

    # resample if necessary
    annual_df = self.resample_if_needed(annual_df, dim, gbdizer)
    return annual_df
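# Illustrative sketch only (not part of the pipeline): the ridiculous_am
# lookup above converts GBD age-start values (0, 0.01, 0.1, 1, 5, ..., 95)
# into age_group_ids via round -> stringify -> replace. The helper name and
# the toy frame below are made up; the mapping entries match the ones above.
def _example_age_start_to_age_group_id():
    import pandas as pd

    df = pd.DataFrame({"age": [0.0, 0.01, 0.1, 1.0, 80.0, 95.0]})
    mapping = {'0.0': 2, '0.01': 3, '0.1': 4, '1.0': 5, '80.0': 30,
               '95.0': 235}
    df["age_group_id"] = (
        df["age"].round(2).astype(str).replace(mapping).astype(int))
    return df  # 0.0 -> 2, 0.01 -> 3, 0.1 -> 4, 1.0 -> 5, 80.0 -> 30, 95.0 -> 235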
def _get_short_term_EN_estimation(self, dim):
    # get non interpolated values
    estim_sg = SuperGopher(
        {'file_pattern': '{location_id}/ylds_{year_id}_{sex_id}.dta'},
        os.path.join("filepath", "03_outputs/01_draws/ylds"))
    years = list(
        set(
            cap_val(dim.index_dim.levels.year_id,
                    [1990, 1995, 2000, 2005, 2010, 2016]) + [2005]))
    estim_df = estim_sg.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=years,
        sex_id=dim.index_dim.get_level("sex_id"))

    # clean data
    estim_df = estim_df.merge(self.como_version.cause_list,
                              left_on="ecode", right_on="acause")
    estim_df = estim_df.merge(self.como_version.ncode_hierarchy,
                              left_on="ncode", right_on="rei")
    estim_df["age"] = estim_df["age"].round(2).astype(str)
    ridiculous_am = {
        '0.0': 2, '0.01': 3, '0.1': 4, '1.0': 5, '5.0': 6, '10.0': 7,
        '15.0': 8, '20.0': 9, '25.0': 10, '30.0': 11, '35.0': 12,
        '40.0': 13, '45.0': 14, '50.0': 15, '55.0': 16, '60.0': 17,
        '65.0': 18, '70.0': 19, '75.0': 20, '80.0': 30, '85.0': 31,
        '90.0': 32, '95.0': 235
    }
    estim_df["age"] = estim_df["age"].replace(ridiculous_am).astype(int)
    estim_df.rename(columns={"age": "age_group_id"}, inplace=True)

    # transform to rate
    estim_df = transform_metric(estim_df, 3, 1)

    # collapse inpatient
    estim_df = estim_df.groupby([
        "location_id", "year_id", "age_group_id", "sex_id", "cause_id",
        "rei_id"
    ]).sum().reset_index()

    # fill demographics
    data_cols = ["draw_{}".format(i) for i in range(1000)]
    gbdizer = gbdize.GBDizeDataFrame(dim)
    estim_df = gbdizer.add_missing_index_cols(estim_df)
    estim_df = gbdizer.gbdize_any_by_dim(estim_df, "age_group_id")
    estim_df.fillna(0, inplace=True)

    if gbdizer.missing_values(estim_df, "year_id"):
        estim_df = gbdizer.fill_year_by_interpolating(
            df=estim_df,
            rank_df=estim_df[estim_df["year_id"] == 2005],
            data_cols=data_cols)
    estim_df = estim_df[estim_df.year_id.isin(
        dim.index_dim.get_level("year_id"))]

    # resample if necessary
    estim_df = self.resample_if_needed(estim_df, dim, gbdizer)
    return estim_df
def read_inputs(self):
    """get como draws for a single modelable_entity/model_version"""
    print('Reading draws for (meid, mvid): ({}, {})'.format(
        self.meid, self.mvid))
    if self.super_gopher is None:
        self.super_gopher = SuperGopher.auto(self.meid_data_dir)

    all_draws = []
    reference_draws = []
    missing_dim_q = []
    for dimensions in self.dimensions_q:
        gbdizer = gbdize.GBDizeDataFrame(dimensions)
        try:
            draws = self.super_gopher.content(
                location_id=dimensions.index_dim.get_level("location_id"),
                year_id=dimensions.index_dim.get_level("year_id"),
                sex_id=dimensions.index_dim.get_level("sex_id"),
                measure_id=dimensions.index_dim.get_level("measure_id"),
                age_group_id=dimensions.index_dim.get_level("age_group_id"))
        except InvalidFilter:
            draws = pd.DataFrame(columns=dimensions.index_names)

        if not draws.empty:
            # gbdize. aka fill in missing dimensions
            draws = self.gbdize_dimensions(draws, gbdizer)

            # keep a copy of all 1000 draws for interpolation
            reference_draws.append(draws)

            # resample
            draws = self.resample_if_needed(draws, dimensions, gbdizer)

        if len(draws) != dimensions.total_cardinality:
            missing = self.missing_dimensions(draws, dimensions)
            missing_dim_q.append(missing)
        all_draws.append(draws)

    # prep for interpolation of missing demographics
    if len(reference_draws) > 0:
        reference_draws = pd.concat(reference_draws)
    else:
        reference_draws = pd.DataFrame(columns=dimensions.index_names)

    missing_dim_q = list(flatten(missing_dim_q))
    for dimensions in missing_dim_q:
        gbdizer = gbdize.GBDizeDataFrame(dimensions)
        interp_draws, rank_df = self.get_interpolation_draws(
            reference_draws, dimensions)

        if not interp_draws.empty:
            # gbdize. aka fill in missing dimensions
            interp_draws = self.gbdize_dimensions(interp_draws, gbdizer)
            rank_df = self.gbdize_dimensions(rank_df, gbdizer)

            # case where years are stored as floats, breaks interpolate
            interp_draws['year_id'] = interp_draws['year_id'].astype(int)

            try:
                data_cols = ["draw_{}".format(i) for i in range(1000)]
                interp_draws = gbdizer.fill_year_by_interpolating(
                    interp_draws, rank_df, data_cols)
            except MissingGBDemographics:
                print(
                    "(meid: {meid}, mvid: {mvid}) "
                    "Could not interpolate for years: {years}, "
                    "measure: {meas} "
                    "location_id: {loc} "
                    "sex_id: {sex}".format(
                        meid=self.meid,
                        mvid=self.mvid,
                        years=dimensions.index_dim.get_level("year_id"),
                        meas=dimensions.index_dim.get_level("measure_id"),
                        loc=dimensions.index_dim.get_level("location_id"),
                        sex=dimensions.index_dim.get_level("sex_id")))
                interp_draws = self.gbdize_dimensions(
                    interp_draws, gbdizer, "year_id")

            # append draws to reference
            reference_draws = reference_draws.append(interp_draws,
                                                     ignore_index=True)
            draws = interp_draws.loc[interp_draws['year_id'].isin(
                dimensions.index_dim.get_level('year_id'))]

            # resample
            draws = self.resample_if_needed(draws, dimensions, gbdizer)
            all_draws.append(draws)

        # if dimensions overlap, drop duplicates from reference draws
        reference_draws.drop_duplicates(subset=dimensions.index_names,
                                        inplace=True)

    # concatenate all the results
    draws = pd.concat(all_draws)

    # in case dimensions overlap, drop duplicates
    draws.drop_duplicates(inplace=True)
    draws['modelable_entity_id'] = self.meid
    return draws.reset_index(drop=True)
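# Illustrative sketch only (not part of the pipeline): when requested years
# are missing from the stored draws, read_inputs relies on
# gbdizer.fill_year_by_interpolating. That helper is not reproduced here; the
# snippet below only shows the general idea -- filling a missing year by
# interpolating draw columns between the two surrounding modeled years --
# with a made-up helper name and made-up numbers.
def _example_interpolate_missing_year():
    import pandas as pd

    draws = pd.DataFrame({
        "year_id": [2010, 2016],
        "draw_0": [0.10, 0.22],
    }).set_index("year_id")
    # reindex to add the missing year, then interpolate along the index values
    filled = draws.reindex([2010, 2013, 2016]).interpolate(method="index")
    return filled  # draw_0 for 2013 -> 0.16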