def main(ecode, year_id, sex_id, decomp, version):
    """Compute and write unsplit short-term incidence for one ecode/year/sex.

    Fetches the measure draws through ``get_measures``, converts the
    incidence, remission and EMR tables to xarray structures, loads the
    outpatient coefficients for the ecode keyed by ``age_group_id``, and
    hands the adjusted incidence to ``write_results``.
    """
    start = time.time()

    measures = get_measures(ecode, year_id, sex_id, decomp, version)
    x_inc, x_rem, x_emr = (
        etl.df_to_xr(measures[key], wide_dim_name='draw', fill_value=np.nan)
        for key in ('incidence', 'remission', 'emr')
    )

    # One coefficient per age group, read from the ecode-specific csv.
    coeff_table = pd.read_csv(paths.OUTPATIENT_COEFFICIENTS / f"{ecode}.csv")
    age_to_out_coeff = dict(
        zip(coeff_table.age_group_id, coeff_table.out_coeff)
    )

    sys.stdout.flush()

    adjusted_inc = calculate_measures.short_term_incidence_unsplit(
        x_inc, x_rem, x_emr, age_to_out_coeff
    )
    write_results(adjusted_inc, ecode, decomp, version, year_id, sex_id)

    # Elapsed wall-clock seconds; not consumed anywhere in the visible code.
    total = time.time() - start
def create_age_sex_xarray():
    """Build, save and return the combined age-sex migration pattern.

    Every location in ``WPP_LOCATION_IDS`` is assigned either the Qatar
    pattern (when the location is in ``QATAR_LOCS``) or the Eurostat pattern,
    expanded with a ``location_id`` dimension, and the per-location arrays are
    concatenated along ``location_id``.

    Returns:
        The concatenated pattern, which is also written to
        ``combined_age_sex_pattern.nc`` under the migration pattern directory.
    """
    LOGGER.debug("Creating xarray of age-sex patterns for migration")

    # Load both source patterns and convert them to xarrays.
    qatar = df_to_xr(pd.read_csv(QATAR_PATTERN), dims=PATTERN_ID_VARS)
    eurostat = df_to_xr(pd.read_csv(EUROSTAT_PATTERN), dims=PATTERN_ID_VARS)

    # One array per location, each tagged with its own location_id.
    all_locs_xr_list = [
        expand_dimensions(
            qatar if loc in QATAR_LOCS else eurostat, location_id=[loc]
        )
        for loc in WPP_LOCATION_IDS
    ]
    result = xr.concat(all_locs_xr_list, dim='location_id')

    LOGGER.debug("Saving age-sex pattern xarray")
    # NOTE(review): gbd_round_id is not defined in this function; presumably a
    # module-level name - confirm.
    pattern_dir = FBDPath(f'/{gbd_round_id}/future/migration/'
                          f'{PATTERN_VERSION}')
    pattern_path = pattern_dir / "combined_age_sex_pattern.nc"
    # Save the array that was just built. The previous code passed an
    # undefined name (`pattern`) here instead of `result`.
    save_xr(result, pattern_path, metric="percent", space="identity")
    LOGGER.debug("Saved age-sex pattern xarray")

    return result
def smr(ncode):
    """Return 1000 draws of the SMR for ``ncode`` as an xarray structure.

    Rows are read from the SMR input csv, filtered to the requested ncode
    ("N48" is looked up under the "N9" rows and then relabelled), and draws
    are sampled from a normal distribution with a fixed seed. Draw values
    below 1 are raised to 1.
    """
    table = pd.read_csv(
        os.path.join(paths.INPUT_DIR, 'FILEPATH.csv')
    ).drop('name', axis=1)

    lookup_code = "N9" if ncode == "N48" else ncode
    table = table.loc[table["ncode"] == lookup_code]

    # Presumably UL/LL bound a 95% interval (3.92 = 2 * 1.96) - TODO confirm.
    table["se"] = (table["UL"] - table["LL"]) / 3.92
    table["ncode"] = ncode

    # A fresh 0..n-1 index is required so the sampled draws, which arrive
    # with a default index, align row-for-row on assignment.
    table.reset_index(drop=True, inplace=True)

    np.random.seed(659177)
    sampled = np.random.normal(
        table['SMR'], table['se'], size=(1000, len(table))
    )
    table[help.drawcols()] = pd.DataFrame(sampled.T)

    table.drop(['SMR', 'UL', 'LL', 'se'], inplace=True, axis=1)
    table = help.convert_to_age_group_id(table, collapsed_0=False)
    table.set_index(['ncode', 'age_group_id'], inplace=True)

    # Floor every value at 1.
    table[table < 1] = 1

    return etl.df_to_xr(table, wide_dim_name='draw', fill_value=np.nan)
def save_mortality(ecode, year_id, sex_id, locs, ages, version):
    """Pull best CODEm draws for an ecode and save them as a netCDF file.

    The draw columns are divided by the ``pop`` column, bookkeeping columns
    are dropped, and the result is indexed by location/year/sex/age before
    conversion to an xarray structure.

    Args:
        ecode: injury ecode; mapped to a cause id via ``help.get_cause``.
        year_id: year(s) passed to ``get_draws``.
        sex_id: sex id(s) passed to ``get_draws``.
        locs: location ids passed to ``get_draws``.
        ages: age group ids passed to ``get_draws``.
        version: not used in the visible body.
    """
    cause_id = help.get_cause(ecode)
    draws = gd.get_draws(
        gbd_id_type="cause_id",
        gbd_id=cause_id,
        location_id=locs,
        year_id=year_id,
        sex_id=sex_id,
        age_group_id=ages,
        status="best",
        source="codem",
        gbd_round_id=help.GBD_ROUND
    )

    # Divide every draw column row-wise by the population column.
    draws[help.drawcols()] = draws[help.drawcols()].divide(draws['pop'], axis=0)
    draws.drop(
        ['pop', 'envelope', 'cause_id', 'sex_name', 'measure_id', 'metric_id'],
        axis=1, inplace=True)
    draws.set_index(
        ['location_id', 'year_id', 'sex_id', 'age_group_id'], inplace=True)
    mort = etl.df_to_xr(draws, wide_dim_name='draw', fill_value=np.nan)

    filename = "FILEPATH.nc".format(str(year_id), str(sex_id))
    folder = os.path.join("FILEPATH")
    # exist_ok=True tolerates a concurrent job creating the directory first.
    # The previous EEXIST check went through `os.errno`, which no longer
    # exists on Python 3.7+ and raised AttributeError inside the handler.
    os.makedirs(folder, exist_ok=True)

    filepath = os.path.join(folder, filename)
    print("Writing mortality")
    mort.to_netcdf(filepath)
def en_matrix(ecode, sex_id, platform):
    """Load the E-N matrix for an ecode, sex and platform as an xarray.

    The csv is chosen by platform ("inp" for inpatient, otherwise "otp") and
    by ecode; any ecode whose parent is ``inj_poisoning`` uses the parent's
    file. The result is indexed by ncode, platform, high_income, sex_id and
    age_group_id, with missing cells filled with 0.
    """
    short_plat = "inp" if platform == "inpatient" else "otp"
    uses_parent_file = inj_info.ECODE_PARENT[ecode] == 'inj_poisoning'
    file_stem = 'inj_poisoning' if uses_parent_file else ecode

    en_mat_dir = "FILEPATH"
    matrix = pd.read_csv(
        os.path.join(en_mat_dir, short_plat, file_stem + ".csv"))

    # Keep the requested sex, then move from ages to age_group_ids.
    matrix = matrix.loc[matrix['sex'] == sex_id]
    matrix = help.expand_under_1(
        help.convert_to_age_group_id(matrix, collapsed_0=True))

    matrix.drop(['ecode'], inplace=True, axis=1)
    column_names = {
        'n_code': 'ncode',
        'inpatient': 'platform',
        'sex': 'sex_id'
    }
    matrix.rename(columns=column_names, inplace=True)

    # The 0/1 inpatient flag becomes a platform label.
    matrix['platform'] = matrix['platform'].replace(
        {1: 'inpatient', 0: 'outpatient'})

    index_cols = ['ncode', 'platform', 'high_income', 'sex_id', 'age_group_id']
    matrix.set_index(index_cols, inplace=True)

    return etl.df_to_xr(matrix, wide_dim_name='draw', fill_value=0)
def combine_and_save_mig(version):
    """Combine per-location migration csvs into one array and save it.

    Args:
        version (str): The version of migration to combine and save.

    Returns:
        xarray.DataArray: The migration data for all locations, concatenated
            along ``location_id``.
    """
    LOGGER.debug("Combining migration csvs to xarray")

    # One converted array per entry of WPP_LOCATION_IDS.
    per_location = [
        df_to_xr(pd.read_csv(f'filepath'), dims=ID_VARS)
        for _loc in WPP_LOCATION_IDS
    ]

    combined = xr.concat(per_location, dim='location_id')

    # Persist to the forecasting directory before handing the array back.
    combined.to_netcdf(f'filepath')
    return combined
def save_mortality(ecode, year_id, sex_id, locs, ages, decomp, version):
    """Pull best CODEm draws for an ecode and save them as a netCDF file.

    The draw columns are divided by the ``pop`` column, bookkeeping columns
    are dropped, and the result is written to
    ``<DATA_DIR>/<decomp>/<parent ecode>/<version>/mortality_for_shocks/``
    as ``mort_<year_id>_<sex_id>.nc``.

    Args:
        ecode: injury ecode; mapped to a cause id via ``help.get_cause``.
        year_id: year(s) passed to ``get_draws`` and used in the file name.
        sex_id: sex id(s) passed to ``get_draws`` and used in the file name.
        locs: location ids passed to ``get_draws``.
        ages: age group ids passed to ``get_draws``.
        decomp: decomposition step; passed to ``get_draws`` and used as a
            path component.
        version (str): run version; trailing whitespace is stripped before it
            is used as a path component.
    """
    cause_id = help.get_cause(ecode)
    draws = gd.get_draws(
        gbd_id_type="cause_id",
        gbd_id=cause_id,
        location_id=locs,
        year_id=year_id,
        sex_id=sex_id,
        age_group_id=ages,
        status="best",
        source="codem",
        gbd_round_id=help.GBD_ROUND,
        decomp_step=decomp
    )

    # Divide every draw column row-wise by the population column.
    draws[help.drawcols()] = draws[help.drawcols()].divide(draws['pop'], axis=0)
    draws.drop(
        ['pop', 'envelope', 'cause_id', 'sex_name', 'measure_id', 'metric_id'],
        axis=1, inplace=True)
    draws.set_index(
        ['location_id', 'year_id', 'sex_id', 'age_group_id'], inplace=True)
    mort = etl.df_to_xr(draws, wide_dim_name='draw', fill_value=np.nan)

    filename = "mort_{}_{}.nc".format(str(year_id), str(sex_id))
    version = version.rstrip()
    folder = os.path.join(paths.DATA_DIR, decomp, inj_info.ECODE_PARENT[ecode],
                          version, 'mortality_for_shocks')
    # exist_ok=True tolerates a concurrent job creating the directory first.
    # The previous EEXIST check went through `os.errno`, which no longer
    # exists on Python 3.7+ and raised AttributeError inside the handler.
    os.makedirs(folder, exist_ok=True)

    filepath = os.path.join(folder, filename)
    mort.to_netcdf(filepath)
def main(ecode, year_id, sex_id, version):
    """Compute and write unsplit short-term incidence for one ecode/year/sex.

    Resolves the modelable entity for the ecode, fetches its measure draws,
    converts incidence, remission and EMR to xarray structures, applies the
    outpatient covariate, writes the result and prints the elapsed time.
    """
    started = time.time()

    me_id = help.get_me(ecode)
    measures = get_measures(ecode, me_id, year_id, sex_id, version)
    x_inc, x_rem, x_emr = (
        etl.df_to_xr(measures[key], wide_dim_name='draw', fill_value=np.nan)
        for key in ('incidence', 'remission', 'emr')
    )

    otp_cov = outpatient_cov(me_id, help.drawcols())
    adjusted_inc = calculate_measures.short_term_incidence_unsplit(
        x_inc, x_rem, x_emr, otp_cov)
    write_results(adjusted_inc, ecode, version, year_id, sex_id)

    total = time.time() - started
    print("Total time was {} seconds".format(total))
def output_to_xarray(gbd_round, out, version_out):
    """Convert the ASFR dataframe to an xarray and save it as ``asfr.nc``.

    Args:
        gbd_round: GBD round component of the output path.
        out: dataframe holding the columns listed in ``dims`` below.
        version_out: version component of the output path.
    """
    version_dir = FBDPath(
        "/{gri}/future/asfr/{version}".format(
            gri=gbd_round, version=version_out))

    dims = [
        'location_id',
        'year_id',
        'scenario',
        'age_group_id',
        'sex_id',
        'draw',
    ]
    asfr_da = df_to_xr(out, dims=dims)

    # NOTE(review): the `version` metadata is the literal string "version",
    # not `version_out` - confirm that this is intended.
    save_xr(
        asfr_da,
        fbdpath=version_dir / "asfr.nc",
        metric="rate",
        space="identity",
        version="version",
        model="asfr_adjusted_to_tfr_plus_point1_if_below2",
    )
def get_shock_mort(ecode, pops, locs, ages, year_id, sex_id, decomp):
    """Return shock mortality for an ecode, divided by population.

    Draws are pulled from CODEm for the model version registered in
    ``model_versions[ecode][sex_id]``. For ``inj_war_execution`` the rows for
    age groups 2 and 3 are replaced with all-zero copies of the age group 4
    rows. The ``decomp`` argument is not used in the visible body; the decomp
    step comes from ``model_versions['decomp']``.
    """
    cause_id = help.get_cause(ecode)
    draws = gd.get_draws(
        gbd_id_type="cause_id",
        gbd_id=cause_id,
        version_id=model_versions[ecode][sex_id],
        location_id=locs,
        year_id=year_id,
        age_group_id=ages,
        measure_id=1,
        source="codem",
        gbd_round_id=help.GBD_ROUND,
        decomp_step=model_versions['decomp'],
    )

    if ecode == 'inj_war_execution':
        # Discard whatever came back for age groups 2 and 3 ...
        draws = draws.loc[~draws.age_group_id.isin([2, 3]), ]
        # ... and rebuild each from the age group 4 rows with zeroed draws.
        # NOTE(review): DataFrame.append was removed in pandas 2.0; confirm
        # the pinned pandas version before upgrading.
        for young_age in (2, 3):
            zeroed = draws[draws['age_group_id'] == 4]
            id_cols = [name for name in zeroed.columns if 'draw_' not in name]
            # With the id columns in the index, only draw columns are zeroed.
            zeroed.set_index(id_cols, inplace=True)
            zeroed[:] = 0
            zeroed = zeroed.reset_index()
            zeroed['age_group_id'] = young_age
            draws = draws.append(zeroed)

    draws.drop(['cause_id', 'measure_id', 'metric_id', 'sex_name'],
               axis=1, inplace=True)
    draws.set_index(['location_id', 'year_id', 'sex_id', 'age_group_id'],
                    inplace=True)

    deaths = etl.df_to_xr(draws, wide_dim_name='draw', fill_value=np.nan)
    # Dividing by population puts the result in rate space.
    return deaths / pops['population']
def write_results(df, ecode, ncode, platform, year, version):
    """Write the results that are appended from the DisMod ODE to netCDF.

    Args:
        df: dataframe of results; converted to age_group_ids and indexed by
            location/year/sex/age/platform before being written.
        ecode: not used in the visible body.
        ncode: not used in the visible body.
        platform: not used in the visible body.
        year: formatted into the output file name.
        version: not used in the visible body.
    """
    out_dir = os.path.join("FILEPATH")
    # exist_ok=True tolerates a concurrent job creating the directory first.
    # The previous EEXIST check went through `os.errno`, which no longer
    # exists on Python 3.7+ and raised AttributeError inside the handler.
    os.makedirs(out_dir, exist_ok=True)

    filename = "FILEPATH.nc".format(year)
    filepath = os.path.join(out_dir, filename)

    df = help.convert_to_age_group_id(df)
    df.set_index(
        ['location_id', 'year_id', 'sex_id', 'age_group_id', 'platform'],
        inplace=True)
    arr = etl.df_to_xr(df, wide_dim_name='draw', fill_value=np.nan)
    arr.to_netcdf(filepath)
def outpatient_cov(me_id, drawcols):
    """Return the exponentiated outpatient covariate, indexed by draw.

    Looks up the best DisMod model version for ``me_id``, reads the
    ``beta_incidence_x_s_outpatient`` column from that version's csv,
    exponentiates it, keeps the last 1000 rows and labels them with
    ``drawcols``.
    """
    # Best model version from the DisMod panda cascade.
    best_versions = db.get_best_model_versions(
        entity='modelable_entity',
        ids=me_id,
        status='best',
        gbd_round_id=help.GBD_ROUND)
    model_version = best_versions['model_version_id'].iloc[0].astype(str)

    filepath = os.path.join('FILEPATH', model_version, 'FILEPATH.csv.gz')
    betas = pd.read_csv(filepath)['beta_incidence_x_s_outpatient']

    cov_frame = pd.DataFrame(np.exp(betas)[-1000:])
    cov_frame['draw'] = drawcols
    cov_frame.set_index('draw', inplace=True)

    return etl.df_to_xr(cov_frame)
def read_to_xr(location_id, ccf_asfr_fbd_path, dims):
    """Load one location's asfr csv and return it as an xr.DataArray.

    Args:
        location_id (int): location id; also the stem of the csv file name.
        ccf_asfr_fbd_path (FBDPath): folder path within ccf stage. Contains an
            "asfr_single_year" or "asfr" sub-folder, where location-specific
            csv files are stored.
        dims (list[str]): list of dims expected from upstream asfr csv files.

    Returns:
        (xr.DataArray): location-specific asfr, not yet intercept-shifted.
    """
    csv_path = ccf_asfr_fbd_path / f"{location_id}.csv"
    asfr_df = pd.read_csv(csv_path)
    asfr_da = df_to_xr(asfr_df, dims=dims)
    # Select the single requested location from the converted array.
    return asfr_da.sel(location_id=location_id)
def en_matrix(ecode, sex_id, platform):
    """Return the e/n matrix used to split an ecode into the 47 ncodes.

    Matrices vary by ecode, platform, sex, age and high income / not high
    income (the super region) countries. The result is indexed by ncode,
    platform, high_income, sex_id and age_group_id, with missing cells
    filled with 0.
    """
    short_plat = "inp" if platform == "inpatient" else "otp"

    # Children of inj_poisoning share the parent's matrix.
    is_poisoning_child = inj_info.ECODE_PARENT[ecode] == 'inj_poisoning'
    e = 'inj_poisoning' if is_poisoning_child else ecode

    # Read the matrix file for the specified platform.
    en_mat_dir = "FILEPATH"
    matrix = pd.read_csv(os.path.join(en_mat_dir, "FILEPATH.csv"))

    # Keep the requested sex, then convert ages to age_group_ids (values are
    # copied for the under-1 age groups).
    matrix = matrix.loc[matrix['sex'] == sex_id]
    matrix = help.expand_under_1(
        help.convert_to_age_group_id(matrix, collapsed_0=True))

    # Shape the table for the incidence merge.
    matrix.drop(['ecode'], inplace=True, axis=1)
    column_names = {
        'n_code': 'ncode',
        'inpatient': 'platform',
        'sex': 'sex_id'
    }
    matrix.rename(columns=column_names, inplace=True)
    matrix['platform'] = matrix['platform'].replace(
        {1: 'inpatient', 0: 'outpatient'})

    index_cols = ['ncode', 'platform', 'high_income', 'sex_id', 'age_group_id']
    matrix.set_index(index_cols, inplace=True)

    return etl.df_to_xr(matrix, wide_dim_name='draw', fill_value=0)
def write_results(df, ecode, ncode, platform, year, decomp, version):
    """Write ODE results for one ecode/ncode/platform/year to netCDF.

    Ensures the output directory
    ``<DATA_DIR>/<decomp>/<parent ecode>/<version>/ode/<ecode>/<ncode>/<platform>``
    exists, converts ``df`` to an xarray structure, and writes it to the path
    returned by ``write_path``.

    Args:
        df: dataframe of results; converted to age_group_ids and indexed by
            location/year/sex/age/platform before being written.
        ecode: injury ecode.
        ncode: injury ncode.
        platform (str): platform name; used as a path component.
        year: year passed on to ``write_path``.
        decomp: decomposition step; used as a path component.
        version (str): run version; trailing whitespace is stripped.
    """
    version = version.rstrip()
    out_dir = os.path.join(paths.DATA_DIR, decomp,
                           inj_info.ECODE_PARENT[ecode], str(version), "ode",
                           str(ecode), str(ncode), platform)
    # exist_ok=True tolerates a concurrent job creating the directory first.
    # The previous EEXIST check went through `os.errno`, which no longer
    # exists on Python 3.7+ and raised AttributeError inside the handler.
    os.makedirs(out_dir, exist_ok=True)

    df = help.convert_to_age_group_id(df)
    df.set_index(
        ['location_id', 'year_id', 'sex_id', 'age_group_id', 'platform'],
        inplace=True)
    arr = etl.df_to_xr(df, wide_dim_name='draw', fill_value=np.nan)

    filepath = write_path(ecode, ncode, platform, year, decomp, version)
    arr.to_netcdf(filepath)
import numpy as np
import os

# Load the treated and untreated disability weights.
dw_folder = os.path.join("FILEPATH")
untreat_dw = pd.read_csv(os.path.join(dw_folder, 'FILEPATH.csv'))
treated_dw = pd.read_csv(os.path.join(dw_folder, 'FILEPATH.csv'))

# Rename 'draw<N>' columns to 'draw_<N>' and 'n_code' to 'ncode'.
coldict = {'draw' + str(n): 'draw_' + str(n) for n in range(1000)}
coldict['n_code'] = 'ncode'
untreat_dw.rename(columns=coldict, inplace=True)
treated_dw.rename(columns=coldict, inplace=True)

# Index both tables by ncode and convert them to xarray structures.
untreat_dw.set_index('ncode', inplace=True)
treated_dw.set_index('ncode', inplace=True)
u_dw = etl.df_to_xr(untreat_dw, wide_dim_name='draw', fill_value=np.nan)
t_dw = etl.df_to_xr(treated_dw, wide_dim_name='draw', fill_value=np.nan)
dems = db.get_demographics(gbd_team='epi', gbd_round_id=help.GBD_ROUND)

# Get the percent treated in each country-year, and weight the treated and
# untreated dws by it to get the total dw.
p_t = calculate_measures.pct_treated()
dw = t_dw * p_t + u_dw * (1 - p_t)

# Load in split proportions for spinal cord injuries and find the weighted
# average disability weight among the 4 splits.
# n_parent maps each ncode to its first three characters.
n_parent = pd.Series(index=treated_dw.index, data=[n[0:3] for n in treated_dw.index], name='ncode_parent')
spinal_split_folder = 'FILEPATH'
# Maps 'prop_<draw column>' names back to the plain draw column names.
drawdict = {'prop_' + d: d for d in help.drawcols()}
split_props_list = []
def disability_weights_st():
    """Read the disability-weight csv and return it as an xarray structure.

    The table is indexed by ``ncode`` and its remaining columns are folded
    into a ``draw`` dimension, with missing cells filled with NaN.
    """
    folder = 'FILEPATH'
    csv_path = os.path.join(folder, 'FILEPATH.csv')
    weights = pd.read_csv(csv_path).set_index('ncode')
    return etl.df_to_xr(weights, wide_dim_name='draw', fill_value=np.nan)
def get_gbd_paf(acause, rei, cache_version, gbd_round_id, sex_ids,
                location_ids, draws, measure_id=4, metric_id=2):
    """Return the GBD cause-risk-specific PAF, using a file cache.

    If a cached file for the cause-risk pair exists under the given paf
    version it is read and subset to ``location_ids``. Otherwise the PAF is
    downloaded with get_draws, cleaned, and saved to that cache location. In
    both cases the result is resampled when its draw count differs from
    ``draws``.

    The gbd paf coming from get_draws has these columns (1009 in total):
    ``rei_id``, ``modelable_entity_id``, ``location_id``, ``year_id``,
    ``age_group_id``, ``sex_id``, ``cause_id``, ``measure_id``,
    ``draw_0`` ... ``draw_999``, ``metric_id``. From it we need to

    1.) use cause_id to slice for the cause-risk pair
    2.) use measure_id (typically 4 for yll) to slice for measure_id
    3.) use metric_id (typically 2 for percent) to slice for metric_id

    Args:
        acause (str): analytical cause.
        rei (str): risk, could also be vaccine intervention.
        cache_version (str): the FBDPath paf version to save the gbd paf in,
            or to read from.
        gbd_round_id (int): gbd round id.
        sex_ids (list): sexes. Typically [1, 2]. Not referenced in the body.
        location_ids (list): locations to get pafs from. Applied only when
            reading from the cache.
        draws (int): number of draws for output file. This means input files
            will be up/down-sampled to meet this criterion.
        measure_id (int, optional): typically the yll measure id (4). At the
            most detailed PAF yll is equivalent to death, so measure_id 4
            works the same as measure_id 1 (death). Empirically, it seems to
            pull data faster if calling with measure_id=4.
        metric_id (int, optional): typically the percent metric (2).

    Returns:
        (xr.DataArray/None): Dataarray with complete demographic indices,
            sans "scenario". None when ``rei`` is a vaccine rei.

    Raises:
        IOError: if get_draws fails for the cause-risk pair.
    """
    # get_draws won't have anything for vaccines.
    if rei in get_vaccine_reis(gbd_round_id):
        return None

    cache_dir = FBDPath(gbd_round_id=gbd_round_id,
                        past_or_future="past",
                        stage="paf",
                        version=cache_version)
    cache_file_fbdpath = cache_dir / (acause + "_" + rei + ".nc")

    if cache_file_fbdpath.exists():
        LOGGER.info("{} already exists. Will read from it for gbd paf.".
                    format(cache_file_fbdpath))
        paf_da = open_xr(cache_file_fbdpath).data
        paf_da = paf_da.sel(location_id=location_ids)
    else:
        # No cache exists, must download & clean.
        rei_id = get_rei_id(rei)
        # Edge case for diarrhea_*: these causes map to a different GBD cause.
        gbd_acause = (CAUSES_NOT_IN_GBD_MAP[acause]
                      if acause in CAUSES_NOT_IN_GBD_MAP else acause)
        cause_id = get_cause_id(gbd_acause)
        gbd_round = get_gbd_round(gbd_round_id)

        try:
            # Only year_id=gbd_round is needed; every other dim is collected
            # in full.
            paf_df = get_draws(gbd_id_type=['cause_id', 'rei_id'],
                               gbd_id=[cause_id, rei_id],
                               source='burdenator',
                               year_id=gbd_round,
                               gbd_round_id=gbd_round_id,
                               measure_id=measure_id,
                               metric_id=metric_id)
        except Exception as exc:
            error_message = "Error in get_draws for {}_{}".format(acause, rei)
            LOGGER.error(error_message)
            raise IOError(str(exc))

        # These id columns are constant for the request and no longer needed.
        unneeded = ["year_id", "rei_id", "cause_id", "measure_id",
                    "metric_id"]
        paf_df = paf_df.drop(columns=unneeded)

        paf_da = df_to_xr(
            paf_df,
            dims=["location_id", "age_group_id", "sex_id"],
            wide_dim_name='draw',
            wide_dim_transform=lambda x: int(x.split('_')[1]),
            fill_value=np.nan)
        paf_da = paf_da.sortby("draw")  # draws don't always come in sorted
        paf_da = _data_cleaning_for_paf(paf_da, acause, rei, "GBD")

        LOGGER.info("Saving downloaded & cleaned {}".
                    format(cache_file_fbdpath))
        save_xr(paf_da, cache_file_fbdpath, metric="percent",
                space="identity", cause_id=cause_id, rei_id=rei_id,
                gbd_round_id=gbd_round_id, year_id=gbd_round,
                measure_id=measure_id, metric_id=metric_id,
                upper_bound=PAF_UPPER_BOUND, lower_bound=PAF_LOWER_BOUND)

    # Up/down-sample to the requested number of draws.
    if len(paf_da["draw"]) != draws:
        paf_da = resample(paf_da, draws)

    return paf_da