def read_lifetable(gbd_round_id, lifetable_version, draws):
    """Read one life table dataset and resample it to ``draws`` draws.

    Args:
        gbd_round_id (int): GBD round used as the first path component.
        lifetable_version (str): Of the form "past/versionname" or
            "future/versionname". A bare "versionname" (no slash) is read
            from "future".
        draws (int): desired number of draws.

    Returns:
        The life table, after ``_drop_point_coordinates`` has been applied.

    Raises:
        SystemExit: with a non-zero status if the file cannot be opened.
        ValueError: if ``lifetable_version`` contains more than one "/".
    """
    if "/" in lifetable_version:
        past_or_future, version = lifetable_version.split("/")
    else:
        # lifetable from the future includes last year of the past.
        past_or_future = "future"
        version = lifetable_version

    lifetable_file = FBDPath("/{}/{}/life_expectancy/{}".format(
        gbd_round_id, past_or_future, version)) / "lifetable_ds.nc"
    try:
        LOGGER.info("Reading {}".format(lifetable_file))
        lifetable = xr.open_dataset(str(lifetable_file))
    except OSError as ose:
        LOGGER.error("Cannot open lifetable {}: {}".format(
            lifetable_file, ose))
        # The site-provided ``exit()`` with no argument ends the process
        # with status 0; a failed read must be reported as a failure.
        raise SystemExit(1) from ose

    if "draw" in lifetable.dims:
        # Sort first so the resampling sees draws in coordinate order.
        lifetable = resample(lifetable.sortby("draw"), draws)
    return _drop_point_coordinates(lifetable)
def load_forecast_pop(gbd_round_id, version, years, draws):
    """Load forecast population data, aggregating it first if necessary.

    The pre-aggregated file ``population_agg.nc`` is preferred. If it cannot
    be read, ``population.nc`` is read instead, aggregated with
    ``Aggregator.aggregate_everything``, and the result is written back as
    ``population_agg.nc`` in the same directory.

    Args:
        gbd_round_id (int): The GBD round ID used to locate the forecast
            population.
        version (str): The version of forecast population to read from.
        years (YearRange): The Forecasting format years to use.
        draws (int): Number of draws the returned array is resampled to.

    Returns:
        xarray.DataArray: The forecast population, restricted to
        ``years.forecast_years`` and resampled to ``draws`` draws.
    """
    forecast_pop_dir = FBDPath(f"/{gbd_round_id}/future/population/{version}")
    try:
        forecast_pop_path = forecast_pop_dir / "population_agg.nc"
        forecast_pop_da = open_xr(forecast_pop_path).data
    except Exception:
        # Need to make agg version. ``Exception`` (rather than a bare
        # ``except``) keeps Ctrl-C and SystemExit from being mistaken for a
        # missing aggregate file.
        forecast_pop_path = forecast_pop_dir / "population.nc"
        forecast_pop_da = open_xr(forecast_pop_path).data
        forecast_pop_da = Aggregator.aggregate_everything(
            forecast_pop_da, gbd_round_id).pop
        forecast_pop_out_path = forecast_pop_dir / "population_agg.nc"
        save_xr(forecast_pop_da, forecast_pop_out_path, metric="number",
                space="identity")

    # slice to correct years and number of draws
    forecast_pop_da = forecast_pop_da.sel(year_id=years.forecast_years)
    forecast_pop_da = resample(forecast_pop_da, draws)
    return forecast_pop_da
def concat_past_future(past_da, forecast_da, draws, years):
    """Join the last past year onto the reference-scenario forecast.

    Args:
        past_da (xarray.DataArray): Past data. Only ``years.past_end`` is
            kept, on the age groups and locations present in ``forecast_da``.
        forecast_da (xarray.DataArray): Forecast data. Only scenario 0 and
            ``years.forecast_years`` are kept; the ``scenario`` coordinate is
            dropped.
        draws (int): Number of draws both pieces are resampled to.
        years (YearRange): Supplies ``past_end`` and ``forecast_years``.

    Returns:
        (xarray.DataArray): Past and future data concatenated along
        ``year_id``.
    """
    # Align the past slice with the forecast's demographic coordinates.
    last_past_year = past_da.sel(
        year_id=years.past_end,
        age_group_id=forecast_da.age_group_id.values,
        location_id=forecast_da.location_id.values,
    )

    reference_forecast = forecast_da.sel(
        year_id=years.forecast_years, scenario=0)
    reference_forecast = reference_forecast.drop("scenario")

    # Both pieces need the same number of draws before they can be joined.
    reference_forecast = resample(reference_forecast, draws)
    last_past_year = resample(last_past_year, draws)

    return xr.concat([last_past_year, reference_forecast], dim="year_id")
def read_sev(rei, sev, vaccine_sev, gbd_round_id, years, draws):
    """Read the SEV for a risk or a vaccine intervention.

    Args:
        rei (str): risk, could also be vaccine intervention.
        sev (str): upstream sev version.
        vaccine_sev (str): upstream vaccine sev version.
        gbd_round_id (int): gbd round id.
        years (YearRange): [past_start, forecast_start, forecast_end] years.
        draws (int): number of draws for output file. This means input files
            will be up/down-sampled to meet this criterion.

    Returns:
        (xr.DataArray): SEV in dataarray form.
    """
    is_vaccine = rei in get_vaccine_reis(gbd_round_id)
    if is_vaccine:
        stage, version, file_name = "vaccine", vaccine_sev, rei + "_new_ref.nc"
    else:
        stage, version, file_name = "sev", sev, rei + ".nc"

    stage_dir = FBDPath(gbd_round_id=gbd_round_id,
                        past_or_future="future",
                        stage=stage,
                        version=version)
    out = open_xr(stage_dir / file_name).data
    if is_vaccine:
        # vaccine treated as anti-risk: the stored value is subtracted
        # from 1.0.
        out = 1.0 - out

    out = conditionally_triggered_transformations(out, gbd_round_id, years)

    # Only resample when the draw count differs from the requested one.
    if len(out["draw"]) == draws:
        return out
    return resample(out, draws)
def get_gbd_paf(acause, rei, cache_version, gbd_round_id, sex_ids,
                location_ids, draws, measure_id=4, metric_id=2):
    """Download and transform the gbd cause-risk-specific PAF.

    If a cached file for this cause-risk pair already exists under
    ``cache_version`` it is read; otherwise the PAF is pulled with
    ``get_draws``, cleaned, and saved to that cache location. In both cases
    the result is subset to ``location_ids`` and resampled to ``draws``.

    The gbd paf coming from get_draws::

        >>> df.columns
        Index([u'rei_id', u'modelable_entity_id', u'location_id',
               u'year_id', u'age_group_id', u'sex_id', u'cause_id',
               u'measure_id', u'draw_0', u'draw_1',
               ...
               u'draw_991', u'draw_992', u'draw_993', u'draw_994',
               u'draw_995', u'draw_996', u'draw_997', u'draw_998',
               u'draw_999', u'metric_id'],
              dtype='object', length=1009)

    where we will need to

    1.) use cause_id to slice for the cause-risk pair
    2.) use measure_id (typically 4 for yll) to slice for measure_id
    3.) use metric_id (typically 2 for percent) to slice for metric_id

    Args:
        acause (str): analytical cause.
        rei (str): risk, could also be vaccine intervention.
        cache_version (str): the FBDPath paf version to save the gbd paf in,
            or to read from.
        gbd_round_id (int): gbd round id
        sex_ids (list): sexes. Typically [1, 2]. Currently not used by this
            function.
        location_ids (list): locations to get pafs from.
        draws (int): number of draws for output file. This means input files
            will be up/down-sampled to meet this criterion.
        measure_id (int, optional): typically the yll measure id (4). At the
            most detailed PAF yll is equivalent to death, so measure_id 4
            works the same as measure_id 1 (death). Empirically, it seems to
            pull data faster if calling with measure_id=4.
        metric_id (int, optional): typically the percent metric (2)

    Returns:
        (xr.DataArray/None): Dataarray with complete demographic indices,
        sans "scenario". ``None`` when ``rei`` is a vaccine intervention.

    Raises:
        IOError: if ``get_draws`` fails; the original exception is chained.
    """
    if rei in get_vaccine_reis(gbd_round_id):
        # get_draws won't have anything for vaccines
        return None

    cache_file_fbdpath = FBDPath(
        gbd_round_id=gbd_round_id,
        past_or_future="past",
        stage="paf",
        version=cache_version) / (acause + "_" + rei + ".nc")

    if cache_file_fbdpath.exists():
        LOGGER.info("{} already exists. Will read from it for gbd paf.".
                    format(cache_file_fbdpath))
        paf_da = open_xr(cache_file_fbdpath).data
    else:
        # no cache exists, must download & clean
        rei_id = get_rei_id(rei)

        if acause in CAUSES_NOT_IN_GBD_MAP:  # edge case for diarrhea_*
            cause_id = get_cause_id(CAUSES_NOT_IN_GBD_MAP[acause])
        else:
            cause_id = get_cause_id(acause)

        gbd_round = get_gbd_round(gbd_round_id)

        try:
            # we only need it for year_id=gbd_round, but for every other dim
            # we collect everything.
            paf_df = get_draws(gbd_id_type=['cause_id', 'rei_id'],
                               gbd_id=[cause_id, rei_id],
                               source='burdenator',
                               year_id=gbd_round,
                               gbd_round_id=gbd_round_id,
                               measure_id=measure_id,
                               metric_id=metric_id)
        except Exception as exc:
            error_message = "Error in get_draws for {}_{}".format(acause, rei)
            LOGGER.error(error_message)
            raise IOError(str(exc)) from exc

        # don't need these no more
        paf_df = paf_df.drop(columns=["year_id", "rei_id", "cause_id",
                                      "measure_id", "metric_id"])

        paf_da = df_to_xr(paf_df,
                          dims=["location_id", "age_group_id", "sex_id"],
                          wide_dim_name='draw',
                          wide_dim_transform=lambda x: int(x.split('_')[1]),
                          fill_value=np.nan)

        paf_da = paf_da.sortby("draw")  # draws don't always come in sorted

        paf_da = _data_cleaning_for_paf(paf_da, acause, rei, "GBD")

        # The cache keeps every downloaded location; subsetting happens
        # after saving so later calls can request other locations.
        LOGGER.info("Saving downloaded & cleaned {}".
                    format(cache_file_fbdpath))
        save_xr(paf_da, cache_file_fbdpath, metric="percent",
                space="identity", cause_id=cause_id, rei_id=rei_id,
                gbd_round_id=gbd_round_id, year_id=gbd_round,
                measure_id=measure_id, metric_id=metric_id,
                upper_bound=PAF_UPPER_BOUND, lower_bound=PAF_LOWER_BOUND)

    # Shared tail: the returned array must not depend on whether the cache
    # file happened to exist, so both paths subset and resample the same way.
    paf_da = paf_da.sel(location_id=location_ids)
    if len(paf_da["draw"]) != draws:
        paf_da = resample(paf_da, draws)
    return paf_da
def read_rrmax(acause, rei, rrmax, vaccine_rrmax, gbd_round_id, years, draws):
    """Read RRmax for one cause-risk (or cause-vaccine) pair.

    Args:
        acause (str): analytical cause.
        rei (str): risk, could also be vaccine intervention.
        rrmax (str): upstream rrmax version.
        vaccine_rrmax (str): upstream vaccine rrmax version.
        gbd_round_id (int): gbd round id.
        years (YearRange): [past_start, forecast_start, forecast_end] years.
        draws (int): number of draws for output file. This means input files
            will be up/down-sampled to meet this criterion.

    Returns:
        (xr.DataArray): RRmax in dataarray form, with the cause dimension
        removed.

    Raises:
        KeyError: if the cause id of ``acause`` is not in the file's cause
            dimension.
    """
    file_name = rei + ".nc"
    if rei in get_vaccine_reis(gbd_round_id):
        # The values stored in these data files are actually not RR, but
        # rather
        # r = Incidence[infection | vax] / Incidence[infection | no vax],
        # interpreted as "percent reduction of diseased cases if vaccinated",
        # and should be r < 1.
        # We compute the actual RR as 1/r.
        vaccine_dir = FBDPath(gbd_round_id=gbd_round_id,
                              past_or_future="future",
                              stage="rrmax",
                              version=vaccine_rrmax)
        infile_fbd_path = vaccine_dir / file_name
    else:
        risk_dir = FBDPath(gbd_round_id=gbd_round_id,
                           past_or_future="past",
                           stage="rrmax",
                           version=rrmax)
        infile_fbd_path = risk_dir / "netcdf" / file_name

    cause_id = get_cause_id(acause)
    out = open_xr(infile_fbd_path).data

    available_causes = out[CAUSE_DIM].values.tolist()
    if cause_id not in available_causes:
        error_message = (
            f"{acause} ({cause_id}) not in {infile_fbd_path}'s cause dim: "
            f"{available_causes}")
        LOGGER.error(error_message)
        raise KeyError(error_message)

    # Keep only this cause and drop the now-scalar cause coordinate.
    out = out.loc[{CAUSE_DIM: cause_id}].drop(CAUSE_DIM)
    out = conditionally_triggered_transformations(out, gbd_round_id, years)

    if rei in get_vaccine_reis(gbd_round_id):
        # NOTE if we switch raw data source to burdenator, this algo might
        # need to change.
        # As mentioned above, this value for vaccine should be < 1.
        # Any value > 1 should be capped.
        # NOTE(review): cells that are already NaN also fail the comparison
        # and therefore end up at PAF_UPPER_BOUND -- confirm that is intended.
        capped = out.where(out <= PAF_UPPER_BOUND).fillna(PAF_UPPER_BOUND)
        out = 1.0 / capped  # as mentioned earlier, we compute RR as 1/r.

    # NOTE some rrmax cell values could be 0, for reasons unclear.
    if len(out["draw"]) == draws:
        return out
    return resample(out, draws)
def read_datasets(asfr_version, gbd_round_id, lifetable_version, pop_version,
                  migration_version, years, srb_version, draws):
    """Read and shape every input dataset.

    This reads files, orders their axes, and ensures that data arrays
    aren't presented as datasets. This enforces rules about how many files
    get read, how they are found, and how they are assembled into the
    incoming data. It doesn't address what the data means.

    Args:
        asfr_version (str): Version string for ASFR
        gbd_round_id (int): GBD Round as an integer
        lifetable_version (list[str]): Lifetable version
        pop_version (str): Population start version
        migration_version (list[str]): Migration version. Only the first
            entry is used; if it cannot be turned into an ``FBDPath`` it is
            tried as a plain filesystem path.
        years (YearRange): years for past and forecast
        srb_version (str): sex ratio at birth version
        draws (int): the number of draws to take from the future versions.

    Returns:
        xr.DataArray: ASFR
        tuple: Either one lifetable file or (past, future).
        xr.DataArray: Starting population
        xr.DataArray: Migration
        xr.DataArray: SRB

    Raises:
        SystemExit: with a non-zero status when an input file cannot be
            opened or the past/future lifetables cannot be told apart.
        AssertionError: when an input lacks the expected dimensions.
        Exception: when no migration file path can be constructed.
    """
    # Do this in a subroutine so its memory can be released.
    # pop etl (pop version is in the past)
    data_read_start = perf_time()
    pop_file = FBDPath("/{}/past/population/{}".format(
        gbd_round_id, pop_version)) / "population.nc"
    try:
        LOGGER.info("Reading {}".format(pop_file))
        pop = xr.open_dataarray(str(pop_file))
        # if there's a draw dimension, take the mean
        if "draw" in pop.dims:
            pop = pop.mean("draw")
    except OSError as ose:
        LOGGER.error("Cannot open pop {}: {}".format(pop_file, ose))
        # Bare ``exit()`` would end the process with status 0.
        raise SystemExit(1) from ose

    # we may or may not have draws for past pops, but we should certainly
    # expect location, age, sex, and year
    assert {"location_id", "year_id", "age_group_id",
            "sex_id"}.issubset(set(pop.dims))
    if len(pop.year_id) > 1:
        pop = pop.loc[{"year_id": years.past_end}]
    else:
        pop = pop.squeeze(dim="year_id")
    assert pop.year_id == years.past_end
    LOGGER.debug("pop {}".format(pop))

    # we like age_group_id to be the last dim to expedite later computation.
    # NOTE(review): the draw mean taken above removes the "draw" dim, so the
    # first branch below looks unreachable -- confirm before relying on it.
    if "draw" in pop.dims:  # if past pop has draws, resample.
        pop = pop.transpose("draw", "location_id", "sex_id", "age_group_id")
        pop = resample(pop, draws)
    else:
        pop = pop.transpose("location_id", "sex_id", "age_group_id")

    if pop.name is None:
        pop.name = "population"

    # asfr etl (draws expected)
    # Rounds below 5 read ASFR from round 5.
    asfr_gbd_round_id = max(gbd_round_id, 5)
    asfr_file = FBDPath("/{}/future/asfr/{}".format(
        asfr_gbd_round_id, asfr_version)) / "asfr.nc"
    try:
        LOGGER.info("Reading {}".format(asfr_file))
        # ASFR is reported per thousand people.
        asfr = xr.open_dataarray(str(asfr_file))
    except OSError as ose:
        LOGGER.error("Cannot open asfr {}: {}".format(asfr_file, ose))
        # The exit here used to be commented out, so execution fell through
        # to the assert below with ``asfr`` unbound.
        raise SystemExit(2) from ose

    assert set(asfr.dims) == {
        "draw", "year_id", "location_id", "scenario", "age_group_id"
    }, "asfr dims {}".format(asfr.dims)
    asfr_lim = asfr.sel(year_id=slice(years.past_end, years.forecast_end + 1))
    if asfr_lim.name is None:
        asfr_lim.name = "asfr"

    asfr_lim = resample(asfr_lim, draws)

    # lifetable etl (draws expected)
    lifetables = [
        read_lifetable(gbd_round_id, lfilename, draws)
        for lfilename in lifetable_version
    ]
    if len(lifetables) > 1:
        # Decide which file is the future one: first by the later final
        # year, then by the later first year.
        lpast, lfuture = (None, None)
        lyears = [llx.year_id.values for llx in lifetables]
        if lyears[0][-1] > lyears[1][-1]:
            lfuture, lpast = lifetables
        elif lyears[1][-1] > lyears[0][-1]:
            lpast, lfuture = lifetables
        elif lyears[0][0] < lyears[1][0]:
            lpast, lfuture = lifetables
        elif lyears[1][0] < lyears[0][0]:
            lfuture, lpast = lifetables
        else:
            LOGGER.error("Cannot figure out which is the future lifetable")
            raise SystemExit(1)

        if years.past_end in lfuture.year_id.values:
            LOGGER.info("All needed years were in the future lifetable. "
                        "Ignoring the past data.")
            lifetable_lim = lfuture.sel(
                year_id=slice(years.past_end, years.forecast_end + 1))
            lifetable_out = (lifetable_lim, )
        else:
            assert years.past_end in lpast.year_id.values
            past_slice = lpast.loc[{"year_id": [years.past_end]}]
            LOGGER.debug("Life past slice {}".format(
                past_slice.year_id.values))
            LOGGER.debug("Life future slice {}".format(
                lfuture.year_id.values))
            lifetable_out = (past_slice, lfuture)
    else:
        lifetable_lim = lifetables[0].sel(
            year_id=slice(years.past_end, years.forecast_end + 1))
        lifetable_out = (lifetable_lim, )

    # migration etl (no draws expected)
    try:
        migration_file = FBDPath("/{}/future/migration/{}".format(
            gbd_round_id, migration_version[0])) / "migration.nc"
    except Exception as path_err:
        if os.path.exists(migration_version[0]):
            migration_file = migration_version[0]
        else:
            # ``migration_file`` is unbound on this path, so report the
            # version string that could not be resolved instead.
            raise Exception("Cannot construct {}".format(
                migration_version[0])) from path_err

    try:
        LOGGER.info("Reading {}".format(migration_file))
        migration = xr.open_dataarray(str(migration_file))
    except OSError as ose:
        LOGGER.error("Cannot open migration {}: {}".format(
            migration_file, ose))
        raise SystemExit(1) from ose

    assert set(("location_id", "age_group_id", "sex_id", "year_id")).\
        issubset(migration.dims)

    # Currently we don't use or make migration scenarios -- if a scenario dim
    # exists for some reason ensure that only reference is used and that the
    # scenario dim is dropped.
    if "scenario" in migration.dims:  # scenario dim
        migration = migration.sel(scenario=0, drop=True)
    elif "scenario" in migration.coords:  # scenario point coord
        migration = migration.drop("scenario")
    else:
        pass  # no scenario dim or point coord

    # if pop has draws, we want migration to have draws as well.
    # this becomes important in _fill_missing_locations().
    if "draw" in pop.dims:
        if "draw" not in migration.dims:
            migration = expand_dimensions(migration, draw=pop["draw"])
        else:
            migration = resample(migration, draws)

        migration = migration.transpose("draw", "location_id", "year_id",
                                        "sex_id", "age_group_id")
    else:  # pop has no "draw", so migration doesn't need it either
        if "draw" in migration.dims:
            migration = migration.mean("draw")

        migration = migration.transpose("location_id", "year_id", "sex_id",
                                        "age_group_id")

    if migration.name is None:
        migration.name = "migration"

    # Use the last past year's all age population proportions to compute
    # regional migration averages to fill in missing data.
    migration_locs_fixed = _clean_migration_locations(
        migration, pop.sum("age_group_id"), gbd_round_id)

    LOGGER.info("Read data Elapsed {}".format(perf_time() - data_read_start))

    # Migration counts drive small nations to zero population.
    # This is a way to ensure we show the trend of health.
    migration_locs_fixed.loc[dict(
        location_id=list(SMALL_NATIONS_ZERO_MIGRATION.values()))] = 0.

    LOGGER.debug("Pop from read years {}".format(pop.year_id.values))

    # Not FBDPath at the moment since it doesn't recognize covariate as a
    # valid stage. May need to change location of files.
    # srb etl (no draws)
    srp_path = FBDPath("/{}/past/sex_ratio_at_birth/{}".format(
        gbd_round_id, srb_version))
    srb_file = srp_path / "sex_ratio_at_birth.nc"
    try:
        LOGGER.info("Reading {}".format(srb_file))
        srb = xr.open_dataarray(str(srb_file))
    except OSError as ose:
        LOGGER.error("Cannot open srb {}: {}".format(srb_file, ose))
        raise SystemExit(1) from ose

    # Subset to last year of past
    srb = srb.sel(year_id=years.past_end)

    return asfr_lim, lifetable_out, pop, migration_locs_fixed, srb
def forecast_edu_main(transform, past_version, forecast_version, pv_version,
                      weight_strategy, gbd_round_id, years,
                      reference_scenario, diff_over_mean, truncate,
                      truncate_quantiles, replace_with_mean, draws, **kwargs):
    """Forecast education with the ARC method and save the result.

    The weight exponent ("omega") is chosen by calling ``weight_strategy``
    on the stored RMSE and ``draws``. It may be a scalar or a one-dimensional
    ``xarray.DataArray`` over ``draw`` whose coordinates match the past data.
    The forecast from ``arc_forecast_education`` is written to
    ``education.nc`` together with the (mean) omega and the strategy name.

    NOTE(review): the three ``FBDPath`` arguments were redacted in this
    copy of the file, so ``past_version``, ``forecast_version`` and
    ``pv_version`` are not referenced here.

    Raises:
        RuntimeError: if the selected omega is neither a scalar nor a valid
            per-draw ``xarray.DataArray``.
    """
    LOGGER.debug("weight strategy: {}".format(weight_strategy.__name__))

    pv_path = FBDPath("".format())  # Path removed for security reasons
    rmse = open_xr(pv_path / "education_arc_weight_rmse.nc").data
    weight_exp = weight_strategy(rmse, draws)
    LOGGER.info("omega selected: {}".format(weight_exp))

    LOGGER.debug("Reading in the past")
    past_path = FBDPath("".format())  # Path removed for security reasons
    past = resample(open_xr(past_path / "education.nc").data, draws)
    past = past.sel(year_id=years.past_years)

    if isinstance(weight_exp, (float, int)):
        # A scalar omega applies to every draw alike.
        extra_dim = None
    else:
        # Otherwise omega must be a per-draw array aligned with the past;
        # the checks run in order and the first failure is reported.
        problem = None
        if not isinstance(weight_exp, xr.DataArray):
            problem = (
                "`omega` must be either a float, an int, or an "
                "xarray.DataArray")
        elif len(weight_exp.dims) != 1 or "draw" not in weight_exp.dims:
            problem = (
                "If `omega` is a xarray.DataArray, then it must have only "
                "1 dim, `draw`")
        elif not weight_exp["draw"].equals(past["draw"]):
            problem = (
                "If `omega` is a xarray.DataArray, then it's `draw` dim "
                "must have the coordinates as `past`")

        if problem is not None:
            LOGGER.error(problem)
            raise RuntimeError(problem)
        extra_dim = "draw"

    forecast = arc_forecast_education(
        past, gbd_round_id, transform, weight_exp, years,
        reference_scenario, diff_over_mean, truncate, truncate_quantiles,
        replace_with_mean, extra_dim=extra_dim)

    forecast_path = FBDPath("".format())

    # File attributes carry a single number: the mean over draws when omega
    # is an array, the value itself otherwise.
    report_omega = (
        float(weight_exp.mean())
        if isinstance(weight_exp, xr.DataArray)
        else weight_exp
    )

    save_xr(forecast, forecast_path / "education.nc", metric="number",
            space="identity", omega=report_omega,
            omega_strategy=weight_strategy.__name__)
    LOGGER.info("education forecasts have saved")