def save_y_star(eps_version, arima_version, years, measure, draws, decay,
                gbd_round_id):
    """Apply random walk and save the output."""
    ds = open_xr(eps_path).data
    try:
        eps_preds = open_xr(f"{mig_dir}/eps_star.nc").data
    except Exception:
        eps_preds = arima_migration(ds, years, draws, decay)
        epsilon_hat_out = mig_dir / "eps_star.nc"
        save_xr(eps_preds, epsilon_hat_out, metric="rate", space="identity")

    # cap residuals between -10 and 10
    # with the current migration forecasts for Syria, Latvia, and Jamaica,
    # the population forecasts to 2100 decrease toward 0; capping the
    # residuals helps keep things more reasonable
    eps_past = eps_preds.sel(year_id=years.past_years)
    eps_preds = eps_preds.sel(year_id=years.forecast_years)
    eps_preds = eps_preds.clip(min=-10, max=10)
    eps_preds = xr.concat([eps_past, eps_preds], dim="year_id")

    pred_path = mig_dir / "mig_hat.nc"
    preds = open_xr(pred_path).data
    preds = preds.sel(year_id=years.years)
    preds = expand_dimensions(preds, draw=range(0, draws))

    y_star = preds + eps_preds
    save_xr(y_star, ystar_out, metric="rate", space="identity")
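# --- Illustrative sketch (not pipeline code) -------------------------------
# A minimal, self-contained example of the capping step above: clip the
# forecast-year residuals to [-10, 10], leave past residuals untouched, and
# add them back onto the point predictions. The function name, dimensions,
# and values are made up for illustration only.
import numpy as np
import xarray as xr


def _cap_and_add_sketch():
    years = np.arange(2010, 2020)
    eps = xr.DataArray(np.random.normal(0, 8, size=(10, 3)),
                       coords={"year_id": years, "draw": range(3)},
                       dims=["year_id", "draw"])
    preds = xr.zeros_like(eps)  # stand-in for the mig_hat predictions
    past_years, forecast_years = years[:5], years[5:]
    eps_past = eps.sel(year_id=past_years)
    eps_fut = eps.sel(year_id=forecast_years).clip(min=-10, max=10)
    eps_capped = xr.concat([eps_past, eps_fut], dim="year_id")
    return preds + eps_capped  # analogue of y_star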
def _expand_5yr_age_groups_to_1yr_ages(da, ages_df):
    """
    Converts 5-year age groups to 1-year ages, by simply repeating the same
    value.

    Args:
        da (xr.DataArray): da with "age_group_id" dim.
        ages_df (pd.DataFrame): df with age group metadata.

    Returns:
        (xr.DataArray): da where "age_group_id" dim is replaced with "age".
    """
    assert "age_group_id" in da.dims, "Missing age_group_id dim"
    for col in ["age_group_id", "age_group_years_start",
                "age_group_years_end"]:
        assert col in ages_df.columns, f"Missing {col} column"
    assert da["age_group_id"].isin(ages_df["age_group_id"]).all(),\
        "Not all age group ids are available in metadata"

    das = []
    for age_group_id in da["age_group_id"].values:
        lower_age = int(
            ages_df.query("age_group_id == @age_group_id")
            ["age_group_years_start"])
        upper_age = int(
            ages_df.query("age_group_id == @age_group_id")
            ["age_group_years_end"])
        sub_da = da.sel(age_group_id=age_group_id).drop("age_group_id")
        sub_da = expand_dimensions(sub_da, age=range(lower_age, upper_age))
        das.append(sub_da)

    return xr.concat(das, dim="age")
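# --- Illustrative sketch (not pipeline code) -------------------------------
# The same "repeat each 5-year value across its single ages" idea, written
# with plain xarray only (the real helper relies on the internal
# expand_dimensions utility). The function name and values are hypothetical;
# age group 8 spans ages 15-19 and age group 9 spans 20-24 as in GBD.
import pandas as pd
import xarray as xr


def _expand_sketch():
    ages_df = pd.DataFrame({"age_group_id": [8, 9],
                            "age_group_years_start": [15, 20],
                            "age_group_years_end": [20, 25]})
    da = xr.DataArray([0.05, 0.10], coords={"age_group_id": [8, 9]},
                      dims=["age_group_id"])
    pieces = []
    for _, row in ages_df.iterrows():
        value = da.sel(age_group_id=row["age_group_id"], drop=True)
        ages = range(int(row["age_group_years_start"]),
                     int(row["age_group_years_end"]))
        pieces.append(value.expand_dims(age=list(ages)))
    return xr.concat(pieces, dim="age")  # one value per single year of age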
def predict(self):
    """
    Generate predictions based on model fit.
    """
    locations = self.dataset[self.y].location_id.values
    ages = self.dataset[self.y].age_group_id.values
    sexes = self.dataset[self.y].sex_id.values

    location_data_list = []
    for location_id in locations:
        age_data_list = []
        for age_group_id in ages:
            sex_data_list = []
            for sex_id in sexes:
                forecast = self.predict_single_ts(location_id, age_group_id,
                                                  sex_id)
                sex_data_list.append(forecast)
            age_data_list.append(xr.concat(sex_data_list, dim="sex_id"))
        location_data_list.append(
            xr.concat(age_data_list, dim="age_group_id"))
    all_preds = xr.concat(location_data_list, dim="location_id")

    past = self.dataset[self.y]
    try:
        past = past.drop(["acause", "scenario"])
    except ValueError:
        pass
    past = expand_dimensions(past, draw=range(0, self.draws))
    all_preds = xr.concat([past, all_preds], dim="year_id")
    return all_preds
def melt_to_xarray(df):
    """Melts GBD data with 'mean', 'lower', and 'upper' columns to a single
    'quantile' column; converts to xarray dataarray; and adds a scenario
    dimension.

    Args:
        df (pandas dataframe): Dataframe with 'year_id', 'location_id',
            'mean', 'lower', and 'upper' columns.

    Returns:
        da_with_scenario (xarray dataarray): Dataarray with 'year_id',
            'quantile', 'location_id', and 'scenario' dimensions.
    """
    df_long = pd.melt(df,
                      id_vars=["year_id", "location_id"],
                      value_vars=["mean", "lower", "upper"],
                      var_name="quantile")
    da = df_long.set_index(
        ["year_id", "quantile", "location_id"]).to_xarray()["value"]
    da_with_scenario = expand_dimensions(da, scenario=[0])
    return da_with_scenario
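# --- Illustrative sketch (not pipeline code) -------------------------------
# Toy usage of the melt-then-pivot pattern above, using plain xarray's
# expand_dims in place of the internal expand_dimensions helper. Function
# name, location ids, and values are made up.
import pandas as pd


def _melt_sketch():
    df = pd.DataFrame({"year_id": [2017, 2017],
                       "location_id": [6, 102],
                       "mean": [70.1, 78.9],
                       "lower": [69.0, 78.0],
                       "upper": [71.2, 79.8]})
    df_long = pd.melt(df, id_vars=["year_id", "location_id"],
                      value_vars=["mean", "lower", "upper"],
                      var_name="quantile")
    da = df_long.set_index(
        ["year_id", "quantile", "location_id"]).to_xarray()["value"]
    # dims: scenario, year_id, quantile, location_id
    return da.expand_dims(scenario=[0])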
def _qx_to_lx(qx):
    r"""
    Computes :math:`l_x` based on :math:`q_x`, where :math:`q_x` already
    contains the 95-100 (33) and 100-105 (44) age groups. Also computes
    :math:`l_x` for 105-110 (45), and then sets :math:`l_x` for 110+ to be 0.

    Args:
        qx (xr.DataArray): Probability of dying.

    Returns:
        (xr.DataArray): lx.
    """
    if tuple(qx["age_group_id"].values[-2:]) != (33, 44):
        raise ValueError("qx must have age group ids 33 and 44")

    px = 1.0 - qx  # now we have survival all the way to 100-105 (44)

    # Because l_{x+n} = l_x * p_x, we can compute all lx's if we start with
    # l_0 = 1 and iteratively apply the px's of higher age groups.
    # So we compute l_105-110, since we have p_100-105 from extrapolated qx.

    # We start with a set of lx's that are all 1.0
    lx = xr.full_like(px, 1)
    # now expand lx to have age group 105-110 (45)
    lx = expand_dimensions(lx, fill_value=1, age_group_id=[45])

    # Since l_{x+n} = l_x * p_x, we take the cumulative product of px down
    # the age groups and apply it to ages[1:] (since ages[0] has lx = 1.0)
    ages = lx["age_group_id"]
    ppx = px.cumprod(dim="age_group_id")  # the cumulative product of px
    ppx.coords["age_group_id"] = ages[1:]  # needs to correspond to ages[1:]
    lx.loc[dict(age_group_id=ages[1:])] *= ppx  # lx through 105-110 (45)

    # now artificially set lx to be 0 for the 110+ age group.
    lx = expand_dimensions(lx, fill_value=0, age_group_id=[148])

    assert (lx.sel(age_group_id=2) == 1).all()
    assert tuple(lx["age_group_id"].values[-4:]) == (33, 44, 45, 148),\
        "final lx should have age group ids 33, 44, 45, and 148."

    return lx
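# --- Illustrative sketch (not pipeline code) -------------------------------
# Numeric check of the identity used above: with l_0 = 1 and
# l_{x+n} = l_x * p_x, the survivorship column is just the cumulative product
# of the survival probabilities, shifted down by one age group. Toy values;
# the function name is hypothetical.
import numpy as np


def _lx_sketch():
    qx = np.array([0.01, 0.02, 0.05, 0.20, 0.50])   # toy probabilities of dying
    px = 1.0 - qx
    lx = np.concatenate([[1.0], np.cumprod(px)])    # one more age group than qx
    # the explicit recursion gives the same answer
    lx_loop = [1.0]
    for p in px:
        lx_loop.append(lx_loop[-1] * p)
    assert np.allclose(lx, lx_loop)
    return lx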
def load_pop(gbd_round_id, past_version, forecast_version):
    forecast_file = FBDPath(
        f"/{gbd_round_id}/future/population/"
        f"{forecast_version}/population_combined.nc")
    past_file = FBDPath(
        f"/{gbd_round_id}/past/population/{past_version}/population.nc")
    future_pop = open_xr(forecast_file).data
    past_pop = expand_dimensions(open_xr(past_file).data,
                                 scenario=future_pop.scenario,
                                 quantile=future_pop["quantile"])
    pop = xr.concat([past_pop, future_pop], "year_id")
    return pop
def prep_pop_da(past_version, forecast_version, gbd_round_id, years):
    forecast_pop_file = FBDPath(
        f"/{gbd_round_id}/future/population/{forecast_version}/"
        f"population_combined.nc")
    forecast_fhs = open_xr(forecast_pop_file).data.sel(quantile="mean",
                                                       drop=True)
    past_fhs_file = FBDPath(
        f"/{gbd_round_id}/past/population/{past_version}/population.nc")
    past_fhs = expand_dimensions(
        open_xr(past_fhs_file).data.sel(
            year_id=years.past_years,
            sex_id=forecast_fhs["sex_id"],
            age_group_id=forecast_fhs["age_group_id"],
            location_id=forecast_fhs["location_id"]),
        scenario=forecast_fhs.scenario.values)
    fhs_all_scenarios = xr.concat([past_fhs, forecast_fhs], dim="year_id")
    fhs = fhs_all_scenarios.sel(scenario=[-1, 0, 1])
    alt_sdg = fhs_all_scenarios.sel(scenario=[3])
    alt_99 = fhs_all_scenarios.sel(scenario=[2])

    ages = db.get_ages().query("age_group_id in @ALL_AGE_GROUP_IDS")
    days = ages[["age_group_id", "age_group_days_start",
                 "age_group_days_end"]]
    days["mean_age"] = (days["age_group_days_end"] -
                        (days["age_group_days_end"] -
                         days["age_group_days_start"]) / 2) / 365.25
    mean_age = days.set_index("age_group_id")["mean_age"].to_xarray()

    data_fhs = fhs.sel(age_group_id=mean_age["age_group_id"], sex_id=SEX_IDS)
    data_sdg = alt_sdg.sel(age_group_id=mean_age["age_group_id"],
                           sex_id=SEX_IDS)
    data_99 = alt_99.sel(age_group_id=mean_age["age_group_id"],
                         sex_id=SEX_IDS)

    avg_age_fhs = (data_fhs * mean_age).sum("age_group_id") / \
        data_fhs.sum("age_group_id")
    avg_age_sdg = (data_sdg * mean_age).sum("age_group_id") / \
        data_sdg.sum("age_group_id")
    avg_age_99 = (data_99 * mean_age).sum("age_group_id") / \
        data_99.sum("age_group_id")

    ds = data_fhs.rename("population").to_dataset()
    ds_sdg = data_sdg.rename("population").to_dataset()
    ds_99 = data_99.rename("population").to_dataset()

    return avg_age_fhs, avg_age_sdg, avg_age_99, ds, ds_sdg, ds_99
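# --- Illustrative sketch (not pipeline code) -------------------------------
# The midpoint-age computation above in isolation: each age group's mean age
# is the midpoint of its day interval converted to years, and the average age
# of a population is the population-weighted mean of those midpoints. The
# function name and numbers are made up (age groups 5 and 6 roughly span
# ages 1-4 and 5-9).
import pandas as pd
import xarray as xr


def _avg_age_sketch():
    days = pd.DataFrame({"age_group_id": [5, 6],
                         "age_group_days_start": [365, 1825],
                         "age_group_days_end": [1825, 3650]})
    days["mean_age"] = (days["age_group_days_end"] -
                        (days["age_group_days_end"] -
                         days["age_group_days_start"]) / 2) / 365.25
    mean_age = days.set_index("age_group_id")["mean_age"].to_xarray()
    pop = xr.DataArray([100.0, 50.0], coords={"age_group_id": [5, 6]},
                       dims=["age_group_id"])
    # population-weighted average age, analogous to avg_age_fhs above
    return (pop * mean_age).sum("age_group_id") / pop.sum("age_group_id")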
def get_pop(forecast_pop_version, gbd_round_id, measure, draws, years,
            past_pop_version):
    """Pulls specified version of populations, subsets to fertile age groups
    and females only if measure is live_births.

    Args:
        forecast_pop_version (str): The version name of the forecast
            populations file used in FBDPath.
        gbd_round_id (int): The GBD round fed into FBDPath to pull the
            correct version of pops.
        measure (str): The measure being forecasted; if "live_births", the
            population is subset to fertile age groups and females.
        draws (int): The number of desired draws. This goes into resample, so
            we get pops with the correct number of draws.
        years (YearRange): Past and forecast years used to concatenate past
            and future populations.
        past_pop_version (str): The version name of the past populations file
            used in FBDPath.

    Returns:
        (xarray.DataArray): Fertile forecast population. The
            ``age_group_id`` dimension includes coordinates for each of the
            fertile age-groups.
    """
    forecast_pop_path = FBDPath(
        f"{gbd_round_id}/future/population/{forecast_pop_version}")
    forecast_pop_file = forecast_pop_path / "population.nc"
    forecast_pop = open_xr(forecast_pop_file).data

    past_pop_path = FBDPath(
        f"{gbd_round_id}/past/population/{past_pop_version}")
    past_pop_file = past_pop_path / "population.nc"
    past_pop = open_xr(past_pop_file).data

    past_pop = past_pop.sel(sex_id=forecast_pop.sex_id.values)
    past_pop = expand_dimensions(past_pop, draw=range(draws))
    forecast_pop = concat_past_future(past_pop, forecast_pop, draws, years)

    if measure == "live_births":
        forecast_pop = forecast_pop.sel(
            age_group_id=list(FERTILE_AGE_GROUP_IDS),
            sex_id=2).drop(["sex_id"])
    else:
        forecast_pop = forecast_pop.sel(sex_id=[1, 2])
    return forecast_pop
def get_maternal_edu(education, gbd_round_id, past_future, pop_version,
                     location_ids):
    """Recalculate maternal education, which according to the education team
    is the education of women of age-group-IDs 8 to 14 multiplied by their
    age-weights and then summed over age. Only the age weights of groups 8 to
    14 are kept, and then are rescaled so that the sum of those age weights
    is 1.

    Args:
        education (xarray.DataArray): Education data. Needs dimensions
            `age_group_id` and `sex_id`, but probably also has dimensions
            `location_id`, `draw`, `year_id` and maybe `scenario`.
        gbd_round_id (int): Numeric ID for the GBD round. Used to get the
            age-weights for the round from the database.
        past_future (str): Whether past or future population is used for the
            maternal education aggregation.
        pop_version (str): Version of population to use for maternal
            education aggregation.
        location_ids (list[int]): Locations to compute maternal education
            for.

    Returns:
        (tuple[xarray.DataArray, xarray.DataArray]):
            * The first `xarray.DataArray` of the tuple is educational
              attainment for all age-groups and sexes. However, children
              that are too young to have their own education are filled in
              with maternal education.
            * The second `xarray.DataArray` of the tuple is maternal
              education -- only for the maternal age-group, given by
              `MAT_AGE_GROUP_ID`, and females, given by `FEMALE_SEX_ID`.
    """
    pop_path = FBDPath("")  # Path removed for security reasons
    pop = open_xr(pop_path / "population.nc").data.sel(
        age_group_id=list(MAT_AGE_GROUPS),
        sex_id=FEMALE_SEX_ID,
        location_id=list(location_ids))

    LOGGER.debug("Adding up education of moms to get maternal education.")
    mat_slice_edu = education.sel(sex_id=FEMALE_SEX_ID,
                                  age_group_id=list(MAT_AGE_GROUPS),
                                  location_id=list(location_ids))
    agg = Aggregator(pop)
    mat_edu = agg.aggregate_ages(list(MAT_AGE_GROUPS), MAT_AGE_GROUP_ID,
                                 data=mat_slice_edu).rate

    # age_group_id must be dropped. If not, expand_dimensions will broadcast
    # NaNs instead of our data into the new child age_group_id values.
    mat_edu_expanded = expand_dimensions(
        mat_edu.drop("age_group_id").squeeze(),
        sex_id=list(SEXES),
        age_group_id=list(CHILD_AGE_GROUPS))

    LOGGER.debug("Adding maternal education for both sexes and child age "
                 "groups to education data array.")
    # Even if ``education`` has data for child age groups, combine_first will
    # make sure that the newly calculated maternal education is used instead.
    return mat_edu_expanded.combine_first(education), mat_edu
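# --- Illustrative sketch (not pipeline code) -------------------------------
# What the Aggregator call above does conceptually: collapse the maternal age
# groups into one aggregate age group by taking a population-weighted mean of
# the education rates (weights rescaled to sum to 1, as the docstring
# describes). The internal Aggregator API is not reproduced here; this uses
# plain xarray with toy numbers and a hypothetical function name.
import xarray as xr


def _maternal_edu_sketch():
    age_ids = [8, 9, 10]
    edu = xr.DataArray([11.0, 12.0, 13.0],
                       coords={"age_group_id": age_ids},
                       dims=["age_group_id"])   # mean years of education
    pop = xr.DataArray([100.0, 80.0, 60.0],
                       coords={"age_group_id": age_ids},
                       dims=["age_group_id"])   # women in each age group
    weights = pop / pop.sum("age_group_id")     # rescaled to sum to 1
    return (edu * weights).sum("age_group_id")  # maternal education scalar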
def main(asfr_version, past_asfr_version, location_id, gbd_round_id, years,
         granularity, iterations, **kwargs):
    """
    1. Read in location-specific draws of period ASFR from CCF stage
    2. Add terminal age group ASFRs
    3. Intercept-shift ASFR by holding CCF50 constant
    4. Export location-specific intercept-shifted ASFR in .nc

    Args:
        asfr_version (str): version name of future ccf/asfr.
        past_asfr_version (str): asfr version from past.
        location_id (int): location_id.
        gbd_round_id (int): gbd round id.
        years (YearRange): past_start:forecast_start:forecast_end.
        granularity (int): 1 to read single-year-age ASFR, otherwise 5-year
            age-group ASFR.
        iterations (int): number of times to intercept-shift.
    """
    ages_df = db.get_ages(gbd_round_id)[[
        "age_group_id", "age_group_years_start", "age_group_years_end"
    ]]

    # read the location-specific asfr .csv into dataarray
    # the raw forecasted ASFR are stored in the CCF stage of the same version
    ccf_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                           past_or_future="future",
                           stage="ccf",
                           version=asfr_version)
    if granularity == 1:
        sub_folder = "asfr_single_year"
        ccf_asfr_fbd_path = ccf_fbd_path / sub_folder
        future_asfr = read_to_xr(location_id, ccf_asfr_fbd_path,
                                 dims=list(ASFR_NON_AGE_DIMS + ("age",)))
    else:
        sub_folder = "asfr"
        ccf_asfr_fbd_path = ccf_fbd_path / sub_folder
        future_asfr = read_to_xr(location_id, ccf_asfr_fbd_path,
                                 dims=list(ASFR_NON_AGE_DIMS +
                                           ("age_group_id",)))
        # we intercept-shift in 1-year ages, so convert to single years
        future_asfr = _expand_5yr_age_groups_to_1yr_ages(future_asfr,
                                                         ages_df)

    if "sex_id" in future_asfr.dims:
        raise ValueError("Found sex_id dim in future asfr")

    # now etl the past asfr data
    past_asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                                 past_or_future="past",
                                 stage="asfr",
                                 version=past_asfr_version)
    past_asfr = open_xr(
        past_asfr_fbd_path / "asfr.nc").data.sel(location_id=location_id)

    if "sex_id" in past_asfr.dims:
        raise ValueError("Found sex_id dim in past asfr")

    # past has no scenarios, so we need to expand it for merging
    past_asfr = expand_dimensions(past_asfr,
                                  scenario=future_asfr["scenario"])

    # past asfr has age group ids 7-15, but future asfr in ccf only has 8-14.
    # we only need age groups 8-14 for the intercept shift
    past_asfr_1yrs = _expand_5yr_age_groups_to_1yr_ages(
        past_asfr.sel(age_group_id=range(8, 15)), ages_df)

    # now ready to concat past and future together for intercept shift
    asfr = xr.concat([
        past_asfr_1yrs.sel(year_id=years.past_years),
        future_asfr.sel(year_id=years.forecast_years)
    ], dim="year_id")

    del past_asfr_1yrs, future_asfr
    gc.collect()

    # the intercept-shift should keep ccf50 (asfr sum) constant
    pre_fix_asfr_sum = asfr.sum()  # sum of all asfr values before shift
    asfr = ccf50_intercept_shift_lpf(asfr, gbd_round_id, years, iterations)
    post_fix_asfr_sum = asfr.sum()  # asfr sum post-shift should stay the same

    assert np.isclose(post_fix_asfr_sum, pre_fix_asfr_sum, rtol=RTOL),\
        f"The intercept shift changed total asfr sum by more than rtol={RTOL}"

    # need to save years.past_end for cohort-component model
    save_years = [years.past_end] + years.forecast_years.tolist()
    asfr = asfr.sel(year_id=save_years)  # keep only the years we save

    # convert forecasted asfr back to 5-year age groups
    asfr = _convert_ages_to_5_year_age_groups_by_mean(asfr, ages_df)

    # add 10-15 (7) and 50-55 (15) age groups for forecasted asfr
    asfr = extrapolate_terminal_asfr_age_groups(past_asfr, asfr,
                                                last_year=years.past_end)
    asfr["location_id"] = location_id
    asfr.name = "value"

    del past_asfr
    gc.collect()

    LOGGER.info("Finished CCF50 intercept-shift")

    asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                            past_or_future="future",
                            stage="asfr",
                            version=asfr_version)
    save_xr(asfr, asfr_fbd_path / f"{location_id}.nc",
            metric="rate", space="identity",
            version=asfr_version,
            past_asfr_version=past_asfr_version,
            iterations=iterations)
lex_fut = open_xr(lex_fut_path / "lifetable_ds_agg.nc").data["ex"].sel(
    year_id=range(2018, 2101), sex_id=3, age_group_id=2, scenario=0)
lex = lex_past.combine_first(lex_fut).drop(
    ["sex_id", "age_group_id", "scenario"]).squeeze()

lex_mean_ui = lex.rename("value").to_dataset()
compute_summaries(lex_mean_ui)
lex_mean_ui = lex_mean_ui[["mean", "upper",
                           "lower"]].drop("quantile").squeeze()
lex_mean = lex_mean_ui["mean"]

pop_past = open_xr(pop_past_path / "population_agg.nc").data.sel(
    year_id=range(1990, 2018), sex_id=3, age_group_id=22)
pop_past = expand_dimensions(pop_past,
                             quantile=["mean", "lower", "upper"])
pop_fut = open_xr(pop_fut_path / "population_combined.nc").data.sel(
    year_id=range(2018, 2101), sex_id=3, age_group_id=22, scenario=0)
pop = pop_past.combine_first(pop_fut).sel(
    location_id=NATS.location_id.tolist())


def get_rate_of_change(ex, year_start, year_end):
    # Get ARC
    lex_start = ex.sel(year_id=year_start).drop("year_id").squeeze()
    lex_end = ex.sel(year_id=year_end).drop("year_id").squeeze()
    rate_change = (lex_start - lex_end) / (year_end - year_start)
    return rate_change
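# --- Illustrative sketch (not pipeline code) -------------------------------
# What get_rate_of_change computes on toy numbers: the difference between the
# two selected years divided by the number of elapsed years (note the
# start-minus-end sign convention used above). The function name and values
# are made up.
import xarray as xr


def _rate_of_change_sketch():
    ex = xr.DataArray([70.0, 72.5], coords={"year_id": [2018, 2023]},
                      dims=["year_id"])
    year_start, year_end = 2018, 2023
    rate_change = (ex.sel(year_id=year_start, drop=True) -
                   ex.sel(year_id=year_end, drop=True)) / \
        (year_end - year_start)
    return rate_change  # -0.5 per year for these toy values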
def read_datasets(asfr_version, gbd_round_id, lifetable_version, pop_version,
                  migration_version, years, srb_version, draws):
    """
    This reads files, orders their axes, and ensures that data arrays aren't
    presented as datasets. This enforces rules about how many files get read,
    how they are found, and how they are assembled into the incoming data.
    It doesn't address what the data means.

    Args:
        asfr_version (str): Version string for ASFR
        gbd_round_id (int): GBD Round as an integer
        lifetable_version (list[str]): Lifetable version
        pop_version (str): Population start version
        migration_version (list[str]): Migration version
        years (YearRange): years for past and forecast
        srb_version (str): sex ratio at birth version
        draws (int): the number of draws to take from the future versions.

    Returns:
        xr.DataArray: ASFR
        tuple: Either one lifetable file or (past, future).
        xr.DataArray: Starting population
        xr.DataArray: Migration
        xr.DataArray: SRB
    """
    # Do this in a subroutine so its memory can be released.

    # pop etl (pop version is in the past)
    data_read_start = perf_time()
    pop_file = FBDPath("/{}/past/population/{}".format(
        gbd_round_id, pop_version)) / "population.nc"
    try:
        LOGGER.info("Reading {}".format(pop_file))
        pop = xr.open_dataarray(str(pop_file))
        # if there's a draw dimension, take the mean
        if "draw" in pop.dims:
            pop = pop.mean("draw")
    except OSError as ose:
        LOGGER.error("Cannot open pop {}: {}".format(pop_file, ose))
        exit()

    # we may or may not have draws for past pops, but we should certainly
    # expect location, age, sex, and year
    assert {"location_id", "year_id", "age_group_id",
            "sex_id"}.issubset(set(pop.dims))
    if len(pop.year_id) > 1:
        pop = pop.loc[{"year_id": years.past_end}]
    else:
        pop = pop.squeeze(dim="year_id")
        assert pop.year_id == years.past_end
    LOGGER.debug("pop {}".format(pop))

    # we like age_group_id to be the last dim to expedite later computation.
    if "draw" in pop.dims:  # if past pop has draws, resample.
        pop = pop.transpose("draw", "location_id", "sex_id", "age_group_id")
        pop = resample(pop, draws)
    else:
        pop = pop.transpose("location_id", "sex_id", "age_group_id")

    if pop.name is None:
        pop.name = "population"

    # asfr etl (draws expected)
    asfr_gbd_round_id = gbd_round_id if gbd_round_id >= 5 else 5
    asfr_file = FBDPath("/{}/future/asfr/{}".format(
        asfr_gbd_round_id, asfr_version)) / "asfr.nc"
    try:
        LOGGER.info("Reading {}".format(asfr_file))
        # ASFR is reported per thousand people.
        asfr = xr.open_dataarray(str(asfr_file))
    except OSError as ose:
        LOGGER.error("Cannot open asfr {}: {}".format(asfr_file, ose))
        exit()

    assert set(asfr.dims) == {
        "draw", "year_id", "location_id", "scenario", "age_group_id"
    }, "asfr dims {}".format(asfr.dims)

    asfr_lim = asfr.sel(year_id=slice(years.past_end,
                                      years.forecast_end + 1))
    if asfr_lim.name is None:
        asfr_lim.name = "asfr"
    asfr_lim = resample(asfr_lim, draws)

    # lifetable etl (draws expected)
    lifetables = list()
    for lfilename in lifetable_version:
        lifetables.append(read_lifetable(gbd_round_id, lfilename, draws))
    if len(lifetables) > 1:
        lpast, lfuture = (None, None)
        lyears = [llx.year_id.values for llx in lifetables]
        if lyears[0][-1] > lyears[1][-1]:
            lfuture, lpast = lifetables
        elif lyears[1][-1] > lyears[0][-1]:
            lpast, lfuture = lifetables
        elif lyears[0][0] < lyears[1][0]:
            lpast, lfuture = lifetables
        elif lyears[1][0] < lyears[0][0]:
            lfuture, lpast = lifetables
        else:
            LOGGER.error("Cannot figure out which is the future lifetable")
            exit()

        if years.past_end in lfuture.year_id.values:
            LOGGER.info("All needed years were in the future lifetable. "
                        "Ignoring the past data.")
            lifetable_lim = lfuture.sel(
                year_id=slice(years.past_end, years.forecast_end + 1))
            lifetable_out = (lifetable_lim,)
        else:
            assert years.past_end in lpast.year_id.values
            past_slice = lpast.loc[{"year_id": [years.past_end]}]
            LOGGER.debug("Life past slice {}".format(
                past_slice.year_id.values))
            LOGGER.debug("Life future slice {}".format(
                lfuture.year_id.values))
            lifetable_out = (past_slice, lfuture)
    else:
        lifetable_lim = lifetables[0].sel(
            year_id=slice(years.past_end, years.forecast_end + 1))
        lifetable_out = (lifetable_lim,)

    # migration etl (no draws expected)
    try:
        migration_file = FBDPath("/{}/future/migration/{}".format(
            gbd_round_id, migration_version[0])) / "migration.nc"
    except Exception:
        if os.path.exists(migration_version[0]):
            migration_file = migration_version[0]
        else:
            raise Exception(
                "Cannot construct {}".format(migration_version[0]))
    try:
        LOGGER.info("Reading {}".format(migration_file))
        migration = xr.open_dataarray(str(migration_file))
    except OSError as ose:
        LOGGER.error("Cannot open migration {}: {}".format(
            migration_file, ose))
        exit()
    assert set(("location_id", "age_group_id", "sex_id",
                "year_id")).issubset(migration.dims)

    # Currently we don't use or make migration scenarios -- if a scenario dim
    # exists for some reason ensure that only reference is used and that the
    # scenario dim is dropped.
    if "scenario" in migration.dims:  # scenario dim
        migration = migration.sel(scenario=0, drop=True)
    elif "scenario" in migration.coords:  # scenario point coord
        migration = migration.drop("scenario")
    else:
        pass  # no scenario dim or point coord

    # if pop has draws, we want migration to have draws as well.
    # this becomes important in _fill_missing_locations().
    if "draw" in pop.dims:
        if "draw" not in migration.dims:
            migration = expand_dimensions(migration, draw=pop["draw"])
        else:
            migration = resample(migration, draws)
        migration = migration.transpose("draw", "location_id", "year_id",
                                        "sex_id", "age_group_id")
    else:  # pop has no "draw", so migration doesn't need it either
        if "draw" in migration.dims:
            migration = migration.mean("draw")
        migration = migration.transpose("location_id", "year_id", "sex_id",
                                        "age_group_id")

    if migration.name is None:
        migration.name = "migration"

    # Use the last past year's all-age population proportions to compute
    # regional migration averages to fill in missing data.
    migration_locs_fixed = _clean_migration_locations(
        migration, pop.sum("age_group_id"), gbd_round_id)
    LOGGER.info("Read data Elapsed {}".format(perf_time() - data_read_start))

    # Migration counts drive small nations to zero population.
    # This is a way to ensure we show the trend of health.
    migration_locs_fixed.loc[dict(
        location_id=list(SMALL_NATIONS_ZERO_MIGRATION.values()))] = 0.

    LOGGER.debug("Pop from read years {}".format(pop.year_id.values))

    # Not FBDPath at the moment since it doesn't recognize covariate as a
    # valid stage. May need to change location of files.
    # srb etl (no draws)
    srb_path = FBDPath("/{}/past/sex_ratio_at_birth/{}".format(
        gbd_round_id, srb_version))
    srb_file = srb_path / "sex_ratio_at_birth.nc"
    try:
        LOGGER.info("Reading {}".format(srb_file))
        srb = xr.open_dataarray(str(srb_file))
    except OSError as ose:
        LOGGER.error("Cannot open srb {}: {}".format(srb_file, ose))
        exit()

    # Subset to last year of past
    srb = srb.sel(year_id=years.past_end)

    return asfr_lim, lifetable_out, pop, migration_locs_fixed, srb
def agreement_rules(asfr, lifetable, pop, migration, srb, years):
    """
    This is where we put all rules for how data from different sources must
    agree with each other in terms of what domain it lives on.

    Args:
        asfr (xr.DataArray): Age-specific fertility rate
        lifetable (tuple): The lifetable as a tuple of datasets, past and
            future
        pop (xr.DataArray): Population for starting year
        migration (xr.DataArray): Migration values
        srb (xr.DataArray): Sex ratio at birth
        years (YearRange): years for past and forecast.

    Returns:
        xr.DataArray: ASFR
        xr.Dataset: Lifetable
        xr.DataArray: First year of population
        xr.DataArray: Migration
    """
    # There could be a separate past and future lifetable, so we take part
    # in order to determine the subsets. Take the second one, the future.
    part_of_lifetable = lifetable[-1]
    desired_locations = {
        int(dl)
        for dl in fbd_core.db.get_locations_by_level(3).location_id
    }
    la, ll, lp, lm = [
        set(l.location_id.values)
        for l in [asfr, part_of_lifetable, pop, migration]
    ]
    LOGGER.info(
        "location id count for asfr {} life {} pop {} migration {}".format(
            len(la), len(ll), len(lp), len(lm)))
    loc_all = la & ll & lp  # Missing migration will be set to 0.
    if desired_locations - la:
        LOGGER.warning("asfr locations missing {}".format(
            desired_locations - la))
    if desired_locations - ll:
        LOGGER.warning(
            "lifetable locations missing {}".format(desired_locations - ll))
    if desired_locations - lp:
        LOGGER.warning(
            "population locations missing {}".format(desired_locations - lp))
    if desired_locations - lm:
        LOGGER.warning(
            "migration locations missing {}".format(desired_locations - lm))

    subset = dict(location_id=np.array(sorted(loc_all), dtype=int))
    subset["sex_id"] = [1, 2]

    assert not (set(asfr.age_group_id.values) -
                set(part_of_lifetable.age_group_id.values))
    ages = part_of_lifetable.age_group_id.values[:]
    al = set(part_of_lifetable.age_group_id.values)
    ap = set(pop.age_group_id.values)
    if al ^ ap:
        LOGGER.info("lifetable ages {} pop ages {}".format(al - ap, ap - al))
        ages = [a for a in ages if a in ap]
    assert consistent_age_group_ids(ages),\
        "Ages don't match for ids {}".format(ages)
    subset["age_group_id"] = ages

    pop_sub = pop.loc[subset]

    subset_lives = list()
    if len(lifetable) > 1:  # should be just 2 elements
        min_draw_count = min([ds["draw"].size for ds in lifetable])
        # make sure the subset draw labels are the same
        assert lifetable[0]["draw"][0:min_draw_count].identical(
            lifetable[1]["draw"][0:min_draw_count])
        subset["draw"] = lifetable[0]["draw"][0:min_draw_count]
    for incoming_life in lifetable:
        LOGGER.debug("life subsets {}".format(incoming_life))
        subset_life = incoming_life.loc[subset]
        if "scenario" not in subset_life.dims:
            d3 = xr.concat([subset_life] * 3, dim="scenario")
            d3.coords["scenario"] = [-1, 0, 1]
            subset_life = d3
        subset_lives.append(subset_life)
    if len(subset_lives) > 1:
        lifetable_lim = xr.concat(subset_lives, dim="year_id")
    else:
        lifetable_lim = subset_lives[0]
    life_sub = lifetable_lim.transpose("location_id", "scenario", "draw",
                                       "year_id", "sex_id", "age_group_id")

    assert consistent_age_group_ids(life_sub.age_group_id.values)
    assert consistent_age_group_ids(pop_sub.age_group_id.values)

    # Migration will be missing locations, most likely, and may have extras.
    # Migration will have strict five-year age groups.
    # This version creates a dataset of zeros. We should give missing
    # countries a value that is the average over the region.
    migration_years = migration.year_id.values
    migration_years = migration_years[migration_years >= years.past_end]
    if np.in1d(subset["age_group_id"], migration.age_group_id.values).all():
        migration_ages = subset["age_group_id"]
        LOGGER.info("migration using GBD age groups")
    else:
        migration_ages = migration.age_group_id.values
        LOGGER.info("migration using the age groups it has")
    migration_sub = xr.DataArray(
        data=np.zeros((len(subset["location_id"]), len(migration_years), 2,
                       len(migration_ages)), dtype=np.double),
        coords=dict(location_id=subset["location_id"],
                    year_id=migration_years,
                    sex_id=[1, 2],
                    age_group_id=migration_ages),
        dims=["location_id", "year_id", "sex_id", "age_group_id"])
    common_locations = [
        l for l in subset["location_id"]
        if l in migration.location_id.values
    ]
    copy_idx = dict(location_id=common_locations,
                    year_id=migration_years,
                    sex_id=[1, 2],
                    age_group_id=migration_ages)
    LOGGER.debug("migration {} migration_sub {}".format(
        migration.age_group_id.values, migration_sub.age_group_id.values))

    if "draw" in migration.dims:
        migration_sub = expand_dimensions(migration_sub,
                                          draw=migration["draw"])
        migration_sub = migration_sub.transpose(*list(migration.dims))
        copy_idx["draw"] = migration["draw"].values.tolist()

    for index_obj in copy_idx.values():
        assert len(set(index_obj)) == len(index_obj)
    for name, coord in migration.coords.items():
        assert len(set(coord.values)) == len(coord.values), name

    LOGGER.debug("migration {}".format(migration))
    migration_sub.loc[copy_idx] = migration.loc[copy_idx]

    if 1 in migration_sub.age_group_id.values:
        LOGGER.info("Migration WPP Straight")
        from_under_five = copy_idx.copy()
        from_under_five["age_group_id"] = [1]
        five_year = 1 / (5 * 365)
        for aid, frac in [(2, 7 * five_year), (3, 21 * five_year),
                          (4, 337 * five_year), (5, 4 * 365 * five_year)]:
            copy_idx["age_group_id"] = [aid]
            migration_sub.loc[copy_idx] = \
                frac * migration.loc[from_under_five]
        migration_sized = migration_sub
    else:
        LOGGER.info("Migration WPP Smoothed")
        # It's the one-year migration.
        # NOTE these age groups were particular to the migration file
        # provided
        migration_age_ids = migration_sub["age_group_id"].values
        end_ages_0 = [143, 144, 145, 146, 273]  # 95, 96, 97, 98, 99+
        end_ages_1 = [235]  # 95+
        if set(end_ages_0).issubset(migration_age_ids):
            assert not set(end_ages_1) & set(migration_age_ids)
            end_ages = end_ages_0
            early_ages = [
                x for x in migration_sub.age_group_id.values
                if x not in end_ages
            ]
            # sum over these granular age groups to form one terminal age
            # group
            end_years = migration_sub.loc[{
                "age_group_id": end_ages
            }].sum(dim="age_group_id")
            end_years.coords["age_group_id"] = 235
            end_years = end_years.expand_dims("age_group_id",
                                              axis=len(end_years.dims))
            lop_end = migration_sub.loc[{"age_group_id": early_ages}]
            migration_sized = xr.concat([lop_end, end_years],
                                        dim="age_group_id")
        # if 235 is the only age group id beyond 142, no change is needed
        elif set(end_ages_1).issubset(migration_age_ids):
            assert not set(end_ages_0) & set(migration_age_ids)
            migration_sized = migration_sub
        else:
            raise Exception("end_ages do not exist in migration age_group_id")

    ordered = consistent_age_group_ids(migration_sized.age_group_id.values)
    if not ordered:
        raise RuntimeError("Age group ids not ordered")

    subset.pop("sex_id")
    subset.pop("age_group_id")
    asfr_sub = asfr.loc[subset]

    # We assert order because the internal methods will use numpy arrays.
    # Locations are first because we will parallelize over locations.
    # The year and age group are last because we work by draw, so they
    # are needed for each forecast.
    asfr_sub = asfr_sub.transpose("location_id", "scenario", "draw",
                                  "year_id", "age_group_id")

    idx = 0
    for year_arr in [asfr_sub, life_sub, migration_sized]:
        assert year_arr.year_id.values[0] == years.past_end, (
            "Start is {} years for {} are {}".format(
                years.past_end, idx, year_arr.year_id.values))
        idx += 1

    LOGGER.debug("Pop from agree years {}".format(pop_sub.year_id.values))
    LOGGER.debug("Life from agree years {}".format(life_sub.year_id.values))
    check_out_of_bounds(asfr_sub, "asfr")
    check_out_of_bounds(life_sub.lx, "life")
    check_out_of_bounds(pop_sub, "pop_in")
    check_out_of_bounds(migration_sub, "migration")

    return asfr_sub, life_sub, pop_sub, migration_sized, srb
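# --- Illustrative sketch (not pipeline code) -------------------------------
# Arithmetic behind the "Migration WPP Straight" split in agreement_rules:
# under-5 migration (age_group_id 1) is divided among the GBD early-age
# groups in proportion to the number of days each group spans out of five
# years (5 * 365 days). The fractions sum to 1, so no migration is lost or
# created. The function name is hypothetical.
def _under_five_split_check():
    five_year = 1 / (5 * 365)
    fractions = {2: 7 * five_year,         # early neonatal, 7 days
                 3: 21 * five_year,        # late neonatal, 21 days
                 4: 337 * five_year,       # post-neonatal, 337 days
                 5: 4 * 365 * five_year}   # ages 1-4, 4 years = 1460 days
    # 7 + 21 + 337 + 1460 = 1825 = 5 * 365
    assert abs(sum(fractions.values()) - 1.0) < 1e-12
    return fractions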
def print_pop_stats(pop_base_version, pop_sdg_version, pop_99_version,
                    pop_past_version, gbd_round_id, measure):
    pop_base_draw_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_base_version}")
    pop_sdg_draw_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_sdg_version}")
    pop_99_draw_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_99_version}")
    pop_base_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_base_version}_combined")
    pop_sdg_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_sdg_version}_combined")
    pop_99_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_99_version}_combined")
    pop_past_path = FBDPath(
        f"{gbd_round_id}/past/{measure}/{pop_past_version}")

    pop_past = open_xr(f"{pop_past_path}/{measure}_agg.nc").data
    pop_draw_base = open_xr(f"{pop_base_draw_path}/{measure}_agg.nc").data
    pop_draw_sdg = open_xr(f"{pop_sdg_draw_path}/{measure}_agg.nc").data
    pop_draw_99 = open_xr(f"{pop_99_draw_path}/{measure}_agg.nc").data
    pop_base = open_xr(f"{pop_base_path}/{measure}_combined.nc").data
    pop_sdg = open_xr(f"{pop_sdg_path}/{measure}_combined.nc").data
    pop_99 = open_xr(f"{pop_99_path}/{measure}_combined.nc").data

    pop_slower_2100 = pop_base.sel(
        scenario=-1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_sdg_2100 = pop_sdg.sel(
        scenario=-1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_faster_2100 = pop_base.sel(
        scenario=1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_fastest_2100 = pop_99.sel(
        scenario=1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_ref_2100 = pop_base.sel(
        scenario=0, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_ref = pop_base.sel(
        scenario=0, sex_id=3, location_id=1, age_group_id=22) / 1e9
    pop_sdg_draw_2100 = pop_draw_sdg.sel(
        scenario=-1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_slower_draw_2100 = pop_draw_base.sel(
        scenario=-1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_fastest_draw_2100 = pop_draw_99.sel(
        scenario=1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9

    ref_peak_year = find_peak_year(pop_ref)
    ref_peak_da = pop_ref.sel(year_id=ref_peak_year)
    ref_peak_mean, ref_peak_lower, ref_peak_upper = \
        return_mean_and_quantiles(ref_peak_da)
    ref_2100_mean, ref_2100_lower, ref_2100_upper = \
        return_mean_and_quantiles(pop_ref_2100)
    slower_2100_mean, slower_2100_lower, slower_2100_upper = \
        return_mean_and_quantiles(pop_slower_2100)
    sdg_2100_mean, sdg_2100_lower, sdg_2100_upper = \
        return_mean_and_quantiles(pop_sdg_2100)
    faster_2100_mean, faster_2100_lower, faster_2100_upper = \
        return_mean_and_quantiles(pop_faster_2100)
    fastest_2100_mean, fastest_2100_lower, fastest_2100_upper = \
        return_mean_and_quantiles(pop_fastest_2100)

    if sdg_2100_mean < fastest_2100_mean:
        diff_mean, diff_lower, diff_upper = calculate_diff(
            pop_slower_draw_2100, pop_sdg_draw_2100)
    else:
        diff_mean, diff_lower, diff_upper = calculate_diff(
            pop_slower_draw_2100, pop_fastest_draw_2100)

    median_age_2017 = get_median_age(pop_past, gbd_round_id, 2017)
    median_age_2100_mean, median_age_2100_lower, median_age_2100_upper = \
        get_median_age(pop_draw_base, gbd_round_id, 2100)

    pop_under5_2017 = pop_past.sel(
        sex_id=3, location_id=1, age_group_id=1, year_id=2017) / 1e6
    pop_under5_2100 = pop_base.sel(
        scenario=0, sex_id=3, location_id=1, age_group_id=1,
        year_id=2100) / 1e6
    pop_over80_2017 = pop_past.sel(
        sex_id=3, location_id=1, age_group_id=21, year_id=2017) / 1e6
    pop_over80_2100 = pop_base.sel(
        scenario=0, sex_id=3,
        location_id=1, age_group_id=21, year_id=2100) / 1e6

    age_under5_2017 = pop_under5_2017.values.round(2)
    age_under5_2100_mean, age_under5_2100_lower, age_under5_2100_upper = \
        return_mean_and_quantiles(pop_under5_2100)
    age_over80_2017 = pop_over80_2017.values.round(2)
    age_over80_2100_mean, age_over80_2100_lower, age_over80_2100_upper = \
        return_mean_and_quantiles(pop_over80_2100)

    pop_under5_2017_draws = expand_dimensions(pop_under5_2017,
                                              draw=range(1000))
    pop_base_draw_under5_2100 = pop_draw_base.sel(
        scenario=0, sex_id=3, location_id=1, age_group_id=1,
        year_id=2100).squeeze().drop("scenario") / 1e6
    percent_drop = (pop_under5_2017_draws -
                    pop_base_draw_under5_2100) / pop_under5_2017_draws
    drop_percent_mean = (percent_drop.mean("draw") * 100).values.round(2)
    drop_percent_lower = (percent_drop.quantile(0.025, dim="draw") *
                          100).values.round(2)
    drop_percent_upper = (percent_drop.quantile(0.975, dim="draw") *
                          100).values.round(2)

    print(
        f"Combining the scenarios for mortality, fertility, and migration, "
        f"we expect global population in the reference scenario to peak at "
        f"{ref_peak_mean} (95% UI {ref_peak_lower}-{ref_peak_upper}) billion "
        f"in the year {ref_peak_year} and then decline to {ref_2100_mean} "
        f"({ref_2100_lower}-{ref_2100_upper}) billion in 2100. "
        f"Across alternative scenarios, the range in 2100 is from "
        f"{slower_2100_mean} billion "
        f"({slower_2100_lower}-{slower_2100_upper}) in the slower met need "
        f"and education scenario to {sdg_2100_mean} billion "
        f"({sdg_2100_lower}-{sdg_2100_upper}) in the SDG-pace scenario for "
        f"education and contraceptive met need (figure 6). "
        f"The faster and fastest scenarios give 2100 global populations of "
        f"{faster_2100_mean} ({faster_2100_lower}-{faster_2100_upper}) and "
        f"{fastest_2100_mean} ({fastest_2100_lower}-{fastest_2100_upper}) "
        f"billion, respectively. Peak population in the SDG scenario is in "
        f"2046, while the global population continues to grow through the "
        f"century in the slower scenario. The huge differences in TFR in "
        f"2100 across the scenarios translate into differences of "
        f"{diff_mean} ({diff_lower}-{diff_upper}) billion people in 2100. "
        f"Median age will increase in the reference scenario from "
        f"{median_age_2017} in 2017 to {median_age_2100_mean} "
        f"({median_age_2100_lower}-{median_age_2100_upper}) in 2100. "
        f"The number of children under age 5 will decline from "
        f"{age_under5_2017} million in 2017 to only {age_under5_2100_mean} "
        f"({age_under5_2100_lower}-{age_under5_2100_upper}) million in 2100, "
        f"a drop of {drop_percent_mean}% "
        f"({drop_percent_lower}-{drop_percent_upper}). At the same time, "
        f"the number of individuals aged over 80 will increase from "
        f"{age_over80_2017} million in 2017 to {age_over80_2100_mean} "
        f"({age_over80_2100_lower}-{age_over80_2100_upper}) million in "
        f"2100.")
def arc_forecast_education(past, gbd_round_id, transform, weight_exp, years,
                           reference_scenario, diff_over_mean, truncate,
                           truncate_quantiles, replace_with_mean,
                           extra_dim=None):
    """Forecasts education using the ARC method.

    Args:
        past (xarray.DataArray): Past data with dimensions ``location_id``,
            ``sex_id``, ``age_group_id``, ``year_id``, and ``draw``.
        gbd_round_id (int): The GBD round of the input data.
        transform (str): Space to transform education to for forecasting.
        weight_exp (float): How much to weight years based on recency.
        years (YearRange): Forecasting timeseries.
        reference_scenario (str): If 'median', then the reference scenario is
            made using the weighted median of past annualized rate-of-change
            across all past years; if 'mean', then it is made using the
            weighted mean of past annualized rate-of-change across all past
            years.
        diff_over_mean (bool): If True, then take annual differences for
            means-of-draws, instead of draws.
        truncate (bool): If True, then truncates the dataarray over the given
            dimensions.
        truncate_quantiles (object, optional): The tuple of two floats
            representing the quantiles to take.
        replace_with_mean (bool, optional): If True and `truncate` is True,
            then replace values outside of the upper and lower quantiles
            taken across "location_id" and "year_id" with the mean across
            "year_id"; if False, then replace with the upper and lower
            bounds themselves.

    Returns:
        (xarray.DataArray): Education forecasts
    """
    LOGGER.debug("diff_over_mean:{}".format(diff_over_mean))
    LOGGER.debug("truncate:{}".format(truncate))
    LOGGER.debug("truncate_quantiles:{}".format(truncate_quantiles))
    LOGGER.debug("replace_with_mean:{}".format(replace_with_mean))
    LOGGER.debug("reference_scenario:{}".format(reference_scenario))

    most_detailed_coords = _get_avail_most_detailed_coords(past,
                                                           gbd_round_id)
    most_detailed_past = past.sel(**most_detailed_coords)
    zeros_dropped = most_detailed_past.where(most_detailed_past > 0)
    for dim in zeros_dropped.dims:
        zeros_dropped = zeros_dropped.dropna(dim=dim, how="all")

    LOGGER.debug("Transforming the past to {}-space".format(transform))
    transformed_past = TRANSFORMATIONS[transform](zeros_dropped)

    LOGGER.debug("Forecasting education in the transformed space")
    transformed_forecast = scenarios.arc_method(
        transformed_past,
        gbd_round_id=gbd_round_id,
        years=years,
        reference_scenario=reference_scenario,
        weight_exp=weight_exp,
        diff_over_mean=diff_over_mean,
        truncate=truncate,
        truncate_quantiles=truncate_quantiles,
        replace_with_mean=replace_with_mean,
        reverse_scenarios=True,
        extra_dim=extra_dim,
        scenario_roc="national")

    LOGGER.debug("Converting the forecasts to normal/identity space")
    forecast = INVERSE_TRANSFORMATIONS[transform](transformed_forecast)
    refilled_forecast = etl.expand_dimensions(forecast,
                                              **most_detailed_coords)
    lagged_scenarios = lag_scenarios(refilled_forecast, years)

    # Since the past does get clipped to avoid infs and negative infs, we
    # need to append the actual past onto the data being saved (modelers
    # currently expect the past to be there).
    past_broadcast_scenarios = etl.expand_dimensions(
        most_detailed_past, scenario=lagged_scenarios["scenario"])
    all_data = past_broadcast_scenarios.combine_first(lagged_scenarios)

    bound_err_msg = "the forecasts have NaNs"
    if np.isnan(all_data).any():
        LOGGER.error(bound_err_msg)
        raise RuntimeError(bound_err_msg)
    return all_data
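# --- Illustrative sketch (not pipeline code) -------------------------------
# A stripped-down version of the ARC idea described in the docstring above:
# transform the past series, take annualized differences, weight recent years
# more heavily (weight ~ rank ** weight_exp), extrapolate the weighted mean
# rate of change, and transform back. The real work is done by the internal
# scenarios.arc_method with its scenario and truncation options; this sketch
# only illustrates the reference forecast for a single series. The logit
# transform and the assumption that values lie in (0, 1) after rescaling are
# illustrative choices, not the library's actual TRANSFORMATIONS.
import numpy as np


def _arc_method_sketch(past_values, past_years, forecast_years,
                       weight_exp=1.0):
    logit = lambda p: np.log(p / (1 - p))
    expit = lambda x: 1 / (1 + np.exp(-x))
    transformed = logit(np.asarray(past_values, dtype=float))
    diffs = np.diff(transformed)                    # annualized differences
    weights = np.arange(1, len(diffs) + 1) ** weight_exp
    roc = np.average(diffs, weights=weights)        # weighted mean rate of change
    steps = np.asarray(forecast_years) - past_years[-1]
    return expit(transformed[-1] + roc * steps)     # back to identity space


# e.g. _arc_method_sketch([0.60, 0.62, 0.65, 0.69],
#                         [2014, 2015, 2016, 2017],
#                         [2018, 2019, 2020])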