Code example #1
def save_y_star(eps_version, arima_version, years, measure, draws, decay,
                gbd_round_id):
    """
    apply random walk and save the output
    """

    # ``eps_path``, ``mig_dir``, and ``ystar_out`` are assumed to be defined
    # at module scope in the original source.
    ds = open_xr(eps_path).data
    try:
        eps_preds = open_xr(f"{mig_dir}/eps_star.nc").data
    except Exception:
        eps_preds = arima_migration(ds, years, draws, decay)
        epsilon_hat_out = mig_dir / "eps_star.nc"
        save_xr(eps_preds, epsilon_hat_out, metric="rate", space="identity")

    # cap residuals between -10 and 10. With the current migration forecasts,
    # the population forecast to 2100 decreases toward 0 for Syria, Latvia,
    # and Jamaica; capping the residuals makes those trajectories more
    # reasonable
    eps_past = eps_preds.sel(year_id=years.past_years)
    eps_preds = eps_preds.sel(year_id=years.forecast_years)
    eps_preds = eps_preds.clip(min=-10, max=10)
    eps_preds = xr.concat([eps_past, eps_preds], dim="year_id")

    pred_path = mig_dir / "mig_hat.nc"
    preds = open_xr(pred_path).data
    preds = preds.sel(year_id=years.years)
    preds = expand_dimensions(preds, draw=range(0, draws))
    y_star = preds + eps_preds

    save_xr(y_star, ystar_out, metric="rate", space="identity")
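As a rough illustration of the pattern above (clip residual draws, then let xarray broadcast the point predictions across the draw dimension), here is a minimal self-contained sketch with made-up toy data:

import numpy as np
import xarray as xr

# Toy residual draws and point predictions (hypothetical values).
eps = xr.DataArray(np.random.normal(0, 20, (3, 5)),
                   dims=["draw", "year_id"],
                   coords={"draw": range(3), "year_id": range(2021, 2026)})
preds = xr.DataArray(np.linspace(1.0, 2.0, 5),
                     dims=["year_id"],
                     coords={"year_id": range(2021, 2026)})

eps_capped = eps.clip(min=-10, max=10)  # cap extreme residuals
y_star = preds + eps_capped             # broadcasts over the "draw" dim
assert set(y_star.dims) == {"draw", "year_id"}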
Code example #2
def _expand_5yr_age_groups_to_1yr_ages(da, ages_df):
    """
    Converts 5-year age groups to 1-year ages, by simply repeating the same
    value.

    Args:
        da (xr.DataArray): da with "age_group_id" dim.
        ages_df (pd.DataFrame): df with age group metadata.

    Returns:
        (xr.DataArray) da where "age_group_id" dim is replaced with "age".
    """
    assert "age_group_id" in da.dims, "Missing age_group_id dim"
    for col in [
            "age_group_id", "age_group_years_start", "age_group_years_end"
    ]:
        assert col in ages_df.columns, f"Missing {col} column"
    assert da["age_group_id"].isin(ages_df["age_group_id"]).all(),\
        "Not all age group ids are available in metadata"
    das = []
    for age_group_id in da["age_group_id"].values:
        age_meta = ages_df.query("age_group_id == @age_group_id")
        lower_age = int(age_meta["age_group_years_start"].iloc[0])
        upper_age = int(age_meta["age_group_years_end"].iloc[0])
        sub_da = da.sel(age_group_id=age_group_id).drop("age_group_id")
        sub_da = expand_dimensions(sub_da, age=range(lower_age, upper_age))
        das.append(sub_da)

    return xr.concat(das, dim="age")
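The core trick, repeating one 5-year value across its single-year ages, can be sketched with xarray's own expand_dims (no fbd helper needed):

import xarray as xr

# A single 5-year age-group value, repeated for ages 15-19.
five_yr_value = xr.DataArray(0.02)
single_years = five_yr_value.expand_dims(age=list(range(15, 20)))
print(single_years.values)  # [0.02 0.02 0.02 0.02 0.02]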
Code example #3
def predict(self):
    """
    Generate predictions based on model fit.
    """
    locations = self.dataset[self.y].location_id.values
    ages = self.dataset[self.y].age_group_id.values
    sexes = self.dataset[self.y].sex_id.values
    location_data_list = []
    for location_id in locations:
        age_data_list = []
        for age_group_id in ages:
            sex_data_list = []
            for sex_id in sexes:
                forecast = self.predict_single_ts(location_id,
                                                  age_group_id, sex_id)
                sex_data_list.append(forecast)
            age_data_list.append(xr.concat(sex_data_list, dim="sex_id"))
        location_data_list.append(
            xr.concat(age_data_list, dim="age_group_id"))
    all_preds = xr.concat(location_data_list, dim="location_id")
    past = self.dataset[self.y]  # use the named variable, not ``.y``
    try:
        past = past.drop(["acause", "scenario"])
    except ValueError:
        pass
    past = expand_dimensions(past, draw=range(0, self.draws))
    all_preds = xr.concat([past, all_preds], dim="year_id")
    return all_preds
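A compact sketch of the nested-concat pattern used above, with a hypothetical stand-in for predict_single_ts:

import numpy as np
import xarray as xr

def fake_forecast(location_id, sex_id):
    # Hypothetical stand-in: one forecast series per (location, sex) cell.
    da = xr.DataArray(np.arange(3.0), dims=["year_id"],
                      coords={"year_id": [2023, 2024, 2025]})
    return da.expand_dims(location_id=[location_id], sex_id=[sex_id])

cube = xr.concat(
    [xr.concat([fake_forecast(loc, sex) for sex in (1, 2)], dim="sex_id")
     for loc in (6, 102)],
    dim="location_id")
assert dict(cube.sizes) == {"location_id": 2, "sex_id": 2, "year_id": 3}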
Code example #4
def melt_to_xarray(df):
    """Melts GBD data with 'mean', 'lower', and 'upper' columns to a single
    'quantile' column; converts to xarray dataarray; and adds a scenario
    dimension. 

    Args:
        df (pandas dataframe):
            Dataframe with 'year_id', 'location_id', 'mean', 'lower', and
            'upper' columns.
    Returns:
        da_with_scenario (xarray dataarray):
            Dataarray with 'year_id', 'quantile', 'location_id', and 'scenario'
            dimensions.
    """
    df_long = pd.melt(df,
                      id_vars=["year_id", "location_id"],
                      value_vars=["mean", "lower", "upper"],
                      var_name="quantile")

    da = df_long.set_index(
        ["year_id", "quantile", "location_id"]).to_xarray()["value"]

    da_with_scenario = expand_dimensions(da, scenario=[0])

    return da_with_scenario
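Run end to end on toy data (with xarray's own expand_dims standing in for the expand_dimensions helper), the pipeline looks like this:

import pandas as pd
import xarray as xr

df = pd.DataFrame({
    "year_id": [2019, 2019],
    "location_id": [6, 102],
    "mean": [1.0, 2.0],
    "lower": [0.8, 1.7],
    "upper": [1.2, 2.3],
})

df_long = pd.melt(df, id_vars=["year_id", "location_id"],
                  value_vars=["mean", "lower", "upper"],
                  var_name="quantile")
da = df_long.set_index(
    ["year_id", "quantile", "location_id"]).to_xarray()["value"]
da_with_scenario = da.expand_dims(scenario=[0])
print(da_with_scenario.dims)  # ('scenario', 'year_id', 'quantile', 'location_id')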
Code example #5
def _qx_to_lx(qx):
    r"""
    Computes :math:`l_x` based on :math:`q_x`, where :math:`q_x` already
    contains the 95-100 (33) and 100-105 (44) age groups.  Also computes
    :math:`l_x` for 105-110 (45), and then sets :math:`l_x` for 110+ to 0.

    Args:
        qx (xr.DataArray): Probability of dying.

    Returns:
        (xr.DataArray): lx.
    """
    if tuple(qx["age_group_id"].values[-2:]) != (33, 44):
        raise ValueError("qx must have age group ids 33 and 44")

    px = 1.0 - qx  # now we have survival all the way to 100-105 (44) age group

    # Because l{x+n} = lx * px, we can compute all lx's if we start with
    # l_0 = 1 and iteratively apply the px's of higher age groups.
    # So we compute l_105-110, since we have p_100-105 from extrapolated qx.
    # We start with a set of lx's that are all 1.0
    lx = xr.full_like(px, 1)
    # now expand lx to have age groups 105-110 (45)
    lx = expand_dimensions(lx, fill_value=1, age_group_id=[45])

    # Since l{x+n} = lx * px, we take the cumulative product of px down the
    # age groups and apply it to ages[1:] (since ages[0] has lx = 1.0)
    ages = lx["age_group_id"]

    ppx = px.cumprod(dim="age_group_id")  # the cumulative product of px
    ppx.coords["age_group_id"] = ages[1:]  # need to correspond to ages[1:]
    lx.loc[dict(age_group_id=ages[1:])] *= ppx  # lx all the way to 100-105

    # now artificially sets lx to be 0 for the 110+ age group.
    lx = expand_dimensions(lx, fill_value=0, age_group_id=[148])

    assert (lx.sel(age_group_id=2) == 1).all()
    assert tuple(lx['age_group_id'].values[-4:]) == (33, 44, 45, 148),\
        "final lx should have age group ids 33, 44, 45, and 148."

    return lx
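A tiny numeric check of the cumulative-product step, using made-up qx values for three age intervals:

import numpy as np

qx = np.array([0.1, 0.2, 0.3])  # made-up probabilities of dying
px = 1.0 - qx                   # survival probabilities
lx = np.concatenate([[1.0], np.cumprod(px)])
# l_0 = 1, then 0.9, 0.9 * 0.8 = 0.72, 0.72 * 0.7 = 0.504
assert np.allclose(lx, [1.0, 0.9, 0.72, 0.504])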
Code example #6
def load_pop(gbd_round_id, past_version, forecast_version):
    """Load past and future population and concatenate them along year_id."""

    forecast_file = FBDPath(
        f"/{gbd_round_id}/future/population/"
        f"{forecast_version}/population_combined.nc")

    past_file = FBDPath(
        f"/{gbd_round_id}/past/population/{past_version}/population.nc")

    future_pop = open_xr(forecast_file).data
    # the past pop lacks scenario and quantile dims, so broadcast to match
    past_pop = expand_dimensions(open_xr(past_file).data,
                                 scenario=future_pop.scenario,
                                 quantile=future_pop["quantile"])
    pop = xr.concat([past_pop, future_pop],
                    "year_id")

    return pop
Code example #7
def prep_pop_da(past_version, forecast_version, gbd_round_id, years):
    """Prepare population and population-weighted mean age for each
    scenario set."""
    forecast_pop_file = FBDPath(
        f"/{gbd_round_id}/future/population/{forecast_version}/"
        f"population_combined.nc")
    forecast_fhs = open_xr(forecast_pop_file).data.sel(quantile='mean',
                                                       drop=True)

    past_fhs_file = FBDPath(
        f"/{gbd_round_id}/past/population/{past_version}/population.nc")
    past_fhs = expand_dimensions(open_xr(past_fhs_file).data.sel(
        year_id=years.past_years,
        sex_id=forecast_fhs["sex_id"],
        age_group_id=forecast_fhs["age_group_id"],
        location_id=forecast_fhs["location_id"]),
                                 scenario=forecast_fhs.scenario.values)

    fhs_all_scenarios = xr.concat([past_fhs, forecast_fhs], dim="year_id")

    fhs = fhs_all_scenarios.sel(scenario=[-1, 0, 1])
    alt_sdg = fhs_all_scenarios.sel(scenario=[3])
    alt_99 = fhs_all_scenarios.sel(scenario=[2])

    ages = db.get_ages().query("age_group_id in @ALL_AGE_GROUP_IDS")
    days = ages[["age_group_id", "age_group_days_start", "age_group_days_end"]]
    days["mean_age"] = (days["age_group_days_end"] -
                        (days["age_group_days_end"] -
                         days["age_group_days_start"]) / 2) / 365.25
    mean_age = days.set_index("age_group_id")["mean_age"].to_xarray()

    data_fhs = fhs.sel(age_group_id=mean_age["age_group_id"], sex_id=SEX_IDS)
    data_sdg = alt_sdg.sel(age_group_id=mean_age["age_group_id"],
                           sex_id=SEX_IDS)
    data_99 = alt_99.sel(age_group_id=mean_age["age_group_id"], sex_id=SEX_IDS)

    avg_age_fhs = (data_fhs *
                   mean_age).sum("age_group_id") / data_fhs.sum("age_group_id")
    avg_age_sdg = (data_sdg *
                   mean_age).sum("age_group_id") / data_sdg.sum("age_group_id")
    avg_age_99 = (data_99 *
                  mean_age).sum("age_group_id") / data_99.sum("age_group_id")

    ds = data_fhs.rename("population").to_dataset()
    ds_sdg = data_sdg.rename("population").to_dataset()
    ds_99 = data_99.rename("population").to_dataset()

    return avg_age_fhs, avg_age_sdg, avg_age_99, ds, ds_sdg, ds_99
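The average-age calculation above is an ordinary population-weighted mean over age_group_id; a toy check:

import xarray as xr

pop = xr.DataArray([100.0, 50.0], dims=["age_group_id"],
                   coords={"age_group_id": [1, 2]})
mean_age = xr.DataArray([2.5, 10.0], dims=["age_group_id"],
                        coords={"age_group_id": [1, 2]})

avg_age = (pop * mean_age).sum("age_group_id") / pop.sum("age_group_id")
# (100 * 2.5 + 50 * 10) / 150 = 750 / 150 = 5.0
assert float(avg_age) == 5.0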
Code example #8
def get_pop(forecast_pop_version, gbd_round_id, measure, draws, years,
            past_pop_version):
    """Pulls specified version of populations, subsets to fertile age groups
    and females only if meausre is live_births.

    Args:
        forecast_pop_version (str):
            The version name of the forecast populations file used in FBDPath.
        gbd_round_id (int):
            The GBD round fed into FBDPath to pull the correct version of pops.
        measure (str):
            Measure being forecasted; if ``live_births``, populations are
            subset to fertile age groups and females.
        draws (int):
            The number of desired draws. This goes into resample, so we get
            pops with the correct number of draws.
        years (YearRange):
            Past and forecast years.
        past_pop_version (str):
            The version name of the past populations file used in FBDPath.
    Returns:
        (xarray.DataArray):
            Fertile forecast population. The ``age_group_id`` dimension
            includes coordinates for each of the fertile age-groups.
    """
    forecast_pop_path = FBDPath(
        f"{gbd_round_id}/future/population/{forecast_pop_version}")
    forecast_pop_file = forecast_pop_path / "population.nc"
    forecast_pop = open_xr(forecast_pop_file).data
    past_pop_path = FBDPath(
        f"{gbd_round_id}/past/population/{past_pop_version}")
    past_pop_file = past_pop_path / "population.nc"
    past_pop = open_xr(past_pop_file).data
    past_pop = past_pop.sel(sex_id=forecast_pop.sex_id.values)
    past_pop = expand_dimensions(past_pop, draw=range(draws))
    forecast_pop = concat_past_future(past_pop, forecast_pop, draws, years)

    if measure == "live_births":
        forecast_pop = forecast_pop.sel(
            age_group_id=list(FERTILE_AGE_GROUP_IDS),
            sex_id=2).drop(["sex_id"])
    else:
        forecast_pop = forecast_pop.sel(sex_id=[1, 2])

    return forecast_pop
Code example #9
def get_maternal_edu(education, gbd_round_id,
                     past_future, pop_version, location_ids):
    """Recalculate maternal education, which according to the education team is
    the education of women of age-group-IDs 8 to 14 multiplied by their
    age-weights and then summed over age.

    Only the age weights of groups 8 to 14 are kept, and then are rescaled so
    that the sum of those age weights is 1.

    Args:
        education (xarray.DataArray):
            Education data. Needs dimensions `age_group_id` and `sex_id`,
            but probably also has dimensions `location_id`, `draw`, `year_id`
            and maybe `scenario`.
        gbd_round_id (int):
            Numeric ID for the GBD round. Used to get the age-weights for the
            round from the database.
        past_future (str):
            Whether the population version to aggregate over is past or
            future.
        pop_version (str):
            Version of population to use for maternal education aggregation.
        location_ids (list[int]):
            Location IDs to include in the aggregation.
    Returns:
        (tuple[xarray.DataArray, xarray.DataArray]):
            * The first `xarray.DataArray` of the tuple is educational
              attainment for all age-groups and sexes. However, children that
              are too young to have their own education are filled in with
              maternal education.
            * The second `xarray.DataArray` of the tuple is maternal education
              -- only for the maternal age-group, given by `MAT_AGE_GROUP_ID`
              and females, given by `FEMALE_SEX_ID`.
    """

    pop_path = FBDPath("")  # Path removed for security reasons

    pop = open_xr(pop_path / "population.nc").data.sel(
        age_group_id=list(MAT_AGE_GROUPS), sex_id=FEMALE_SEX_ID,
        location_id=list(location_ids)
        )

    LOGGER.debug("Adding up education of moms to get maternal education.")
    mat_slice_edu = education.sel(sex_id=FEMALE_SEX_ID,
                                  age_group_id=list(MAT_AGE_GROUPS),
                                  location_id=list(location_ids))

    agg = Aggregator(pop)
    mat_edu = agg.aggregate_ages(list(MAT_AGE_GROUPS), MAT_AGE_GROUP_ID,
                                 data=mat_slice_edu).rate

    # age_group_id must be dropped. If not, expand_dimensions will broadcast
    # NaNs instead of our data into the new child age_group_id values.
    mat_edu_expanded = expand_dimensions(mat_edu.drop("age_group_id").squeeze(),
                                         sex_id=list(SEXES),
                                         age_group_id=list(CHILD_AGE_GROUPS))

    LOGGER.debug("Adding maternal education for both sexes and child age "
                 "groups to education data array.")
    # Even if ``education`` has data for child age groups, combine first will
    # make sure that the newly calculated maternal education will be used
    # instead.
    return mat_edu_expanded.combine_first(education), mat_edu
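combine_first gives the caller precedence wherever it has non-missing values, which is why the newly computed maternal education overrides any existing child-age values; a minimal demonstration:

import numpy as np
import xarray as xr

new = xr.DataArray([1.0, np.nan], dims=["age_group_id"],
                   coords={"age_group_id": [2, 3]})
old = xr.DataArray([9.0, 9.0], dims=["age_group_id"],
                   coords={"age_group_id": [2, 3]})

merged = new.combine_first(old)
print(merged.values)  # [1. 9.]  -> "new" wins where it is non-NaN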
Code example #10
def main(asfr_version, past_asfr_version, location_id, gbd_round_id, years,
         granularity, iterations, **kwargs):
    """
    1. Read in location-specific draws of period ASFR from CCF stage
    2. Add terminal age group ASFR's
    3. Intercept shift asfr by holding CCF50 constant.
    4. Export location-specific intercept-shifted ASFR in .nc

    Args:
        asfr_version (str): version name of future ccf/asfr.
        past_asfr_version (str): asfr version from past.
        location_id (int): location_id.
        gbd_round_id (int): gbd round id.
        years (YearRange): past_start:forecast_start:forecast_end
        iterations (int): number of times to intercept-shift.
    """
    ages_df = db.get_ages(gbd_round_id)[[
        "age_group_id", "age_group_years_start", "age_group_years_end"
    ]]

    # read the location-specific asfr .csv into a dataarray.
    # the raw forecasted ASFR are stored in the CCF stage of the same version
    ccf_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                           past_or_future="future",
                           stage="ccf",
                           version=asfr_version)
    if granularity == 1:
        sub_folder = "asfr_single_year"
        ccf_asfr_fbd_path = ccf_fbd_path / sub_folder
        future_asfr = read_to_xr(location_id,
                                 ccf_asfr_fbd_path,
                                 dims=list(ASFR_NON_AGE_DIMS + ("age", )))
    else:
        sub_folder = "asfr"
        ccf_asfr_fbd_path = ccf_fbd_path / sub_folder
        future_asfr =\
            read_to_xr(location_id, ccf_asfr_fbd_path,
                       dims=list(ASFR_NON_AGE_DIMS + ("age_group_id",)))
        # we intercept-shift in 1-year ages, so convert to single years
        future_asfr = _expand_5yr_age_groups_to_1yr_ages(future_asfr, ages_df)

    if "sex_id" in future_asfr.dims:
        raise ValueError("Found sex_id dim in future asfr")

    # now etl the past asfr data
    past_asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                                 past_or_future="past",
                                 stage="asfr",
                                 version=past_asfr_version)
    past_asfr =\
        open_xr(past_asfr_fbd_path /
                "asfr.nc").data.sel(location_id=location_id)

    if "sex_id" in past_asfr.dims:
        raise ValueError("Found sex_id dim in past asfr")

    # past has no scenarios, so we need to expand it for merging
    past_asfr = expand_dimensions(past_asfr, scenario=future_asfr["scenario"])

    # past asfr has age group ids 7-15, but future asfr in ccf only has 8-14.
    # we only need age groups 8-14 for intercept shift
    past_asfr_1yrs = _expand_5yr_age_groups_to_1yr_ages(
        past_asfr.sel(age_group_id=range(8, 15)), ages_df)

    # now ready to concat past and future together for intercept shift
    asfr = xr.concat(
        [past_asfr_1yrs.sel(year_id=years.past_years),
         future_asfr.sel(year_id=years.forecast_years)],
        dim="year_id")

    del past_asfr_1yrs, future_asfr
    gc.collect()

    # the intercept-shift should keep ccf50 (asfr sum) constant
    pre_fix_asfr_sum = asfr.sum()  # sum of all asfr values before shift

    asfr = ccf50_intercept_shift_lpf(asfr, gbd_round_id, years, iterations)

    post_fix_asfr_sum = asfr.sum()  # asfr sum post-shift should stay the same

    assert np.isclose(post_fix_asfr_sum, pre_fix_asfr_sum, rtol=RTOL),\
        f"The intercept shift changed total asfr sum by more than rtol={RTOL}"

    # need to save years.past_end for cohort-component model
    save_years = [years.past_end] + years.forecast_years.tolist()
    asfr = asfr.sel(year_id=save_years)  # keep past_end plus forecast years
    # convert forecasted asfr back to 5-year age groups
    asfr = _convert_ages_to_5_year_age_groups_by_mean(asfr, ages_df)
    # add 10-15 (7) and 50-55 (15) age groups for forecasted asfr
    asfr = extrapolate_terminal_asfr_age_groups(past_asfr,
                                                asfr,
                                                last_year=years.past_end)
    asfr["location_id"] = location_id
    asfr.name = "value"

    del past_asfr
    gc.collect()

    LOGGER.info("Finished CCF50 intercept-shift")

    asfr_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                            past_or_future="future",
                            stage="asfr",
                            version=asfr_version)

    save_xr(asfr,
            asfr_fbd_path / f"{location_id}.nc",
            metric="rate",
            space="identity",
            version=asfr_version,
            past_asfr_version=past_asfr_version,
            iterations=iterations)
Code example #11
# ``lex_past``, ``lex_fut_path``, ``pop_past_path``, ``pop_fut_path``, and
# ``NATS`` are assumed to be defined earlier in the original script.
lex_fut = open_xr(lex_fut_path / "lifetable_ds_agg.nc").data["ex"].sel(
    year_id=range(2018, 2101), sex_id=3, age_group_id=2, scenario=0)

lex = lex_past.combine_first(lex_fut).drop(
    ["sex_id", "age_group_id", "scenario"]).squeeze()

lex_mean_ui = lex.rename("value").to_dataset()
compute_summaries(lex_mean_ui)
lex_mean_ui = lex_mean_ui[["mean", "upper",
                           "lower"]].drop("quantile").squeeze()
lex_mean = lex_mean_ui["mean"]

pop_past = open_xr(pop_past_path / "population_agg.nc").data.sel(
    year_id=range(1990, 2018), sex_id=3, age_group_id=22)
pop_past = expand_dimensions(pop_past, quantile=["mean", "lower", "upper"])

pop_fut = open_xr(pop_fut_path / "population_combined.nc").data.sel(
    year_id=range(2018, 2101), sex_id=3, age_group_id=22, scenario=0)

pop = pop_past.combine_first(pop_fut).sel(
    location_id=NATS.location_id.tolist())


def get_rate_of_change(ex, year_start, year_end):
    """Annualized rate of change (ARC) in life expectancy between two years."""
    lex_start = ex.sel(year_id=year_start).drop("year_id").squeeze()
    lex_end = ex.sel(year_id=year_end).drop("year_id").squeeze()
    # change per calendar year; positive when life expectancy is rising
    rate_change = (lex_end - lex_start) / (year_end - year_start)
    return rate_change
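A quick sanity check of the annualized-change arithmetic on a made-up life-expectancy series:

import xarray as xr

ex = xr.DataArray([70.0, 75.0], dims=["year_id"],
                  coords={"year_id": [2020, 2030]})
# (75 - 70) / (2030 - 2020) = 0.5 years of life expectancy gained per year
arc = (ex.sel(year_id=2030) - ex.sel(year_id=2020)) / (2030 - 2020)
assert float(arc) == 0.5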
Code example #12
def read_datasets(asfr_version, gbd_round_id, lifetable_version, pop_version,
                  migration_version, years, srb_version, draws):
    """
    This reads files, orders their axes, and ensures that data arrays
    aren't presented as datasets. This enforces rules about how many
    files get read, how they are found, and how they are assembled into
    the incoming data. It doesn't address what the data means.

    Args:
        asfr_version (str): Version string for ASFR
        gbd_round_id (int): GBD Round as an integer
        lifetable_version (list[str]): Lifetable version
        pop_version (str): Population start version
        migration_version (list[str]): Migration version
        years (YearRange): years for past and forecast
        srb_version (str): sex ratio at birth version
        draws (int): the number of draws to take from the future versions.

    Returns:
        xr.DataArray: ASFR
        tuple: Either one lifetable file or (past, future).
        xr.DataArray: Starting population
        xr.DataArray: Migration
        xr.DataArray: SRB
    """
    # Do this in a subroutine so its memory can be released.
    # pop etl (pop version is in the past)
    data_read_start = perf_time()
    pop_file = FBDPath("/{}/past/population/{}".format(
        gbd_round_id, pop_version)) / "population.nc"
    try:
        LOGGER.info("Reading {}".format(pop_file))
        pop = xr.open_dataarray(str(pop_file))
        # if there's a draw dimension, take the mean
        if "draw" in pop.dims:
            pop = pop.mean("draw")
    except OSError as ose:
        LOGGER.error("Cannot open pop {}: {}".format(pop_file, ose))
        exit()

    # we may or may not have draws for past pops, but we should certainly
    # expect location, age, sex, and year
    assert {"location_id", "year_id", "age_group_id",
            "sex_id"}.issubset(set(pop.dims))
    if len(pop.year_id) > 1:
        pop = pop.loc[{"year_id": years.past_end}]
    else:
        pop = pop.squeeze(dim="year_id")
        assert pop.year_id == years.past_end
    LOGGER.debug("pop {}".format(pop))

    # we like age_group_id to be the last dim to expedite later computation.
    if "draw" in pop.dims:  # if past pop has draws, resample.
        pop = pop.transpose("draw", "location_id", "sex_id", "age_group_id")
        pop = resample(pop, draws)
    else:
        pop = pop.transpose("location_id", "sex_id", "age_group_id")

    if pop.name is None:
        pop.name = "population"

    # asfr etl (draws expected)

    asfr_gbd_round_id = gbd_round_id if gbd_round_id >= 5 else 5
    asfr_file = FBDPath("/{}/future/asfr/{}".format(asfr_gbd_round_id,
                                                    asfr_version)) / "asfr.nc"
    try:
        LOGGER.info("Reading {}".format(asfr_file))
        # ASFR is reported per thousand people.
        asfr = xr.open_dataarray(str(asfr_file))
    except OSError as ose:
        LOGGER.error("Cannot open asfr {}: {}".format(asfr_file, ose))
        exit(2)

    assert set(asfr.dims) == {
        "draw", "year_id", "location_id", "scenario", "age_group_id"
    }, "asfr dims {}".format(asfr.dims)
    asfr_lim = asfr.sel(year_id=slice(years.past_end, years.forecast_end + 1))
    if asfr_lim.name is None:
        asfr_lim.name = "asfr"

    asfr_lim = resample(asfr_lim, draws)

    # lifetable etl (draws expected)
    lifetables = list()
    for lfilename in lifetable_version:
        lifetables.append(read_lifetable(gbd_round_id, lfilename, draws))
    if len(lifetables) > 1:
        lpast, lfuture = (None, None)
        lyears = [llx.year_id.values for llx in lifetables]
        if lyears[0][-1] > lyears[1][-1]:
            lfuture, lpast = lifetables
        elif lyears[1][-1] > lyears[0][-1]:
            lpast, lfuture = lifetables
        elif lyears[0][0] < lyears[1][0]:
            lpast, lfuture = lifetables
        elif lyears[1][0] < lyears[0][0]:
            lfuture, lpast = lifetables
        else:
            LOGGER.error("Cannot figure out which is the future lifetable")
            exit()

        if years.past_end in lfuture.year_id.values:
            LOGGER.info("All needed years were in the future lifetable. "
                        "Ignoring the past data.")
            lifetable_lim = lfuture.sel(
                year_id=slice(years.past_end, years.forecast_end + 1))
            lifetable_out = (lifetable_lim, )
        else:
            assert years.past_end in lpast.year_id.values
            past_slice = lpast.loc[{"year_id": [years.past_end]}]
            LOGGER.debug("Life past slice {}".format(
                past_slice.year_id.values))
            LOGGER.debug("Life future slice {}".format(lfuture.year_id.values))
            lifetable_out = (past_slice, lfuture)
    else:
        lifetable_lim = lifetables[0].sel(
            year_id=slice(years.past_end, years.forecast_end + 1))

        lifetable_out = (lifetable_lim, )

    # migration etl (no draws expected)
    try:
        migration_file = FBDPath("/{}/future/migration/{}".format(
            gbd_round_id, migration_version[0])) / "migration.nc"
    except Exception:
        if os.path.exists(migration_version[0]):
            migration_file = migration_version[0]
        else:
            raise Exception("Cannot construct {}".format(migration_file))

    try:
        LOGGER.info("Reading {}".format(migration_file))
        migration = xr.open_dataarray(str(migration_file))
    except OSError as ose:
        LOGGER.error("Cannot open migration {}: {}".format(
            migration_file, ose))
        exit()
    assert set(("location_id", "age_group_id", "sex_id", "year_id")).\
           issubset(migration.dims)

    # Currently we don't use or make migration scenarios -- if a scenario dim
    # exists for some reason ensure that only reference is used and that the
    # scenario dim is dropped.
    if "scenario" in migration.dims:  # scenario dim
        migration = migration.sel(scenario=0, drop=True)
    elif "scenario" in migration.coords:  # scenario point coord
        migration = migration.drop("scenario")
    else:
        pass  # no scenario dim or point coord

    # if pop has draws, we want migration to have draws as well.
    # this becomes important in _fill_missing_locations().
    if "draw" in pop.dims:
        if "draw" not in migration.dims:
            migration = expand_dimensions(migration, draw=pop["draw"])
        else:
            migration = resample(migration, draws)
        migration = migration.transpose("draw", "location_id", "year_id",
                                        "sex_id", "age_group_id")
    else:  # pop has no "draw", so migration doesn't need it either
        if "draw" in migration.dims:
            migration = migration.mean("draw")
        migration = migration.transpose("location_id", "year_id", "sex_id",
                                        "age_group_id")

    if migration.name is None:
        migration.name = "migration"
    # Use the last past year's all age population proportions to compute
    # regional migration averages to fill in missing data.
    migration_locs_fixed = _clean_migration_locations(migration,
                                                      pop.sum("age_group_id"),
                                                      gbd_round_id)

    LOGGER.info("Read data Elapsed {}".format(perf_time() - data_read_start))

    # Migration counts drive small nations to zero population.
    # This is a way to ensure we show the trend of health.
    migration_locs_fixed.loc[dict(
        location_id=list(SMALL_NATIONS_ZERO_MIGRATION.values()))] = 0.

    LOGGER.debug("Pop from read years {}".format(pop.year_id.values))

    # Not FBDPath at the moment since it doesn't recognize covariate as a
    # valid stage. May need to change location of files.
    # srb etl (no draws)
    srb_path = FBDPath("/{}/past/sex_ratio_at_birth/{}".format(
        gbd_round_id, srb_version))
    srb_file = srb_path / "sex_ratio_at_birth.nc"

    try:
        LOGGER.info("Reading {}".format(srb_file))
        srb = xr.open_dataarray(str(srb_file))
    except OSError as ose:
        LOGGER.error("Cannot open srb {}: {}".format(srb_file, ose))
        exit()

    # Subset to last year of past
    srb = srb.sel(year_id=years.past_end)

    return asfr_lim, lifetable_out, pop, migration_locs_fixed, srb
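The scenario handling above distinguishes a true scenario dimension from a leftover scalar coordinate; both cases in miniature:

import numpy as np
import xarray as xr

da = xr.DataArray(np.zeros((2, 3)), dims=["scenario", "year_id"],
                  coords={"scenario": [0, 1], "year_id": [2020, 2021, 2022]})

# Dimension case: select reference and drop the dim entirely.
ref = da.sel(scenario=0, drop=True)
assert "scenario" not in ref.dims and "scenario" not in ref.coords

# Point-coordinate case: scenario survives as a scalar label, so drop it.
point = da.sel(scenario=0)  # keeps scenario as a scalar coord
assert "scenario" in point.coords
dropped = point.drop_vars("scenario")
assert "scenario" not in dropped.coords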
Code example #13
def agreement_rules(asfr, lifetable, pop, migration, srb, years):
    """
    This is where we put all rules for how data from different sources
    must agree with each other in terms of the domains they live on.

    Args:
        asfr (xr.DataArray): Age-specific fertility rate
        lifetable (tuple): The lifetable as a tuple of datasets, past and future
        pop (xr.DataArray): Population for starting year
        migration (xr.DataArray): Migration values
        srb (xr.DataArray): Sex ratio at birth
        years (YearRange): years for past and forecast.

    Returns:
        xr.DataArray: ASFR
        xr.Dataset: Lifetable
        xr.DataArray: First year of population
        xr.DataArray: Migration
        xr.DataArray: Sex ratio at birth
    """
    # There could be a separate past and future lifetable, so we take part
    # in order to determine the subsets. Take the second one, the future.
    part_of_lifetable = lifetable[-1]

    desired_locations = {
        int(dl)
        for dl in fbd_core.db.get_locations_by_level(3).location_id
    }
    la, ll, lp, lm = [
        set(l.location_id.values)
        for l in [asfr, part_of_lifetable, pop, migration]
    ]
    LOGGER.info(
        "location id count for asfr {} life {} pop {} migration {}".format(
            len(la), len(ll), len(lp), len(lm)))
    loc_all = la & ll & lp  # Missing migration will be set to 0.
    if desired_locations - la:
        LOGGER.warning("asfr locations missing {}".format(desired_locations -
                                                          la))
    if desired_locations - ll:
        LOGGER.warning(
            "lifetable locations missing {}".format(desired_locations - ll))
    if desired_locations - lp:
        LOGGER.warning(
            "population locations missing {}".format(desired_locations - lp))
    if desired_locations - lm:
        LOGGER.warning(
            "migration locations missing {}".format(desired_locations - lm))
    subset = dict(location_id=np.array(list(sorted(loc_all)), dtype=int))

    subset["sex_id"] = [1, 2]

    assert not (set(asfr.age_group_id.values) -
                set(part_of_lifetable.age_group_id.values))

    ages = part_of_lifetable.age_group_id.values[:]
    al = set(part_of_lifetable.age_group_id.values)
    ap = set(pop.age_group_id.values)
    if al ^ ap:
        LOGGER.info("lifetable ages {} pop ages {}".format(al - ap, ap - al))
        ages = [a for a in ages if a in ap]

    assert consistent_age_group_ids(ages),\
        "Ages don't match for ids {}".format(ages)
    subset["age_group_id"] = ages

    pop_sub = pop.loc[subset]
    subset_lives = list()

    if len(lifetable) > 1:  # should be just 2 elements
        min_draw_count = min([ds["draw"].size for ds in lifetable])
        # make sure the subset draw labels are the same
        assert lifetable[0]["draw"][0:min_draw_count].\
            identical(lifetable[1]["draw"][0:min_draw_count])
        subset["draw"] = lifetable[0]["draw"][0:min_draw_count]

    for incoming_life in lifetable:
        LOGGER.debug("life subsets {}".format(incoming_life))
        subset_life = incoming_life.loc[subset]
        if "scenario" not in subset_life.dims:
            d3 = xr.concat([subset_life] * 3, dim="scenario")
            d3.coords["scenario"] = [-1, 0, 1]
            subset_life = d3
        subset_lives.append(subset_life)

    if len(subset_lives) > 1:
        lifetable_lim = xr.concat(subset_lives, dim="year_id")
    else:
        lifetable_lim = subset_lives[0]

    life_sub = lifetable_lim.transpose("location_id", "scenario", "draw",
                                       "year_id", "sex_id", "age_group_id")

    assert consistent_age_group_ids(life_sub.age_group_id.values)
    assert consistent_age_group_ids(pop_sub.age_group_id.values)

    # Migration will be missing locations, most likely, and may have extras.
    # Migration will have strict five-year age groups.
    # This version creates a dataset of zeros. We should give missing countries
    # a value that is the average over the region.
    migration_years = migration.year_id.values
    migration_years = migration_years[migration_years >= years.past_end]

    if np.in1d(subset["age_group_id"], migration.age_group_id.values).all():
        migration_ages = subset["age_group_id"]
        LOGGER.info("migration using GBD age groups")
    else:
        migration_ages = migration.age_group_id.values
        LOGGER.info("migration using the age groups it has")

    migration_sub = xr.DataArray(
        data=np.zeros((len(subset["location_id"]), len(migration_years), 2,
                       len(migration_ages)),
                      dtype=np.double),
        coords=dict(location_id=subset["location_id"],
                    year_id=migration_years,
                    sex_id=[1, 2],
                    age_group_id=migration_ages),
        dims=["location_id", "year_id", "sex_id", "age_group_id"])

    common_locations = [
        l for l in subset["location_id"] if l in migration.location_id.values
    ]
    copy_idx = dict(location_id=common_locations,
                    year_id=migration_years,
                    sex_id=[1, 2],
                    age_group_id=migration_ages)
    LOGGER.debug("migration {} migration_sub {}".format(
        migration.age_group_id.values, migration_sub.age_group_id.values))

    if "draw" in migration.dims:
        migration_sub = expand_dimensions(migration_sub,
                                          draw=migration["draw"])
        migration_sub = migration_sub.transpose(*list(migration.dims))
        copy_idx["draw"] = migration["draw"].values.tolist()

    for index_obj in copy_idx.values():
        assert len(set(index_obj)) == len(index_obj)
    for name, coord in migration.coords.items():
        assert len(set(coord.values)) == len(coord.values), name

    LOGGER.debug("migration {}".format(migration))

    migration_sub.loc[copy_idx] = migration.loc[copy_idx]

    if 1 in migration_sub.age_group_id.values:
        LOGGER.info("Migration WPP Straight")
        from_under_five = copy_idx.copy()
        from_under_five["age_group_id"] = [1]
        five_year = 1 / (5 * 365)
        for aid, frac in [(2, 7 * five_year), (3, 21 * five_year),
                          (4, 337 * five_year), (5, 4 * 365 * five_year)]:
            copy_idx["age_group_id"] = [aid]
            migration_sub.loc[copy_idx] = frac * migration.loc[from_under_five]
        migration_sized = migration_sub
    else:
        LOGGER.info("Migration WPP Smoothed")
        # It's the one-year migration.

        # NOTE these age groups were particular of the migration file provided
        migration_age_ids = migration_sub['age_group_id'].values
        end_ages_0 = [143, 144, 145, 146, 273]  # 95, 96, 97, 98, 99+
        end_ages_1 = [235]  # 95+
        if set(end_ages_0).issubset(migration_age_ids):
            assert not set(end_ages_1) & set(migration_age_ids)
            end_ages = end_ages_0
            early_ages = [
                x for x in migration_sub.age_group_id.values
                if x not in end_ages
            ]
            # sum over these granular age groups to form one terminal age group
            end_years = migration_sub.loc[{
                "age_group_id": end_ages
            }].sum(dim="age_group_id")
            end_years.coords["age_group_id"] = 235
            end_years = end_years.expand_dims(
                "age_group_id", axis=len(end_years.dims))
            lop_end = migration_sub.loc[{"age_group_id": early_ages}]
            migration_sized = xr.concat([lop_end, end_years],
                                        dim="age_group_id")
        # if 235 is the only age group id beyond 142, no change is needed
        elif set(end_ages_1).issubset(migration_age_ids):
            assert not set(end_ages_0) & set(migration_age_ids)
            migration_sized = migration_sub
        else:
            raise Exception("end_ages do not exists in migration age_group_id")

    ordered = consistent_age_group_ids(migration_sized.age_group_id.values)
    if not ordered:
        raise RuntimeError("Age group ids not ordered")

    subset.pop("sex_id")
    subset.pop("age_group_id")
    asfr_sub = asfr.loc[subset]
    # We assert order because the internal methods will use numpy arrays.
    # Locations are first because we will parallelize over locations.
    # The year and age group are last because we work by draw, so they
    # are needed for each forecast.
    asfr_sub = asfr_sub.transpose("location_id", "scenario", "draw", "year_id",
                                  "age_group_id")

    for idx, year_arr in enumerate([asfr_sub, life_sub, migration_sized]):
        assert year_arr.year_id.values[0] == years.past_end, (
            "Start is {} years for {} are {}".format(years.past_end, idx,
                                                     year_arr.year_id.values))

    LOGGER.debug("Pop from agree years {}".format(pop_sub.year_id.values))
    LOGGER.debug("Life from agree years {}".format(life_sub.year_id.values))
    check_out_of_bounds(asfr_sub, "asfr")
    check_out_of_bounds(life_sub.lx, "life")
    check_out_of_bounds(pop_sub, "pop_in")
    check_out_of_bounds(migration_sub, "migration")

    return asfr_sub, life_sub, pop_sub, migration_sized, srb
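The under-5 redistribution splits age group 1 across GBD age groups 2-5 in proportion to their lengths in days; the fractions used above sum to exactly 1:

five_year = 1 / (5 * 365)
fractions = {2: 7 * five_year,        # early neonatal: 7 days
             3: 21 * five_year,       # late neonatal: 21 days
             4: 337 * five_year,      # post-neonatal: 337 days
             5: 4 * 365 * five_year}  # ages 1-4: four years
assert abs(sum(fractions.values()) - 1.0) < 1e-12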
Code example #14
def print_pop_stats(pop_base_version, pop_sdg_version, pop_99_version,
                    pop_past_version, gbd_round_id, measure):
    """Print headline population statistics across scenarios."""

    pop_base_draw_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_base_version}")
    pop_sdg_draw_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_sdg_version}")
    pop_99_draw_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_99_version}")
    pop_base_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_base_version}_combined")
    pop_sdg_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_sdg_version}_combined")
    pop_99_path = FBDPath(
        f"{gbd_round_id}/future/{measure}/{pop_99_version}_combined")
    pop_past_path = FBDPath(
        f"{gbd_round_id}/past/{measure}/{pop_past_version}")

    pop_past = open_xr(f"{pop_past_path}/{measure}_agg.nc").data
    pop_draw_base = open_xr(f"{pop_base_draw_path}/{measure}_agg.nc").data
    pop_draw_sdg = open_xr(f"{pop_sdg_draw_path}/{measure}_agg.nc").data
    pop_draw_99 = open_xr(f"{pop_99_draw_path}/{measure}_agg.nc").data
    pop_base = open_xr(f"{pop_base_path}/{measure}_combined.nc").data
    pop_sdg = open_xr(f"{pop_sdg_path}/{measure}_combined.nc").data
    pop_99 = open_xr(f"{pop_99_path}/{measure}_combined.nc").data

    pop_slower_2100 = pop_base.sel(
        scenario=-1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_sdg_2100 = pop_sdg.sel(
        scenario=-1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_faster_2100 = pop_base.sel(
        scenario=1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_fastest_2100 = pop_99.sel(
        scenario=1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_ref_2100 = pop_base.sel(
        scenario=0, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_ref = pop_base.sel(
        scenario=0, sex_id=3, location_id=1, age_group_id=22) / 1e9

    pop_sdg_draw_2100 = pop_draw_sdg.sel(
        scenario=-1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_slower_draw_2100 = pop_draw_base.sel(
        scenario=-1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9
    pop_fastest_draw_2100 = pop_draw_99.sel(
        scenario=1, sex_id=3, location_id=1, age_group_id=22,
        year_id=2100) / 1e9

    ref_peak_year = find_peak_year(pop_ref)
    ref_peak_da = pop_ref.sel(year_id=ref_peak_year)

    ref_peak_mean, ref_peak_lower, ref_peak_upper = \
        return_mean_and_quantiles(ref_peak_da)

    ref_2100_mean, ref_2100_lower, ref_2100_upper = \
        return_mean_and_quantiles(pop_ref_2100)

    slower_2100_mean, slower_2100_lower, slower_2100_upper = \
        return_mean_and_quantiles(pop_slower_2100)

    sdg_2100_mean, sdg_2100_lower, sdg_2100_upper = \
        return_mean_and_quantiles(pop_sdg_2100)

    faster_2100_mean, faster_2100_lower, faster_2100_upper = \
        return_mean_and_quantiles(pop_faster_2100)

    fastest_2100_mean, fastest_2100_lower, fastest_2100_upper = \
        return_mean_and_quantiles(pop_fastest_2100)

    if sdg_2100_mean < fastest_2100_mean:
        diff_mean, diff_lower, diff_upper = calculate_diff(
            pop_slower_draw_2100, pop_sdg_draw_2100)
    else:
        diff_mean, diff_lower, diff_upper = calculate_diff(
            pop_slower_draw_2100, pop_fastest_draw_2100)

    median_age_2017 = get_median_age(pop_past, gbd_round_id, 2017)
    median_age_2100_mean, median_age_2100_lower, median_age_2100_upper = \
        get_median_age(pop_draw_base, gbd_round_id, 2100)

    pop_under5_2017 = pop_past.sel(
        sex_id=3, location_id=1, age_group_id=1, year_id=2017) / 1e6
    pop_under5_2100 = pop_base.sel(
        scenario=0, sex_id=3, location_id=1, age_group_id=1,
        year_id=2100) / 1e6
    pop_over80_2017 = pop_past.sel(
        sex_id=3, location_id=1, age_group_id=21, year_id=2017) / 1e6
    pop_over80_2100 = pop_base.sel(
        scenario=0, sex_id=3, location_id=1, age_group_id=21,
        year_id=2100) / 1e6

    age_under5_2017 = pop_under5_2017.values.round(2)
    age_under5_2100_mean, age_under5_2100_lower, age_under5_2100_upper = \
        return_mean_and_quantiles(pop_under5_2100)
    age_over80_2017 = pop_over80_2017.values.round(2)
    age_over80_2100_mean, age_over80_2100_lower, age_over80_2100_upper = \
        return_mean_and_quantiles(pop_over80_2100)

    pop_under5_2017_draws = expand_dimensions(pop_under5_2017,
                                              draw=range(1000))
    pop_base_draw_under5_2100 = pop_draw_base.sel(
        scenario=0, sex_id=3, location_id=1, age_group_id=1,
        year_id=2100).squeeze().drop("scenario") / 1e6
    percent_drop = (pop_under5_2017_draws -
                    pop_base_draw_under5_2100) / pop_under5_2017_draws
    drop_percent_mean = (percent_drop.mean("draw") * 100).values.round(2)
    drop_percent_lower = (percent_drop.quantile(0.025, dim="draw") *
                          100).values.round(2)
    drop_percent_upper = (percent_drop.quantile(0.975, dim="draw") *
                          100).values.round(2)

    print(
        f"Combining the scenarios for mortality, fertility, and migration, "
        f"we expect global population in the reference scenario to peak at "
        f"{ref_peak_mean} (95% UI {ref_peak_lower}-{ref_peak_upper}) billion "
        f"in the year {ref_peak_year} and then decline to {ref_2100_mean} "
        f"({ref_2100_lower}-{ref_2100_upper}) billion in 2100. "
        f"Across alternative scenarios, the range in 2100 is from "
        f"{slower_2100_mean} billion "
        f"({slower_2100_lower}–{slower_2100_upper}) in the slower met need "
        f"and education scenario to {sdg_2100_mean} billion "
        f"({sdg_2100_lower}-{sdg_2100_upper}) in the SDG-pace scenario for "
        f"education and contraceptive met need (figure 6)."
        f"The faster and fastest scenarios give 2100 global populations of "
        f"{faster_2100_mean} ({faster_2100_lower}-{faster_2100_upper}) and "
        f"{fastest_2100_mean} ({fastest_2100_lower}-{fastest_2100_upper}) "
        f"billion, respectively. Peak population in the SDG scenario is in "
        f"2046, while the global population continues to grow through the "
        f"century in the slower scenario. The huge differences in TFR in "
        f"2100 across the scenarios translates into differences of {diff_mean} "
        f"({diff_lower}-{diff_upper}) billion people in 2100. "
        f"Median age will increase in the reference scenario from "
        f"{median_age_2017} in 2017 to {median_age_2100_mean} "
        f"({median_age_2100_lower}-{median_age_2100_upper}) in 2100 "
        f"The number of children under age 5 will decline from {age_under5_2017} "
        f"million in 2017 to only {age_under5_2100_mean} "
        f"({age_under5_2100_lower}-{age_under5_2100_upper}) in 2100, a drop "
        f"of {drop_percent_mean}% "
        f"({drop_percent_lower}-{drop_percent_upper}) million. At the same "
        f"time, the number of individuals aged over 80 will will increase "
        f"from {age_over80_2017} million in 2017 to {age_over80_2100_mean} "
        f"({age_over80_2100_lower}-{age_over80_2100_upper}) in 2100.")
Code example #15
def arc_forecast_education(past,
                           gbd_round_id,
                           transform,
                           weight_exp,
                           years,
                           reference_scenario,
                           diff_over_mean,
                           truncate,
                           truncate_quantiles,
                           replace_with_mean,
                           extra_dim=None):
    """Forecasts education using the ARC method.

    Args:
        past (xarray.DataArray):
            Past data with dimensions ``location_id``, ``sex_id``,
            ``age_group_id``, ``year_id``, and ``draw``.
        transform (str):
            Space to transform education into for forecasting.
        weight_exp (float):
            How much to weight years based on recency
        years (YearRange):
            Forecasting timeseries.
        reference_scenario (str):
            If 'median', then the reference scenario is made using the
            weighted median of past annualized rate-of-change across all
            past years; if 'mean', then it is made using the weighted mean
            of past annualized rate-of-change across all past years.
        diff_over_mean (bool):
            If True, then take annual differences for means-of-draws, instead
            of draws.
        truncate (bool):
            If True, then truncates the dataarray over the given dimensions.
        truncate_quantiles (object, optional):
            The tuple of two floats representing the quantiles to take.
        replace_with_mean (bool, optional):
            If True and `truncate` is True, then replace values outside of the
            upper and lower quantiles taken across "location_id" and "year_id"
            with the mean across "year_id"; if False, then replace with the
            upper and lower bounds themselves.
        gbd_round_id (int):
            The GBD round of the input data.
    Returns:
        (xarray.DataArray):
            Education forecasts
    """
    LOGGER.debug("diff_over_mean:{}".format(diff_over_mean))
    LOGGER.debug("truncate:{}".format(truncate))
    LOGGER.debug("truncate_quantiles:{}".format(truncate_quantiles))
    LOGGER.debug("replace_with_mean:{}".format(replace_with_mean))
    LOGGER.debug("reference_scenario:{}".format(reference_scenario))

    most_detailed_coords = _get_avail_most_detailed_coords(past, gbd_round_id)
    most_detailed_past = past.sel(**most_detailed_coords)

    zeros_dropped = most_detailed_past.where(most_detailed_past > 0)
    for dim in zeros_dropped.dims:
        zeros_dropped = zeros_dropped.dropna(dim=dim, how="all")

    LOGGER.debug("Transforming the past to {}-space".format(transform))
    transformed_past = TRANSFORMATIONS[transform](zeros_dropped)

    LOGGER.debug("Forecasting education in the transformed space")
    transformed_forecast = scenarios.arc_method(
        transformed_past,
        gbd_round_id=gbd_round_id,
        years=years,
        reference_scenario=reference_scenario,
        weight_exp=weight_exp,
        diff_over_mean=diff_over_mean,
        truncate=truncate,
        truncate_quantiles=truncate_quantiles,
        replace_with_mean=replace_with_mean,
        reverse_scenarios=True,
        extra_dim=extra_dim,
        scenario_roc="national")

    LOGGER.debug("Converting the forecasts to normal/identity space")
    forecast = INVERSE_TRANSFORMATIONS[transform](transformed_forecast)

    refilled_forecast = etl.expand_dimensions(forecast, **most_detailed_coords)
    lagged_scenarios = lag_scenarios(refilled_forecast, years)

    # Since past does get clipped to avoid infs and negative infs, we need to
    # append the actual past onto the data being saved (modelers currently
    # expect the past to be there)
    past_broadcast_scenarios = etl.expand_dimensions(
        most_detailed_past, scenario=lagged_scenarios["scenario"])
    all_data = past_broadcast_scenarios.combine_first(lagged_scenarios)

    bound_err_msg = "the forecasts have NaNs"
    if np.isnan(all_data).any():
        LOGGER.error(bound_err_msg)
        raise RuntimeError(bound_err_msg)

    return all_data
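A minimal sketch of the transform-forecast-invert flow, assuming hypothetical logit/expit transforms (the actual TRANSFORMATIONS mapping and scenarios.arc_method are not shown in this listing):

import numpy as np
import xarray as xr

def logit(p):
    return np.log(p / (1 - p))

def expit(x):
    return 1 / (1 + np.exp(-x))

past = xr.DataArray([0.60, 0.62, 0.64], dims=["year_id"],
                    coords={"year_id": [2015, 2016, 2017]})

# Mean annualized change in logit space, extended one year ahead.
transformed = logit(past)
arc = transformed.diff("year_id").mean("year_id")
next_year = expit(transformed.sel(year_id=2017) + arc)
print(float(next_year))  # slightly above 0.64, and bounded in (0, 1)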