Python resample Examples, fbd_core.etl.resample Python Examples

Example #1

0

Show file

File: driver.py Project: ihmeuw/fhs_2019_population_paper

def read_lifetable(gbd_round_id, lifetable_version, draws):
    """
    Read one life table dataset and, if it has draws, resample it.

    Args:
        gbd_round_id (int): GBD round ID used to build the file path.
        lifetable_version (str): Of the form "past/versionname"
            or "future/versionname".  A bare "versionname" (no slash) is
            read from "future".
        draws (int): desired number of draws

    Returns:
        The life table, after ``_drop_point_coordinates`` has been applied.

    Raises:
        SystemExit: with exit status 1 if the file cannot be opened.
    """
    if "/" in lifetable_version:
        # Split on the first "/" only, so that a version name which itself
        # contains a "/" does not break the two-way unpacking.
        past_or_future, version = lifetable_version.split("/", 1)
    else:
        past_or_future = "future"
        version = lifetable_version

    # lifetable from the future includes last year of the past.
    lifetable_file = FBDPath("/{}/{}/life_expectancy/{}".format(
        gbd_round_id, past_or_future, version)) / "lifetable_ds.nc"
    LOGGER.info("Reading {}".format(lifetable_file))
    try:
        lifetable = xr.open_dataset(str(lifetable_file))
    except OSError as ose:
        LOGGER.error("Cannot open lifetable {}: {}".format(
            lifetable_file, ose))
        # A bare exit() would end the process with status 0 (success) and
        # relies on the ``site`` module having installed the builtin.
        raise SystemExit(1) from ose

    if "draw" in lifetable.dims:
        # Sort by draw before resampling.
        lifetable = resample(lifetable.sortby("draw"), draws)

    return _drop_point_coordinates(lifetable)

Example #2

0

Show file

def load_forecast_pop(gbd_round_id, version, years, draws):
    """
    Load forecast population data. Aggregates if necessary.

    First tries to read ``population_agg.nc`` from the forecast population
    directory.  If that fails for any ordinary error, ``population.nc`` is
    read instead, aggregated, and the result is saved as
    ``population_agg.nc`` in the same directory.

    Args:
        gbd_round_id (int):
            The gbd round ID used to build the population path
        version (str):
            The version of forecast population to read from
        years (YearRange):
            The Forecasting format years to use.
        draws (int):
            Number of draws the result is resampled to.

    Returns:
        xarray.DataArray: The forecast population, restricted to
            ``years.forecast_years`` and resampled to ``draws``.
    """
    forecast_pop_dir = FBDPath(f"/{gbd_round_id}/future/population/{version}")
    forecast_pop_agg_path = forecast_pop_dir / "population_agg.nc"
    try:
        forecast_pop_da = open_xr(forecast_pop_agg_path).data
    except Exception:  # Need to make agg version
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed and turned into
        # an expensive re-aggregation.
        forecast_pop_path = forecast_pop_dir / "population.nc"
        forecast_pop_da = open_xr(forecast_pop_path).data
        forecast_pop_da = Aggregator.aggregate_everything(
            forecast_pop_da, gbd_round_id).pop
        save_xr(forecast_pop_da,
                forecast_pop_agg_path,
                metric="number",
                space="identity")

    # slice to correct years and number of draws
    forecast_pop_da = forecast_pop_da.sel(year_id=years.forecast_years)
    forecast_pop_da = resample(forecast_pop_da, draws)

    return forecast_pop_da

Example #3

0

Show file

File: compute_births_deaths.py Project: ihmeuw/fhs_2019_population_paper

def concat_past_future(past_da, forecast_da, draws, years):
    """Join the last past year onto the forecast years along ``year_id``.

    The past data is cut down to ``years.past_end`` and to the age groups
    and locations present in the forecast.  The forecast is cut down to
    ``years.forecast_years`` and scenario 0, with the ``scenario``
    coordinate dropped.  Both pieces are resampled to ``draws`` before
    being concatenated.

    Args:
        past_da (xarray.DataArray):
            Past data.
        forecast_da (xarray.DataArray):
            Forecast data.
        draws (int):
            Number of draws.
        years (YearRange):
            Supplies ``past_end`` and ``forecast_years``.
    Returns: (xarray.DataArray):
        Combined past and future data.
    """
    forecast_ages = forecast_da.age_group_id.values
    forecast_locations = forecast_da.location_id.values

    last_past_year = past_da.sel(year_id=years.past_end,
                                 age_group_id=forecast_ages,
                                 location_id=forecast_locations)

    reference = forecast_da.sel(year_id=years.forecast_years, scenario=0)
    reference = reference.drop("scenario")

    # Forecast is resampled first, then past (same order as before).
    reference = resample(reference, draws)
    last_past_year = resample(last_past_year, draws)

    return xr.concat([last_past_year, reference], dim="year_id")

Example #4

0

Show file

def read_sev(rei, sev, vaccine_sev, gbd_round_id, years, draws):
    """
    Reads in the SEV for a risk, or the anti-risk SEV for a vaccine.

    For a vaccine rei the file ``<rei>_new_ref.nc`` is read from the
    "vaccine" stage and the returned values are ``1.0 - data``.  For any
    other rei the file ``<rei>.nc`` is read from the "sev" stage as is.

    Args:
        rei (str): risk, could also be vaccine intervention.
        sev (str): upstream sev version
        vaccine_sev (str): upstream vaccine sev version.
        gbd_round_id (int): gbd round id.
        years (YearRange): [past_start, forecast_start, forecast_end] years.
        draws (int): number of draws for output file.  This means input files
            will be up/down-sampled to meet this criterion.

    Returns:
        (xr.DataArray): SEV in dataarray form.
    """
    is_vaccine = rei in get_vaccine_reis(gbd_round_id)
    if is_vaccine:
        stage, version, suffix = "vaccine", vaccine_sev, "_new_ref.nc"
    else:
        stage, version, suffix = "sev", sev, ".nc"

    infile_fbd_path = FBDPath(gbd_round_id=gbd_round_id,
                              past_or_future="future",
                              stage=stage,
                              version=version) / (rei + suffix)
    out = open_xr(infile_fbd_path).data
    if is_vaccine:
        out = 1.0 - out  # vaccine treated as anti-risk

    out = conditionally_triggered_transformations(out, gbd_round_id, years)

    # Only resample when the draw count does not already match.
    if len(out["draw"]) == draws:
        return out
    return resample(out, draws)

Example #5

0

Show file

def get_gbd_paf(acause, rei, cache_version, gbd_round_id, sex_ids,
                location_ids, draws, measure_id=4, metric_id=2):
    """
    Downloads and transforms gbd cause-risk-specific PAF.  The dataarray
    is then cleaned and saved in a FBDPath.

    If the cache file already exists it is read instead of downloading.

    The gbd paf coming from get_draws::
        >>> df.columns
        Index([u'rei_id', u'modelable_entity_id', u'location_id', u'year_id',
               u'age_group_id', u'sex_id', u'cause_id', u'measure_id',
               u'draw_0', u'draw_1', ... u'draw_991', u'draw_992', u'draw_993',
               u'draw_994', u'draw_995', u'draw_996', u'draw_997', u'draw_998',
               u'draw_999', u'metric_id'], dtype='object', length=1009)

    where we will need to
    1.) use cause_id to slice for the cause-risk pair
    2.) use measure_id (typically 4 for yll) to slice for measure_id
    3.) use metric_id (typically 2 for percent) to slice for metric_id

    Args:
        acause (str): analytical cause.
        rei (str): risk, could also be vaccine intervention.
        cache_version (str): the FBDPath paf version to save the gbd paf in,
            or to read from.
        gbd_round_id (int): gbd round id
        sex_ids (list): sexes.  Typically [1, 2].
            NOTE(review): not referenced anywhere in this function.
        location_ids (list): locations to get pafs from.
            NOTE(review): only applied when reading from the cache; a
            freshly downloaded array is returned without this selection.
            Confirm whether that asymmetry is intended.
        draws (int): number of draws for output file.  This means input files
            will be up/down-sampled to meet this criterion.
        measure_id (int, optional): typically the yll measure id (4).  At the
            most detailed PAF yll is equivalent to death, so measure_id 4 works
            the same as measure_id 1 (death).  Empirically, it seems to pull
            data faster if calling with measure_id=4.
        metric_id (int, optional): typically the percent metric (2)

    Returns:
        (xr.DataArray/None): Dataarray with complete demographic indices,
            sans "scenario".  None if ``rei`` is a vaccine rei.

    Raises:
        IOError: if ``get_draws`` fails; the original exception is chained
            as the cause.
    """
    if rei in get_vaccine_reis(gbd_round_id):
        # get_draws won't have anything for vaccines
        return None

    cache_file_fbdpath =\
        FBDPath(gbd_round_id=gbd_round_id,
                past_or_future="past",
                stage="paf",
                version=cache_version) / (acause + "_" + rei + ".nc")

    if cache_file_fbdpath.exists():

        LOGGER.info("{} already exists.  Will read from it for gbd paf.".
                    format(cache_file_fbdpath))

        paf_da = open_xr(cache_file_fbdpath).data

        paf_da = paf_da.sel(location_id=location_ids)

    else:  # no cache exists, must download & clean
        rei_id = get_rei_id(rei)

        if acause in CAUSES_NOT_IN_GBD_MAP:  # edge case for diarrhea_*
            cause_id = get_cause_id(CAUSES_NOT_IN_GBD_MAP[acause])
        else:
            cause_id = get_cause_id(acause)

        gbd_round = get_gbd_round(gbd_round_id)

        try:
            # we only need it for year_id=gbd_round, but for every other dim
            # we collect everything.
            paf_df = get_draws(gbd_id_type=['cause_id', 'rei_id'],
                               gbd_id=[cause_id, rei_id],
                               source='burdenator',
                               year_id=gbd_round,
                               gbd_round_id=gbd_round_id,
                               measure_id=measure_id,
                               metric_id=metric_id)
        except Exception as exc:
            error_message = "Error in get_draws for {}_{}".format(acause, rei)
            LOGGER.error(error_message)
            # Chain explicitly so the traceback shows the real failure as
            # the direct cause of the IOError callers see.
            raise IOError(str(exc)) from exc

        paf_df = paf_df.drop(columns=["year_id",
                                      "rei_id",
                                      "cause_id",
                                      "measure_id",
                                      "metric_id"])  # no longer needed

        # The wide "draw_<k>" columns become an integer "draw" dimension.
        paf_da = df_to_xr(paf_df,
                          dims=["location_id", "age_group_id", "sex_id"],
                          wide_dim_name='draw',
                          wide_dim_transform=lambda x: int(x.split('_')[1]),
                          fill_value=np.nan)

        paf_da = paf_da.sortby("draw")  # draws don't always come in sorted

        paf_da = _data_cleaning_for_paf(paf_da, acause, rei, "GBD")

        LOGGER.info("Saving downloaded & cleaned {}".
                    format(cache_file_fbdpath))

        # The cache is written before resampling, so it keeps every draw
        # that was downloaded.
        save_xr(paf_da, cache_file_fbdpath, metric="percent", space="identity",
                cause_id=cause_id, rei_id=rei_id, gbd_round_id=gbd_round_id,
                year_id=gbd_round, measure_id=measure_id, metric_id=metric_id,
                upper_bound=PAF_UPPER_BOUND, lower_bound=PAF_LOWER_BOUND)

    # Shared tail for both paths: only resample when the count differs.
    if len(paf_da["draw"]) != draws:
        paf_da = resample(paf_da, draws)

    return paf_da

Example #6

0

Show file

def read_rrmax(acause, rei, rrmax, vaccine_rrmax, gbd_round_id, years, draws):
    """
    Reads in RRmax for one cause-risk (or cause-vaccine) pair.

    Args:
        acause (str): analytical cause.
        rei (str): risk, could also be vaccine intervention.
        rrmax (str): upstream rrmax version
        vaccine_rrmax (str): upstream vaccine rrmax version.
        gbd_round_id (int): gbd round id.
        years (YearRange): [past_start, forecast_start, forecast_end] years.
        draws (int): number of draws for output file.  This means input files
            will be up/down-sampled to meet this criterion.

    Returns:
        (xr.DataArray): RRmax in dataarray form, with the cause dim removed.

    Raises:
        KeyError: if the cause id for ``acause`` is not in the file's
            cause dim.
    """
    if rei in get_vaccine_reis(gbd_round_id):
        # The values stored in these data files are actually not RR, but rather
        # r = Incidence[infection | vax] / Incidence[infection | no vax],
        # interpreted as "percent reduction of diseased cases if vaccinated",
        # and should be r < 1.
        # We compute the actual RR as 1/r.
        rrmax_dir = FBDPath(gbd_round_id=gbd_round_id,
                            past_or_future="future",
                            stage="rrmax",
                            version=vaccine_rrmax)
    else:
        rrmax_dir = FBDPath(gbd_round_id=gbd_round_id,
                            past_or_future="past",
                            stage="rrmax",
                            version=rrmax) / "netcdf"
    infile_fbd_path = rrmax_dir / (rei + ".nc")

    cause_id = get_cause_id(acause)

    out = open_xr(infile_fbd_path).data

    causes_in_file = out[CAUSE_DIM].values.tolist()
    if cause_id not in causes_in_file:
        error_message = "{} ({}) not in {}'s cause dim: {}".format(
            acause, cause_id, infile_fbd_path, causes_in_file)
        LOGGER.error(error_message)
        raise KeyError(error_message)

    # Pick out the single cause, then discard the leftover cause coordinate.
    out = out.loc[{CAUSE_DIM: cause_id}]
    out = out.drop(CAUSE_DIM)
    out = conditionally_triggered_transformations(out, gbd_round_id, years)

    if rei in get_vaccine_reis(gbd_round_id):
        # NOTE if we switch raw data source to burdenator, this algo might
        # need to change.
        # As mentioned above, this value for vaccine should be < 1.
        # Any value > 1 should be capped.
        capped = out.where(out <= PAF_UPPER_BOUND)
        capped = capped.fillna(PAF_UPPER_BOUND)
        out = 1.0 / capped  # as mentioned earlier, we compute RR as 1/r.

    draws_match = len(out["draw"]) == draws
    if not draws_match:
        out = resample(out, draws)

    # NOTE some rrmax cell values could be 0, for reasons unclear.
    return out

Example #7

0

Show file

File: driver.py Project: ihmeuw/fhs_2019_population_paper

def read_datasets(asfr_version, gbd_round_id, lifetable_version, pop_version,
                  migration_version, years, srb_version, draws):
    """
    This reads files, orders their axes, and ensures that data arrays
    aren't presented as datasets. This enforces rules about how many
    files get read, how they are found, and how they are assembled into
    the incoming data. It doesn't address what the data means.

    Args:
        asfr_version (str): Version string for ASFR
        gbd_round_id (int): GBD Round as an integer
        lifetable_version (list[str]): Lifetable version(s); each entry is
            passed to ``read_lifetable``.
        pop_version (str): Population start version
        migration_version (list[str]): Migration version; only the first
            entry is used.  If an FBDPath cannot be built from it, it is
            tried as a plain filesystem path.
        years (YearRange): years for past and forecast
        srb_version (str): sex ratio at birth version
        draws (int): the number of draws to take from the future versions.

    Returns:
        xr.DataArray: ASFR
        tuple: Either one lifetable or (past, future).
        xr.DataArray: Starting population
        xr.DataArray: Migration
        xr.DataArray: SRB

    Raises:
        SystemExit: with a non-zero status if an input file cannot be
            opened, or if past/future lifetables cannot be told apart.
        AssertionError: if an input lacks the expected dimensions or years.
        Exception: if no migration file path can be constructed.
    """
    # Do this in a subroutine so its memory can be released.
    # pop etl (pop version is in the past)
    data_read_start = perf_time()
    pop_file = FBDPath("/{}/past/population/{}".format(
        gbd_round_id, pop_version)) / "population.nc"
    try:
        LOGGER.info("Reading {}".format(pop_file))
        pop = xr.open_dataarray(str(pop_file))
        # if there's a draw dimension, take the mean
        if "draw" in pop.dims:
            pop = pop.mean("draw")
    except OSError as ose:
        LOGGER.error("Cannot open pop {}: {}".format(pop_file, ose))
        # Non-zero status: a bare exit() would report success.
        raise SystemExit(1) from ose

    # we may or may not have draws for past pops, but we should certainly
    # expect location, age, sex, and year
    assert {"location_id", "year_id", "age_group_id",
            "sex_id"}.issubset(set(pop.dims))
    if len(pop.year_id) > 1:
        pop = pop.loc[{"year_id": years.past_end}]
    else:
        pop = pop.squeeze(dim="year_id")
        assert pop.year_id == years.past_end
    LOGGER.debug("pop {}".format(pop))

    # we like age_group_id to be the last dim to expedite later computation.
    # NOTE(review): pop was averaged over "draw" right after being read, so
    # this "draw" branch (and the one in the migration section below) looks
    # unreachable.  Left as is; confirm which behavior is intended.
    if "draw" in pop.dims:  # if past pop has draws, resample.
        pop = pop.transpose("draw", "location_id", "sex_id", "age_group_id")
        pop = resample(pop, draws)
    else:
        pop = pop.transpose("location_id", "sex_id", "age_group_id")

    if pop.name is None:
        pop.name = "population"

    # asfr etl (draws expected)

    # Rounds below 5 read ASFR from round 5.
    asfr_gbd_round_id = gbd_round_id if gbd_round_id >= 5 else 5
    asfr_file = FBDPath("/{}/future/asfr/{}".format(asfr_gbd_round_id,
                                                    asfr_version)) / "asfr.nc"
    try:
        LOGGER.info("Reading {}".format(asfr_file))
        # ASFR is reported per thousand people.
        asfr = xr.open_dataarray(str(asfr_file))
    except OSError as ose:
        LOGGER.error("Cannot open asfr {}: {}".format(asfr_file, ose))
        # Previously execution fell through here (the exit call was
        # commented out) and failed on the next line with a NameError
        # because ``asfr`` was never bound.
        raise SystemExit(2) from ose

    assert set(asfr.dims) == {
        "draw", "year_id", "location_id", "scenario", "age_group_id"
    }, "asfr dims {}".format(asfr.dims)
    asfr_lim = asfr.sel(year_id=slice(years.past_end, years.forecast_end + 1))
    if asfr_lim.name is None:
        asfr_lim.name = "asfr"

    asfr_lim = resample(asfr_lim, draws)

    # lifetable etl (draws expected)
    lifetables = list()
    for lfilename in lifetable_version:
        lifetables.append(read_lifetable(gbd_round_id, lfilename, draws))
    if len(lifetables) > 1:
        # Decide which of the first two lifetables is past and which is
        # future: compare their last years first, then their first years.
        lpast, lfuture = (None, None)
        lyears = [llx.year_id.values for llx in lifetables]
        if lyears[0][-1] > lyears[1][-1]:
            lfuture, lpast = lifetables
        elif lyears[1][-1] > lyears[0][-1]:
            lpast, lfuture = lifetables
        elif lyears[0][0] < lyears[1][0]:
            lpast, lfuture = lifetables
        elif lyears[1][0] < lyears[0][0]:
            lfuture, lpast = lifetables
        else:
            LOGGER.error("Cannot figure out which is the future lifetable")
            raise SystemExit(1)

        if years.past_end in lfuture.year_id.values:
            # The two sentences used to run together with no separator.
            LOGGER.info("All needed years were in the future lifetable. "
                        "Ignoring the past data.")
            lifetable_lim = lfuture.sel(
                year_id=slice(years.past_end, years.forecast_end + 1))
            lifetable_out = (lifetable_lim, )
        else:
            assert years.past_end in lpast.year_id.values
            past_slice = lpast.loc[{"year_id": [years.past_end]}]
            LOGGER.debug("Life past slice {}".format(
                past_slice.year_id.values))
            LOGGER.debug("Life future slice {}".format(lfuture.year_id.values))
            lifetable_out = (past_slice, lfuture)
    else:
        lifetable_lim = lifetables[0].sel(
            year_id=slice(years.past_end, years.forecast_end + 1))

        lifetable_out = (lifetable_lim, )

    # migration etl (no draws expected)
    try:
        migration_file = FBDPath("/{}/future/migration/{}".format(
            gbd_round_id, migration_version[0])) / "migration.nc"
    except Exception as exc:
        if os.path.exists(migration_version[0]):
            migration_file = migration_version[0]
        else:
            # ``migration_file`` is unbound on this path, so report the
            # version string that could not be turned into a path.
            raise Exception("Cannot construct {}".format(
                migration_version[0])) from exc

    try:
        LOGGER.info("Reading {}".format(migration_file))
        migration = xr.open_dataarray(str(migration_file))
    except OSError as ose:
        LOGGER.error("Cannot open migration {}: {}".format(
            migration_file, ose))
        raise SystemExit(1) from ose
    assert set(("location_id", "age_group_id", "sex_id", "year_id")).\
           issubset(migration.dims)

    # Currently we don't use or make migration scenarios -- if a scenario dim
    # exists for some reason ensure that only reference is used and that the
    # scenario dim is dropped.
    if "scenario" in migration.dims:  # scenario dim
        migration = migration.sel(scenario=0, drop=True)
    elif "scenario" in migration.coords:  # scenario point coord
        migration = migration.drop("scenario")
    else:
        pass  # no scenario dim or point coord

    # if pop has draws, we want migration to have draws as well.
    # this becomes important in _fill_missing_locations().
    if "draw" in pop.dims:
        if "draw" not in migration.dims:
            migration = expand_dimensions(migration, draw=pop["draw"])
        else:
            migration = resample(migration, draws)
        migration = migration.transpose("draw", "location_id", "year_id",
                                        "sex_id", "age_group_id")
    else:  # pop has no "draw", so migration doesn't need it either
        if "draw" in migration.dims:
            migration = migration.mean("draw")
        migration = migration.transpose("location_id", "year_id", "sex_id",
                                        "age_group_id")

    if migration.name is None:
        migration.name = "migration"
    # Use the last past year's all age population proportions to compute
    # regional migration averages to fill in missing data.
    migration_locs_fixed = _clean_migration_locations(migration,
                                                      pop.sum("age_group_id"),
                                                      gbd_round_id)

    LOGGER.info("Read data Elapsed {}".format(perf_time() - data_read_start))

    # Migration counts drive small nations to zero population.
    # This is a way to ensure we show the trend of health.
    migration_locs_fixed.loc[dict(
        location_id=list(SMALL_NATIONS_ZERO_MIGRATION.values()))] = 0.

    LOGGER.debug("Pop from read years {}".format(pop.year_id.values))

    # srb etl (no draws)
    # NOTE(review): an earlier comment here said this was "not FBDPath at
    # the moment", but the path below is built with FBDPath.
    srb_path = FBDPath("/{}/past/sex_ratio_at_birth/{}".format(
        gbd_round_id, srb_version))
    srb_file = srb_path / "sex_ratio_at_birth.nc"

    try:
        LOGGER.info("Reading {}".format(srb_file))
        srb = xr.open_dataarray(str(srb_file))
    except OSError as ose:
        LOGGER.error("Cannot open srb {}: {}".format(srb_file, ose))
        raise SystemExit(1) from ose

    # Subset to last year of past
    srb = srb.sel(year_id=years.past_end)

    return asfr_lim, lifetable_out, pop, migration_locs_fixed, srb

Example #8

0

Show file

File: forecast_education.py Project: ihmeuw/fhs_2019_population_paper

def forecast_edu_main(transform, past_version, forecast_version, pv_version,
                      weight_strategy, gbd_round_id, years, reference_scenario,
                      diff_over_mean, truncate, truncate_quantiles,
                      replace_with_mean, draws, **kwargs):
    """Choose an omega weight, forecast education from the past, and save.

    The RMSE file is read and handed to ``weight_strategy`` together with
    ``draws`` to obtain the weight ("omega").  Past education is read,
    resampled to ``draws`` and limited to ``years.past_years``.  The weight
    is validated, ``arc_forecast_education`` is run, and the forecast is
    saved as ``education.nc`` with the (mean) omega and the strategy name
    as extra attributes.

    NOTE(review): the three input/output paths were redacted in this copy
    of the code (``FBDPath("".format())``), so ``past_version``,
    ``forecast_version``, ``pv_version`` and ``kwargs`` are not referenced
    in this body.

    Raises:
        RuntimeError: if the weight is neither a float, an int, nor an
            xarray.DataArray whose only dim is ``draw`` with the same
            ``draw`` coordinates as the past data.
    """
    strategy_name = weight_strategy.__name__
    LOGGER.debug("weight strategy: {}".format(strategy_name))

    pv_path = FBDPath("".format())  # Path removed for security reasons
    rmse = open_xr(pv_path / "education_arc_weight_rmse.nc").data
    weight_exp = weight_strategy(rmse, draws)
    LOGGER.info("omega selected: {}".format(weight_exp))

    LOGGER.debug("Reading in the past")
    past_path = FBDPath("".format())  # Path removed for security reasons
    past = open_xr(past_path / "education.nc").data
    past = resample(past, draws)
    past = past.sel(year_id=years.past_years)

    if isinstance(weight_exp, (float, int)):
        # A scalar weight needs no extra dimension.
        extra_dim = None
    else:
        # Otherwise it must be a 1-d DataArray over the same draws as past.
        omega_problem = None
        if not isinstance(weight_exp, xr.DataArray):
            omega_problem = (
                "`omega` must be either a float, an int, or an "
                "xarray.DataArray")
        elif len(weight_exp.dims) != 1 or "draw" not in weight_exp.dims:
            omega_problem = (
                "If `omega` is a xarray.DataArray, then it must have only "
                "1 dim, `draw`")
        elif not weight_exp["draw"].equals(past["draw"]):
            omega_problem = (
                "If `omega` is a xarray.DataArray, then it's `draw` dim "
                "must have the coordinates as `past`")

        if omega_problem is not None:
            LOGGER.error(omega_problem)
            raise RuntimeError(omega_problem)
        extra_dim = "draw"

    forecast = arc_forecast_education(
        past, gbd_round_id, transform, weight_exp, years,
        reference_scenario, diff_over_mean, truncate, truncate_quantiles,
        replace_with_mean, extra_dim=extra_dim)

    forecast_path = FBDPath("".format())
    # A per-draw weight is reported as its mean.
    report_omega = (float(weight_exp.mean())
                    if isinstance(weight_exp, xr.DataArray)
                    else weight_exp)
    save_xr(forecast,
            forecast_path / "education.nc",
            metric="number",
            space="identity",
            omega=report_omega,
            omega_strategy=strategy_name)
    LOGGER.info("education forecasts have saved")