Python df_to_xr Examples, fbd_core.etl.df_to_xr Python Examples

Example #1

0

Show file

def main(ecode, year_id, sex_id, decomp, version):
    """Compute and write adjusted short-term incidence for one ecode/year/sex.

    The measure draws returned by ``get_measures`` are converted to xarray,
    the per-age outpatient coefficients for ``ecode`` are read from
    ``paths.OUTPATIENT_COEFFICIENTS`` and everything is passed to
    ``calculate_measures.short_term_incidence_unsplit``.  The outcome is
    handed to ``write_results``.
    """
    started = time.time()

    measure_draws = get_measures(ecode, year_id, sex_id, decomp, version)

    def _to_xr(measure):
        # Every measure is converted with the same wide-dim settings.
        return etl.df_to_xr(measure_draws[measure],
                            wide_dim_name='draw',
                            fill_value=np.nan)

    x_inc = _to_xr('incidence')
    x_rem = _to_xr('remission')
    x_emr = _to_xr('emr')

    coeff_table = pd.read_csv(paths.OUTPATIENT_COEFFICIENTS / f"{ecode}.csv")
    # Map each age_group_id to its outpatient coefficient.
    age_to_out_coeff = dict(
        zip(coeff_table.age_group_id, coeff_table.out_coeff))

    sys.stdout.flush()
    adjusted_inc = calculate_measures.short_term_incidence_unsplit(
        x_inc, x_rem, x_emr, age_to_out_coeff)

    write_results(adjusted_inc, ecode, decomp, version, year_id, sex_id)
    # Elapsed wall-clock seconds; not used further in the visible code.
    total = time.time() - started

Example #2

0

Show file

File: age_sex_split.py Project: ihmeuw/fhs_2019_population_paper

def create_age_sex_xarray():
    """Build, save and return the combined age-sex migration pattern.

    Two pattern csvs (``QATAR_PATTERN`` and ``EUROSTAT_PATTERN``) are loaded
    and converted to xarray.  Every location in ``WPP_LOCATION_IDS`` gets the
    Qatar pattern if it is listed in ``QATAR_LOCS`` and the Eurostat pattern
    otherwise.  The per-location arrays are concatenated along
    ``location_id`` and written with ``save_xr``.

    Returns:
        The array produced by ``xr.concat`` over all locations.
    """
    LOGGER.debug("Creating xarray of age-sex patterns for migration")
    # load patterns
    qatar = pd.read_csv(QATAR_PATTERN)
    eurostat = pd.read_csv(EUROSTAT_PATTERN)
    # convert to xarrays
    qatar = df_to_xr(qatar, dims=PATTERN_ID_VARS)
    eurostat = df_to_xr(eurostat, dims=PATTERN_ID_VARS)
    # One array per location, each tagged with its own location_id
    all_locs_xr_list = [
        expand_dimensions(qatar if loc in QATAR_LOCS else eurostat,
                          location_id=[loc])
        for loc in WPP_LOCATION_IDS
    ]
    # Concat all locations together
    result = xr.concat(all_locs_xr_list, dim='location_id')
    # Save all locs pattern
    LOGGER.debug("Saving age-sex pattern xarray")
    # NOTE(review): gbd_round_id is not defined in this function; presumably
    # a module-level name -- confirm.
    pattern_dir = FBDPath(f'/{gbd_round_id}/future/migration/'
                          f'{PATTERN_VERSION}')
    pattern_path = pattern_dir / "combined_age_sex_pattern.nc"
    # Save the concatenated array; the previous code passed an undefined
    # name (`pattern`) here, which raised NameError before anything was saved.
    save_xr(result, pattern_path, metric="percent", space="identity")
    LOGGER.debug("Saved age-sex pattern xarray")
    return result

Example #3

0

Show file

def smr(ncode):
    """Return 1000 draws of the SMR rows for ``ncode`` as an xarray.

    Rows are selected from the input csv by ncode ("N48" reuses the "N9"
    rows), a standard error is derived from the UL/LL columns, normal draws
    are generated with a fixed seed, and any value below 1 is raised to 1.
    """
    table = pd.read_csv(
        os.path.join(paths.INPUT_DIR, 'FILEPATH.csv')).drop('name', axis=1)

    # N48 borrows the rows recorded for N9.
    source_code = "N9" if ncode == "N48" else ncode
    table = table.loc[table["ncode"] == source_code]
    table["se"] = (table["UL"] - table["LL"]) / 3.92

    table["ncode"] = ncode

    # The index must be 0..n-1 so the draw frame built below lines up
    # row-for-row on assignment.
    table = table.reset_index(drop=True)
    np.random.seed(659177)
    samples = np.random.normal(table['SMR'], table['se'],
                               size=(1000, len(table)))
    table[help.drawcols()] = pd.DataFrame(samples.T)
    table = table.drop(['SMR', 'UL', 'LL', 'se'], axis=1)

    table = help.convert_to_age_group_id(table, collapsed_0=False)
    table = table.set_index(['ncode', 'age_group_id'])
    # Floor every draw at 1.
    table[table < 1] = 1
    return etl.df_to_xr(table, wide_dim_name='draw', fill_value=np.nan)

Example #4

0

Show file

def save_mortality(ecode, year_id, sex_id, locs, ages, version):
    """Pull CODEm draws for ``ecode``, divide them by population and save.

    The draws returned by ``gd.get_draws`` have every draw column divided by
    the ``pop`` column, are indexed by location/year/sex/age, converted to
    xarray and written to a netCDF file.

    Args:
        ecode: cause code passed to ``help.get_cause``.
        year_id, sex_id, locs, ages: demographic filters forwarded to
            ``gd.get_draws``.
        version: not used in the visible body.
    """
    cause_id = help.get_cause(ecode)
    draws = gd.get_draws(
        gbd_id_type="cause_id",
        gbd_id=cause_id,
        location_id=locs,
        year_id=year_id,
        sex_id=sex_id,
        age_group_id=ages,
        status="best",
        source="codem",
        gbd_round_id=help.GBD_ROUND
    )

    draw_cols = help.drawcols()
    draws[draw_cols] = draws[draw_cols].divide(draws['pop'], axis=0)
    draws.drop(['pop', 'envelope', 'cause_id', 'sex_name', 'measure_id', 'metric_id'], axis=1, inplace=True)
    draws.set_index(['location_id','year_id','sex_id','age_group_id'], inplace=True)
    mort = etl.df_to_xr(draws, wide_dim_name='draw', fill_value=np.nan)

    filename = "FILEPATH.nc".format(str(year_id), str(sex_id))
    folder = os.path.join("FILEPATH")
    # exist_ok=True tolerates a concurrent job creating the folder first.
    # The previous handler compared against os.errno.EEXIST, but os.errno
    # no longer exists on Python >= 3.7, so the handler itself crashed.
    os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)
    print("Writing mortality")
    mort.to_netcdf(filepath)

Example #5

0

Show file

def en_matrix(ecode, sex_id, platform):
    """Load the e/n matrix csv for ``ecode``/``platform`` as an xarray.

    The csv is filtered to ``sex_id``, converted to age_group_ids via the
    ``help`` utilities, renamed/indexed and converted with missing cells
    filled with 0.
    """
    short_plat = "inp" if platform == "inpatient" else "otp"
    # Children of inj_poisoning all read the parent's file.
    if inj_info.ECODE_PARENT[ecode] == 'inj_poisoning':
        file_stem = 'inj_poisoning'
    else:
        file_stem = ecode

    en_mat_dir = "FILEPATH"
    csv_path = os.path.join(en_mat_dir, short_plat, file_stem + ".csv")
    matrix = pd.read_csv(csv_path)

    matrix = matrix.loc[matrix['sex'] == sex_id]
    matrix = help.convert_to_age_group_id(matrix, collapsed_0=True)
    matrix = help.expand_under_1(matrix)

    matrix = matrix.drop(['ecode'], axis=1)
    new_names = {'n_code': 'ncode', 'inpatient': 'platform', 'sex': 'sex_id'}
    matrix = matrix.rename(columns=new_names)
    # Replace the 1/0 flag with the platform name.
    matrix['platform'] = matrix['platform'].replace(
        {1: 'inpatient', 0: 'outpatient'})

    index_cols = ['ncode', 'platform', 'high_income', 'sex_id', 'age_group_id']
    matrix = matrix.set_index(index_cols)
    return etl.df_to_xr(matrix, wide_dim_name='draw', fill_value=0)

Example #6

0

Show file

File: balance_migration.py Project: ihmeuw/fhs_2019_population_paper

def combine_and_save_mig(version):
    """
    Read one migration csv per location and merge them into a single xarray.

    Args:
        version (str):
            The version of migration to combine and save

    Returns:
        xarray.DataArray: The combined migration data xarray dataarray.
    """
    LOGGER.debug("Combining migration csvs to xarray")
    # One converted array per entry of WPP_LOCATION_IDS
    per_location = [
        df_to_xr(pd.read_csv(f'filepath'), dims=ID_VARS)
        for _ in WPP_LOCATION_IDS
    ]
    # Stack the per-location arrays along location_id
    result = xr.concat(per_location, dim='location_id')

    # Save to forecasting directory
    result.to_netcdf(f'filepath')
    return result

Example #7

0

Show file

def save_mortality(ecode, year_id, sex_id, locs, ages, decomp, version):
    """Pull CODEm draws for ``ecode``, divide them by population and save.

    The draws returned by ``gd.get_draws`` have every draw column divided by
    the ``pop`` column, are indexed by location/year/sex/age, converted to
    xarray and written to
    ``<DATA_DIR>/<decomp>/<parent ecode>/<version>/mortality_for_shocks/
    mort_<year_id>_<sex_id>.nc``.

    Args:
        ecode: cause code passed to ``help.get_cause``; also used to look up
            its parent in ``inj_info.ECODE_PARENT``.
        year_id, sex_id, locs, ages: demographic filters forwarded to
            ``gd.get_draws``.
        decomp: decomp step, forwarded to ``gd.get_draws`` and used as a
            path component.
        version (str): version label; trailing whitespace is stripped before
            it is used as a path component.
    """
    cause_id = help.get_cause(ecode)
    draws = gd.get_draws(
        gbd_id_type="cause_id",
        gbd_id=cause_id,
        location_id=locs,
        year_id=year_id,
        sex_id=sex_id,
        age_group_id=ages,
        status="best",
        source="codem",
        gbd_round_id=help.GBD_ROUND,
        decomp_step=decomp
    )

    draw_cols = help.drawcols()
    draws[draw_cols] = draws[draw_cols].divide(draws['pop'], axis=0)
    draws.drop(['pop', 'envelope', 'cause_id', 'sex_name', 'measure_id', 'metric_id'], axis=1, inplace=True)
    draws.set_index(['location_id','year_id','sex_id','age_group_id'], inplace=True)
    mort = etl.df_to_xr(draws, wide_dim_name='draw', fill_value=np.nan)

    filename = "mort_{}_{}.nc".format(str(year_id), str(sex_id))
    version = version.rstrip()
    folder = os.path.join(paths.DATA_DIR, decomp, inj_info.ECODE_PARENT[ecode], version, 'mortality_for_shocks')
    # exist_ok=True tolerates a concurrent job creating the folder first.
    # The previous handler compared against os.errno.EEXIST, but os.errno
    # no longer exists on Python >= 3.7, so the handler itself crashed.
    os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)
    mort.to_netcdf(filepath)

Example #8

0

Show file

def main(ecode, year_id, sex_id, version):
    """Compute and write adjusted short-term incidence, then print the runtime.

    Measure draws from ``get_measures`` are converted to xarray and combined
    with the outpatient covariate from ``outpatient_cov`` by
    ``calculate_measures.short_term_incidence_unsplit``.  The result goes to
    ``write_results``.
    """
    started = time.time()
    me_id = help.get_me(ecode)

    measure_draws = get_measures(ecode, me_id, year_id, sex_id, version)

    def _to_xr(measure):
        # Every measure is converted with the same wide-dim settings.
        return etl.df_to_xr(measure_draws[measure],
                            wide_dim_name='draw',
                            fill_value=np.nan)

    x_inc = _to_xr('incidence')
    x_rem = _to_xr('remission')
    x_emr = _to_xr('emr')

    otp_cov = outpatient_cov(me_id, help.drawcols())

    adjusted_inc = calculate_measures.short_term_incidence_unsplit(
        x_inc, x_rem, x_emr, otp_cov)

    write_results(adjusted_inc, ecode, version, year_id, sex_id)
    elapsed = time.time() - started
    print("Total time was {} seconds".format(elapsed))

Example #9

0

Show file

def output_to_xarray(gbd_round, out, version_out):
    """Convert ``out`` to an xarray and save it as ``asfr.nc``.

    The file is written under ``/<gbd_round>/future/asfr/<version_out>``.
    """
    asfr_dir = FBDPath("/{gri}/future/asfr/{version}".format(
        gri=gbd_round, version=version_out))
    out_xr = df_to_xr(
        out,
        dims=['location_id', 'year_id', 'scenario',
              'age_group_id', 'sex_id', 'draw'])
    # NOTE(review): `version` is passed as the literal string "version",
    # not `version_out` -- confirm this is intended.
    save_xr(
        out_xr,
        fbdpath=asfr_dir / "asfr.nc",
        metric="rate",
        space="identity",
        version="version",
        model="asfr_adjusted_to_tfr_plus_point1_if_below2",
    )

Example #10

0

Show file

def get_shock_mort(ecode, pops, locs, ages, year_id, sex_id, decomp):
    """Return CODEm death draws for ``ecode`` divided by population.

    For ``inj_war_execution`` the rows for age groups 2 and 3 are discarded
    and replaced with all-zero copies of the age group 4 rows.

    Args:
        ecode: cause code passed to ``help.get_cause`` and used to look up
            the model version in ``model_versions``.
        pops: object indexed with ``['population']``; the draws are divided
            by that value.
        locs, ages, year_id: filters forwarded to ``gd.get_draws``.
        sex_id: selects the model version via ``model_versions[ecode]``.
        decomp: not used in the visible body; the decomp step is read from
            ``model_versions['decomp']`` instead.

    Returns:
        The xarray built from the draws, divided by ``pops['population']``.
    """
    # Local import: pd.concat replaces DataFrame.append, which was removed
    # in pandas 2.0.
    import pandas as pd

    cause_id = help.get_cause(ecode)

    draws = gd.get_draws(gbd_id_type="cause_id",
                         gbd_id=cause_id,
                         version_id=model_versions[ecode][sex_id],
                         location_id=locs,
                         year_id=year_id,
                         age_group_id=ages,
                         measure_id=1,
                         source="codem",
                         gbd_round_id=help.GBD_ROUND,
                         decomp_step=model_versions['decomp'])

    if ecode == 'inj_war_execution':
        draws = draws.loc[(draws.age_group_id != 2) &
                          (draws.age_group_id != 3), ]

        draw_cols = [col for col in draws.columns if 'draw_' in col]
        template = draws[draws['age_group_id'] == 4]

        # Build zero-valued rows for age groups 2 and 3 from the age group 4
        # rows.  Working on explicit copies avoids mutating a slice of
        # `draws` in place.
        zero_rows = []
        for young_age in (2, 3):
            zeros = template.copy()
            zeros[draw_cols] = 0.0
            zeros['age_group_id'] = young_age
            zero_rows.append(zeros)

        draws = pd.concat([draws] + zero_rows)

    draws = draws.drop(['cause_id', 'measure_id', 'metric_id', 'sex_name'],
                       axis=1)
    draws = draws.set_index(
        ['location_id', 'year_id', 'sex_id', 'age_group_id'])

    mort = etl.df_to_xr(draws, wide_dim_name='draw', fill_value=np.nan)
    mort = mort / pops['population']  # gets it into rate space
    return mort

Example #11

0

Show file

def write_results(df, ecode, ncode, platform, year, version):
    """Write the results that are appended from the DisMod ODE to netCDF.

    ``df`` is converted to age_group_ids, indexed by
    location/year/sex/age/platform, converted to xarray and written to the
    output directory (created if missing).

    Args:
        df: results frame; passed through ``help.convert_to_age_group_id``.
        ecode, ncode, platform, version: not used in the visible body.
        year: passed to ``str.format`` when building the file name.
    """
    out_dir = os.path.join("FILEPATH")
    # exist_ok=True tolerates a concurrent job creating the folder first.
    # The previous handler compared against os.errno.EEXIST, but os.errno
    # no longer exists on Python >= 3.7, so the handler itself crashed.
    os.makedirs(out_dir, exist_ok=True)

    filename = "FILEPATH.nc".format(year)
    filepath = os.path.join(out_dir, filename)
    df = help.convert_to_age_group_id(df)

    df.set_index(['location_id', 'year_id', 'sex_id', 'age_group_id', 'platform'], inplace = True)

    arr = etl.df_to_xr(df, wide_dim_name='draw', fill_value=np.nan)
    arr.to_netcdf(filepath)

Example #12

0

Show file

def outpatient_cov(me_id, drawcols):
    """Return the exponentiated outpatient covariate, labelled by draw.

    The ``beta_incidence_x_s_outpatient`` column is read from the best model
    version's file, exponentiated, cut to its last 1000 entries and labelled
    with ``drawcols`` before conversion with ``etl.df_to_xr``.
    """
    # get the best model version from DisMod panda cascade
    best_versions = db.get_best_model_versions(
        entity='modelable_entity',
        ids=me_id,
        status='best',
        gbd_round_id=help.GBD_ROUND)
    model_version = best_versions['model_version_id'].iloc[0].astype(str)

    filepath = os.path.join('FILEPATH', model_version, 'FILEPATH.csv.gz')
    betas = pd.read_csv(filepath)['beta_incidence_x_s_outpatient']

    # Keep only the trailing 1000 values, one per draw label.
    covars = pd.DataFrame(np.exp(betas)[-1000:])
    covars['draw'] = drawcols
    covars = covars.set_index('draw')
    return etl.df_to_xr(covars)

Example #13

0

Show file

def read_to_xr(location_id, ccf_asfr_fbd_path, dims):
    """
    Load ``<location_id>.csv`` from the given folder and return it as an
    xr.DataArray restricted to that location.

    Args:
        location_id (int): location id.
        ccf_asfr_fbd_path (FBDPath): folder path within ccf stage.  Contains an
            "asfr_single_year" or "asfr" sub-folder, where location-specific
            csv files are stored.
        dims (list[str]): list of dims expected from upstream asfr csv files.

    Returns:
        (xr.DataArray): location-specific asfr, not yet intercept-shifted.
    """
    csv_path = ccf_asfr_fbd_path / f"{location_id}.csv"
    frame = pd.read_csv(csv_path)
    converted = df_to_xr(frame, dims=dims)
    # Select the single requested location out of the location_id dim.
    return converted.sel(location_id=location_id)

Example #14

0

Show file

def en_matrix(ecode, sex_id, platform):
    """Returns e/n matrix, used to split an ecode into the 47 ncodes.

    Matrices vary by ecode, platform, sex, age and high income/not high income (the super region) countries.
    """
    # Short platform label and file stem, derived exactly as before; the
    # path below does not use either of them.
    short_plat = "inp" if platform == "inpatient" else "otp"
    is_poisoning_child = inj_info.ECODE_PARENT[ecode] == 'inj_poisoning'
    e = 'inj_poisoning' if is_poisoning_child else ecode

    en_mat_dir = "FILEPATH"
    matrix = pd.read_csv(os.path.join(en_mat_dir, "FILEPATH.csv"))

    # Keep only the requested sex, then move from ages to age_group_ids
    # (values are copied for the under-1 age groups).
    sex_mask = matrix['sex'] == sex_id
    matrix = help.expand_under_1(
        help.convert_to_age_group_id(matrix.loc[sex_mask], collapsed_0=True))

    # Shape the matrix for the incidence merge.
    matrix = matrix.drop(['ecode'], axis=1).rename(columns={
        'n_code': 'ncode',
        'inpatient': 'platform',
        'sex': 'sex_id',
    })
    platform_names = {1: 'inpatient', 0: 'outpatient'}
    matrix['platform'] = matrix['platform'].replace(platform_names)

    matrix = matrix.set_index(
        ['ncode', 'platform', 'high_income', 'sex_id', 'age_group_id'])
    return etl.df_to_xr(matrix, wide_dim_name='draw', fill_value=0)

Example #15

0

Show file

File: step04n_lt_run.py Project: cheth-rowe/ihmexp

def write_results(df, ecode, ncode, platform, year, decomp, version):
    """Write ODE results for one ecode/ncode/platform/year to netCDF.

    The output directory
    ``<DATA_DIR>/<decomp>/<parent ecode>/<version>/ode/<ecode>/<ncode>/
    <platform>`` is created if missing.  ``df`` is converted to
    age_group_ids, indexed by location/year/sex/age/platform, converted to
    xarray and written to the path returned by ``write_path``.

    Args:
        df: results frame; passed through ``help.convert_to_age_group_id``.
        ecode, ncode, platform: path components, also passed to
            ``write_path``.
        year: passed to ``write_path``.
        decomp: decomp step, used as a path component.
        version (str): version label; trailing whitespace is stripped.
    """
    version = version.rstrip()
    out_dir = os.path.join(paths.DATA_DIR, decomp,
                           inj_info.ECODE_PARENT[ecode], str(version), "ode",
                           str(ecode), str(ncode), platform)
    # exist_ok=True tolerates a concurrent job creating the folder first.
    # The previous handler compared against os.errno.EEXIST, but os.errno
    # no longer exists on Python >= 3.7, so the handler itself crashed.
    os.makedirs(out_dir, exist_ok=True)

    df = help.convert_to_age_group_id(df)

    df.set_index(
        ['location_id', 'year_id', 'sex_id', 'age_group_id', 'platform'],
        inplace=True)

    arr = etl.df_to_xr(df, wide_dim_name='draw', fill_value=np.nan)

    filepath = write_path(ecode, ncode, platform, year, decomp, version)
    arr.to_netcdf(filepath)

Example #16

0

Show file

import numpy as np
import os

# Load the treated and untreated disability weights
dw_folder = os.path.join("FILEPATH")
untreat_dw = pd.read_csv(os.path.join(dw_folder, 'FILEPATH.csv'))
treated_dw = pd.read_csv(os.path.join(dw_folder, 'FILEPATH.csv'))

# Rename the draw columns from 'draw<N>' to 'draw_<N>' (N = 0..999) and
# 'n_code' to 'ncode', then index both tables by ncode.
coldict = {'draw' + str(n): 'draw_' + str(n) for n in range(1000)}
coldict['n_code'] = 'ncode'
untreat_dw.rename(columns=coldict, inplace=True)
treated_dw.rename(columns=coldict, inplace=True)
untreat_dw.set_index('ncode', inplace=True)
treated_dw.set_index('ncode', inplace=True)

# Convert both tables to xarray with the wide columns named 'draw'
u_dw = etl.df_to_xr(untreat_dw, wide_dim_name='draw', fill_value=np.nan)
t_dw = etl.df_to_xr(treated_dw, wide_dim_name='draw', fill_value=np.nan)

dems = db.get_demographics(gbd_team='epi', gbd_round_id=help.GBD_ROUND)

# Get the percent treated in each country-year, and combine the treated and
# untreated dws, weighted by that percentage, to get the total dw
p_t = calculate_measures.pct_treated()
dw = t_dw * p_t + u_dw * (1 - p_t)

# Load in split proportions for spinal cord injuries and find weighted average disability weight among the 4 splits
# n_parent maps each ncode in the index to its first three characters
n_parent = pd.Series(index=treated_dw.index,
                     data=[n[0:3] for n in treated_dw.index],
                     name='ncode_parent')
spinal_split_folder = 'FILEPATH'
# Maps 'prop_<draw column>' names back to the plain draw column names
drawdict = {'prop_' + d: d for d in help.drawcols()}
split_props_list = []

Example #17

0

Show file

def disability_weights_st():
    """Read the disability-weight csv, index it by ncode and return an xarray."""
    folder = 'FILEPATH'
    csv_path = os.path.join(folder, 'FILEPATH.csv')
    weights = pd.read_csv(csv_path).set_index('ncode')
    return etl.df_to_xr(weights, wide_dim_name='draw', fill_value=np.nan)

Example #18

0

Show file

def get_gbd_paf(acause, rei, cache_version, gbd_round_id, sex_ids,
                location_ids, draws, measure_id=4, metric_id=2):
    """
    Return the gbd cause-risk-specific PAF, reading the cached file if it
    exists and otherwise downloading, cleaning and caching it in a FBDPath.

    The gbd paf coming from get_draws::
        >>> df.columns
        Index([u'rei_id', u'modelable_entity_id', u'location_id', u'year_id',
               u'age_group_id', u'sex_id', u'cause_id', u'measure_id',
               u'draw_0', u'draw_1', ... u'draw_991', u'draw_992', u'draw_993',
               u'draw_994', u'draw_995', u'draw_996', u'draw_997', u'draw_998',
               u'draw_999', u'metric_id'], dtype='object', length=1009)

    where we will need to
    1.) use cause_id to slice for the cause-risk pair
    2.) use measure_id (typically 4 for yll) to slice for measure_id
    3.) use metric_id (typically 2 for percent) to slice for metric_id

    Args:
        acause (str): analytical cause.
        rei (str): risk, could also be vaccine intervention.
        cache_version (str): the FBDPath paf version to save the gbd paf in,
            or to read from.
        gbd_round_id (int): gbd round id
        sex_ids (list): sexes.  Typically [1, 2].
        location_ids (list): locations to get pafs from.
        draws (int): number of draws for output file.  This means input files
            will be up/down-sampled to meet this criterion.
        measure_id (int, optional): typically the yll measure id (4).  At the
            most detailed PAF yll is equivalent to death, so measure_id 4 works
            the same as measure_id 1 (death).  Empirically, it seems to pull
            data faster if calling with measure_id=4.
        metric_id (int, optional): typically the percent metric (2)

    Returns:
        (xr.DataArray/None): Dataarray with complete demographic indices,
            sans "scenario".  None when ``rei`` is a vaccine rei.
    """
    if rei in get_vaccine_reis(gbd_round_id):
        # get_draws won't have anything for vaccines
        return None

    cache_dir = FBDPath(gbd_round_id=gbd_round_id,
                        past_or_future="past",
                        stage="paf",
                        version=cache_version)
    cache_file_fbdpath = cache_dir / (acause + "_" + rei + ".nc")

    if cache_file_fbdpath.exists():
        LOGGER.info("{} already exists.  Will read from it for gbd paf.".
                    format(cache_file_fbdpath))

        # NOTE(review): only this cached branch subsets by location_ids; the
        # download branch below returns every location it pulled -- confirm
        # that asymmetry is intended.
        paf_da = open_xr(cache_file_fbdpath).data.sel(
            location_id=location_ids)
    else:
        # no cache exists, must download & clean
        rei_id = get_rei_id(rei)

        # edge case for diarrhea_*: map to the cause name the GBD knows
        gbd_acause = CAUSES_NOT_IN_GBD_MAP[acause] \
            if acause in CAUSES_NOT_IN_GBD_MAP else acause
        cause_id = get_cause_id(gbd_acause)

        gbd_round = get_gbd_round(gbd_round_id)

        try:
            # we only need it for year_id=gbd_round, but for every other dim
            # we collect everything.
            paf_df = get_draws(gbd_id_type=['cause_id', 'rei_id'],
                               gbd_id=[cause_id, rei_id],
                               source='burdenator',
                               year_id=gbd_round,
                               gbd_round_id=gbd_round_id,
                               measure_id=measure_id,
                               metric_id=metric_id)
        except Exception as exc:
            LOGGER.error("Error in get_draws for {}_{}".format(acause, rei))
            raise IOError(str(exc))

        # These columns are constant for this pull and not needed as dims
        unneeded = ["year_id", "rei_id", "cause_id", "measure_id",
                    "metric_id"]
        paf_df = paf_df.drop(columns=unneeded)

        paf_da = df_to_xr(paf_df,
                          dims=["location_id", "age_group_id", "sex_id"],
                          wide_dim_name='draw',
                          wide_dim_transform=lambda x: int(x.split('_')[1]),
                          fill_value=np.nan)

        # draws don't always come in sorted
        paf_da = paf_da.sortby("draw")

        paf_da = _data_cleaning_for_paf(paf_da, acause, rei, "GBD")

        LOGGER.info("Saving downloaded & cleaned {}".
                    format(cache_file_fbdpath))

        save_xr(paf_da, cache_file_fbdpath, metric="percent", space="identity",
                cause_id=cause_id, rei_id=rei_id, gbd_round_id=gbd_round_id,
                year_id=gbd_round, measure_id=measure_id, metric_id=metric_id,
                upper_bound=PAF_UPPER_BOUND, lower_bound=PAF_LOWER_BOUND)

    # Both branches finish the same way: up/down-sample to the requested
    # number of draws if needed.
    if len(paf_da["draw"]) != draws:
        paf_da = resample(paf_da, draws)

    return paf_da