Example #1
0
def save_birth_count_estimates(gbd_round_id: int, decomp_step: str,
                               cov_estimate_filepath: pathlib.PosixPath,
                               location_set_id: int,
                               most_detailed_locs: Set[int]) -> None:
    """
    Pull the live-births covariate, collapse it over sex and write it to CSV.

    Args:
        gbd_round_id: GBD round forwarded to get_covariate_estimates.
        decomp_step: decomp step forwarded to get_covariate_estimates.
        cov_estimate_filepath: destination of the CSV that is written.
        location_set_id: location set forwarded to get_covariate_estimates.
        most_detailed_locs: location IDs handed to
            _filter_to_most_detailed_locs before saving.
    """
    id_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    value_col = mmr_constants.Columns.LIVE_BIRTH_VALUE_COL

    births = get_covariate_estimates(
        mmr_constants.LIVE_BIRTHS_COVARIATE_ID,
        decomp_step=decomp_step,
        gbd_round_id=gbd_round_id,
        year_id=mmr_constants.OUTPUT_YEARS,
        age_group_id=mmr_constants.ALL_MOST_DETAILED_AGE_GROUP_IDS,
        location_set_id=location_set_id)

    # The covariate is tagged to the sex of the baby, but the MMR numerator
    # is females only: relabel every row as female and sum, which aggregates
    # the births over sex.
    births.loc[:, 'sex_id'] = 2
    subset = births.loc[:, id_cols + [value_col]]
    totals = subset.groupby(id_cols).sum().reset_index()

    totals = _filter_to_most_detailed_locs(totals, most_detailed_locs)
    totals.to_csv(cov_estimate_filepath, index=False)
Example #2
0
 def _get_covariates(self):
     """Return covariate 881 as a frame of ``location_id`` and ``sdi``.

     The covariate's ``mean_value`` column is exposed under the name ``sdi``
     and the result carries a fresh 0..n-1 index.
     """
     estimates = get_covariate_estimates(covariate_id=881,
                                         gbd_round_id=self._gbd_round_id,
                                         location_id=self.location_ids,
                                         year_id=self.year_id)
     renamed = estimates.rename(columns={'mean_value': 'sdi'})
     return renamed[['location_id', 'sdi']].reset_index(drop=True)
Example #3
0
def get_live_births_summaries(location_ids, year_ids):
    """
    Build per-sex live-birth counts with each sex's share of all births.

    Returns the covariate rows (location_id, year_id, sex_id, mean_value)
    with three additions: ``mean_value_both`` (births summed over every
    returned sex for that location/year), ``birth_prop`` (mean_value divided
    by that total) and the columns of ``get_ids('sex')`` merged on sex_id.
    """
    # best model_version_id at time of upload - 24083
    live_births_covariate = 1106  # live births by sex covariate
    key_cols = ['location_id', 'year_id']

    births = get_covariate_estimates(covariate_id=live_births_covariate,
                                     gbd_round_id=5,
                                     location_id=location_ids,
                                     year_id=year_ids,
                                     sex_id=[3, 2, 1])
    births = births[key_cols + ['sex_id', 'mean_value']]

    # The covariate currently comes back by most-detailed sex only, while the
    # adjustment needs both sexes combined. Everything required is in the
    # returned frame: relabel all rows as sex 3 and sum them.
    totals = births.copy()
    totals.loc[:, 'sex_id'] = 3
    totals = totals.groupby(key_cols + ['sex_id']).sum().reset_index()

    births = pd.merge(births,
                      totals[key_cols + ['mean_value']],
                      how='left',
                      on=key_cols,
                      suffixes=['', '_both'],
                      indicator=True)
    # every row must have found its location/year total
    assert (births._merge == 'both').all()
    births = births.drop('_merge', axis=1)
    births['birth_prop'] = births['mean_value'] / births['mean_value_both']

    # The eurocat spreadsheet has no sex_ids, so attach the sex metadata.
    sex_meta = get_ids('sex')
    return pd.merge(births, sex_meta, on='sex_id')
def add_covariate(df, covariate_id, covariate_column_name, by_sex=True):
    """
    Left-merge one covariate onto ``df`` as column ``covariate_column_name``.

    Args:
        df: frame keyed by ``country`` (must not contain ``location_id``).
        covariate_id: negative IDs denote flat-file covariates read through
            get_flat_covariate_estimates, which also supplies the merge
            columns; all other IDs are pulled from db_queries.
        covariate_column_name: name given to the covariate's ``mean_value``.
        by_sex: for database covariates, whether ``sex_id`` is a merge key.

    Returns:
        ``df`` with the covariate column attached.
    """
    assert 'location_id' not in df.columns, "Unexpected df structure: has location_id"
    assert 'country' in df.columns, "Unexpected df structure: lacks country"

    # Covariates handed over as flat files (.csv) are manually given negative
    # covariate IDs and are handled as their own case.
    if covariate_id < 0:
        cov_df, merge_cols = get_flat_covariate_estimates(covariate_id)
    else:
        merge_cols = ['country', 'year_id']
        if by_sex:
            merge_cols.append('sex_id')
        cov_df = db_queries.get_covariate_estimates(covariate_id, decomp_step="step1")

    new_names = {'location_id': 'country', 'mean_value': covariate_column_name}
    cov_df = cov_df.rename(columns=new_names)[merge_cols + [covariate_column_name]]

    report_duplicates(cov_df, merge_cols)
    merged = df.merge(cov_df, on=merge_cols, how='left')

    # As of 2/7/19 the flat files span 1990 - 2017, so the missing 80s would
    # trip the check; that gap is dealt with in the regression setup.
    if covariate_id > 0:
        report_if_merge_fail(merged, covariate_column_name, merge_cols)
    return merged
Example #5
0
 def pull_asfr(self):
     """Pull covariate 13 (ASFR) restricted to this object's demographics.

     Rows are kept when their age_group_id, year_id and sex_id are all found
     in ``self.age_group_ids``, ``self.year_id`` and ``self.sex_id``.

     Returns:
         DataFrame with ``self.index_cols`` plus ``mean_value``.
     """
     logger.info("Pulling ASFR...")
     asfr_df = get_covariate_estimates(covariate_id=13)
     keep = (asfr_df.age_group_id.isin(self.age_group_ids)
             & asfr_df.year_id.isin(self.year_id)
             & asfr_df.sex_id.isin(self.sex_id))
     # DataFrame.ix was removed in pandas 1.0; .loc selects the same rows
     # for a boolean mask.
     asfr_df = asfr_df.loc[keep]
     return asfr_df[self.index_cols + ['mean_value']]
Example #6
0
def pull_reshape_tfr(gbd_round_id, tfr_version, location_ids):
    """Pull past and forecast TFR and reshape them for table production.

    The last past year (``YEARS.past_end``) of covariate 149 is pulled and
    converted to an xarray DataArray, concatenated with the forecast TFR
    along ``year_id``, and converted back to a pandas dataframe. The values
    for the last past year and the last forecast year are then pivoted by
    scenario and merged.

    Args:
        gbd_round_id (int):
            GBD round.
        tfr_version (str):
            Forecast TFR version.
        location_ids (list):
            List of location IDs to pull from both past and future data.
    Returns:
        tfr_final_df (pandas dataframe):
            Dataframe with all required TFR data, reshaped for downstream
            table production.
    """
    past_end = YEARS.past_end
    forecast_end = YEARS.forecast_end

    # Past TFR for the final past year
    keep_cols = ["year_id", "location_id", "mean_value", "lower_value",
                 "upper_value"]
    short_names = {"mean_value": "mean", "lower_value": "lower",
                   "upper_value": "upper"}
    past_df = get_covariate_estimates(covariate_id=149,
                                      gbd_round_id=gbd_round_id,
                                      location_id=location_ids,
                                      year_id=past_end,
                                      status="best")
    past_df = past_df[keep_cols].rename(columns=short_names)
    past_da = melt_to_xarray(past_df)

    # Forecast TFR, limited to the requested locations/scenarios/years
    future_path = f"{gbd_round_id}/future/tfr/{tfr_version}/tfr_combined.nc"
    future_da = open_xr(future_path).data
    future_da = future_da.sel(location_id=location_ids, scenario=SCENARIOS,
                              year_id=YEARS.forecast_years)

    # Stack past and future, then spread the quantiles into columns
    combined_da = xr.concat([past_da, future_da], dim="year_id")
    long_df = combined_da.to_dataframe().reset_index()
    wide_df = long_df.pivot_table(
        values="value",
        index=["location_id", "year_id", "scenario"],
        columns="quantile").reset_index()

    # Fold the value and its uncertainty interval into a single column
    wide_df = combine_mean_ui(wide_df, df_type="tfr")

    # Last past year (scenario 0 only) and last forecast year
    past_rows = wide_df.query(f"year_id == {past_end} and scenario==0")
    future_rows = wide_df.query(f"year_id == {forecast_end}")
    past_wide = pivot_scenarios(past_rows, f"{past_end}", SCENARIO_MAP,
                                df_type="tfr")
    future_wide = pivot_scenarios(future_rows, f"{forecast_end}",
                                  SCENARIO_MAP, df_type="tfr")

    return past_wide.merge(future_wide)
def get_5_year_haqi_cf(gbd_round_id,
                       decomp_step,
                       min_treat=0.1,
                       max_treat=0.75):
    """
    Build 5-year-band correction factors from the HAQi covariate (1099).

    The mean_raw values are later divided by these factors to adjust the
    estimates upward.

    Parameters:
        gbd_round_id: forwarded to get_covariate_estimates
        decomp_step: forwarded to get_covariate_estimates
        min_treat: float
            minimum access; the floor of the CF. With 0.1 the lowest possible
            CF is 0.1, in practice a 10x increase in the estimate
        max_treat: float or int
            maximum access; the cap on the covariate. Any loc/year at or
            above the cap gets a CF of 1 and its data is unchanged. A value
            below 1 is multiplied by 100 when the covariate's mean is above 1

    Returns:
        DataFrame with location_id, year_start, year_end and the band-mean
        haqi_cf.
    """
    haqi = get_covariate_estimates(covariate_id=1099,
                                   gbd_round_id=gbd_round_id,
                                   decomp_step=decomp_step)
    haqi = haqi.rename(columns={'year_id': 'year_start'})

    # A covariate mean above 1 combined with a cap below 1 is taken to mean
    # the covariate is on a 0-100 scale, so the cap is moved to that scale.
    covariate_mean_above_one = haqi.mean_value.mean() > 1
    if covariate_mean_above_one and max_treat < 1:
        warn_template = (
            "Increasing max_treat variable 100X. Mean of the HAQi column is larger \n"
            "        than 1. We assume this means the range is from 0 to 100. Summary stats for the \n"
            "        mean_value column in the haqi covar are \n {}")
        warnings.warn(warn_template.format(haqi.mean_value.describe()))
        max_treat = max_treat * 100

    # cap the covariate
    haqi.loc[haqi.mean_value > max_treat, 'mean_value'] = max_treat

    # smallest value present in the data
    # Note, should this just be 0.1?
    observed_min = haqi.mean_value.min()

    # rescale linearly so the observed minimum maps to min_treat and the cap
    # maps to 1
    position = (haqi['mean_value'] - observed_min) / (max_treat - observed_min)
    haqi['haqi_cf'] = min_treat + (1 - min_treat) * position

    # drop the years outside of hosp_data so year binner doesn't break
    haqi['year_end'] = haqi['year_start']
    warnings.warn("Currently dropping HAQi values before 1988 and after 2017")
    in_window = (haqi.year_start > 1987) & (haqi.year_start < 2018)
    haqi = hosp_prep.year_binner(haqi[in_window].copy())

    # average within each 5 year band
    band_cols = ['location_id', 'year_start', 'year_end']
    haqi = haqi.groupby(band_cols).agg({'haqi_cf': 'mean'}).reset_index()

    assert haqi.haqi_cf.max() <= 1, "The largest haqi CF is too big"
    assert haqi.haqi_cf.min() >= min_treat, "The smallest haqi CF is too small"

    return haqi
Example #8
0
def get_births(cfr_df):
    """
    Pull live births by sex (covariate 1106) for the locations in ``cfr_df``.

    Only the years 1990, 1995, 2000, 2005, 2010 and 2017 are kept. The
    columns ``year_id``, ``mean_value`` and ``sex_id`` are exposed as
    ``year``, ``births`` and ``sex``.

    Args:
        cfr_df: frame whose ``location_id`` values select the locations.

    Returns:
        DataFrame indexed by ``col_list`` and sorted on that index.
    """
    unused_cols = ['model_version_id', 'covariate_id', 'covariate_name_short',
                   'location_name', 'age_group_id', 'age_group_name',
                   'lower_value', 'upper_value']
    keep_years = [1990, 1995, 2000, 2005, 2010, 2017]

    births_df = get_covariate_estimates(covariate_id=1106, sex_id=[1, 2],
                                        gbd_round_id=5)
    births_df = births_df.drop(unused_cols, axis=1)
    births_df = births_df.rename(columns={"year_id": "year",
                                          "mean_value": "births",
                                          "sex_id": "sex"})
    births_df = births_df.loc[births_df['year'].isin(keep_years)]
    births_df = births_df.loc[
        births_df['location_id'].isin(cfr_df['location_id'].unique())]

    # NOTE(review): col_list is not defined in this function; it is presumably
    # a module-level list of index columns - confirm.
    births_df = births_df.set_index(col_list)
    # DataFrame.sortlevel() was removed in pandas 1.0; sort_index() is its
    # replacement.
    return births_df.sort_index()
Example #9
0
    def adjust_pms(self):
        '''Adjusts PMS (Pre-menstrual Syndrome) cases for pregnancy prevalence and incidence.

        Reads the PMS draws from ``self.me_dict`` under the "tot" source key,
        multiplies every draw column by ``1 - prop`` and stores the result
        under the "adj" target key, where

            prop = (asfr + asfr * sbr) * 46 / 52

        is built from covariate 13 (asfr) and covariate 2267 (sbr).
        '''
        pms_key = self.me_map["pms"]["srcs"]["tot"]
        adj_key = self.me_map["pms"]["trgs"]["adj"]

        covariate_index_dimensions = [
            'age_group_id', 'location_id', 'sex_id', 'year_id'
        ]

        pms_df = self.me_dict[pms_key]
        asfr_df = get_covariate_estimates(13, decomp_step=self.decomp_step)
        sbr_df = get_covariate_estimates(2267, decomp_step=self.decomp_step)

        asfr_df = asfr_df.filter(covariate_index_dimensions + ['mean_value'])
        # Assumes the still birth rates covariate is reported for all_ages
        # and both sexes. Hence we ignore that and merge only on loc and year
        sbr_df = sbr_df.filter(['location_id', 'year_id', 'mean_value'])

        asfr_df = asfr_df.rename(columns={'mean_value': 'asfr_mean'})
        sbr_df = sbr_df.rename(columns={'mean_value': 'sbr_mean'})

        prop_df = asfr_df.merge(sbr_df,
                                how='inner',
                                on=['location_id', 'year_id'])
        # Zero the still birth rate for sex_id 1. The assignment has to be a
        # single .loc call: `prop_df[mask].sbr_mean = 0` writes to a temporary
        # copy and leaves prop_df unchanged.
        prop_df.loc[prop_df.sex_id == 1, 'sbr_mean'] = 0

        prop_df['prop'] = (prop_df.asfr_mean +
                           (prop_df.asfr_mean * prop_df.sbr_mean)) * 46 / 52
        prop_df.set_index(covariate_index_dimensions, inplace=True)

        # Move measure_id out of the index so the remaining index levels can
        # be matched against prop_df's index in the merge below.
        adj_df = pms_df.copy()
        adj_df.reset_index(level='measure_id', inplace=True)
        adj_df = adj_df.merge(prop_df.prop,
                              how='inner',
                              left_index=True,
                              right_index=True)

        for col in self.draw_cols:
            adj_df[col] = adj_df[col] * (1 - adj_df.prop)

        adj_df.drop(columns='prop', inplace=True)

        self.me_dict[adj_key] = adj_df
Example #10
0
 def get_live_births_summaries(self):
     """Pull covariate 1106 (live births by sex) into ``self.lv_bir_frame``.

     The demographics come from ``self.dim_births``; the stored frame has the
     columns location_id, year_id, sex_id and births.
     """
     live_births_covariate = 1106  # live births by sex
     estimates = get_covariate_estimates(
         covariate_id=live_births_covariate,
         gbd_round_id=self.gbdr,
         location_id=self.dim_births['location_id'],
         year_id=self.dim_births['year_id'],
         sex_id=self.dim_births['sex_id'])
     keep_cols = ['location_id', 'year_id', 'sex_id', 'mean_value']
     self.lv_bir_frame = estimates[keep_cols].rename(
         columns={'mean_value': 'births'})
Example #11
0
def run_shared_funcs(mat):
    """
    Pull the central inputs: population plus the asfr and ifd covariates.

    Population (sex_id 2 only) and asfr (covariate 13) are requested for the
    locations and age groups found in ``mat`` and the years 1988-2017; ifd
    (covariate 51) is pulled without any filters.

    Returns:
        tuple of (pop, asfr, ifd)
    """
    demographics = {
        'age_group_id': mat.age_group_id.unique().tolist(),
        'location_id': mat.location_id.unique().tolist(),
        'year_id': list(np.arange(1988, 2018, 1)),
    }

    pop = get_population(sex_id=[2], **demographics)

    # asfr has age/location/year
    asfr = get_covariate_estimates(covariate_id=13, **demographics)
    ifd = get_covariate_estimates(covariate_id=51)
    return pop, asfr, ifd
Example #12
0
def get_gbd_covariate_estimates(
        covariate_id: int,
        covariate_name_short: str,
        location_set_id: int,
        year_ids: List[int],
        gbd_round_id: int,
        decomp_step: str
) -> pd.DataFrame:
    """
    Fetch the estimates of one GBD covariate.

    The covariate's mean value column is renamed to ``covariate_name_short``
    and only the demographic columns plus that column are returned.

    Args:
        covariate_id: ID of the covariate to pull
        covariate_name_short: short name of the covariate; becomes the name
            of the value column
        location_set_id: location set to pull
        year_ids: years to pull
        gbd_round_id: GBD round to pull
        decomp_step: decomp step to pull; not forwarded for GBD rounds
            before 6

    Returns:
        Dataframe of demographic information and covariate estimates

    Raises:
        ValueError: if the query comes back empty, i.e. the covariate has no
            best values for the GBD round and decomp step
    """
    logging.info(f'Pulling covariate {covariate_name_short}')

    if gbd_round_id < 6:
        step_for_query = None
    else:
        step_for_query = decomp_step

    raw_df = db_queries.get_covariate_estimates(
        covariate_id,
        location_set_id=location_set_id,
        year_id=year_ids,
        gbd_round_id=gbd_round_id,
        decomp_step=step_for_query)
    renamed_df = raw_df.rename(
        columns={columns.MEAN_VALUE: covariate_name_short})
    covariate_df = renamed_df[columns.DEMOGRAPHICS + [covariate_name_short]]

    if covariate_df.empty:
        raise ValueError(
            f'No best values for covariate {covariate_name_short} for '
            f'gbd round ID {gbd_round_id}, decomp step {decomp_step}'
        )

    return covariate_df
Example #13
0
def smoking(hierarchy: pd.DataFrame) -> pd.DataFrame:
    """Return the 2019 smoking covariate by location, filled down ``hierarchy``.

    The estimates are indexed by location_id, passed through
    parent_inheritance and squeezed before being returned.
    """
    estimates = db_queries.get_covariate_estimates(
        covariate_id=282,  # Smoking Prevalence (Age-standardized, both sexes)
        year_id=2019,
        gbd_round_id=6,
        decomp_step='iterative',
    )
    by_location = estimates.loc[:, ['location_id', 'mean_value']]
    by_location = by_location.rename(columns={'mean_value': 'smoking'})
    by_location = by_location.set_index('location_id').sort_index()

    # pass down hierarchy
    inherited = parent_inheritance(by_location, hierarchy)
    return inherited.squeeze()
Example #14
0
def uhc(hierarchy: pd.DataFrame) -> pd.DataFrame:
    """Return the 2019 UHC covariate by location, filled down ``hierarchy``.

    The estimates are indexed by location_id, passed through
    parent_inheritance and squeezed before being returned.
    """
    estimates = db_queries.get_covariate_estimates(
        covariate_id=1097,  # Universal health coverage
        year_id=2019,
        gbd_round_id=6,
        decomp_step='iterative',
    )
    by_location = estimates.loc[:, ['location_id', 'mean_value']]
    by_location = by_location.rename(columns={'mean_value': 'uhc'})
    by_location = by_location.set_index('location_id').sort_index()

    # pass down hierarchy
    inherited = parent_inheritance(by_location, hierarchy)
    return inherited.squeeze()
Example #15
0
def haq(hierarchy: pd.DataFrame) -> pd.DataFrame:
    """Return the 2019 HAQ covariate by location, filled down ``hierarchy``.

    The estimates are indexed by location_id, passed through
    parent_inheritance and squeezed before being returned.
    """
    estimates = db_queries.get_covariate_estimates(
        covariate_id=1099,  # Healthcare access and quality index
        year_id=2019,
        gbd_round_id=6,
        decomp_step='iterative',
    )
    by_location = estimates.loc[:, ['location_id', 'mean_value']]
    by_location = by_location.rename(columns={'mean_value': 'haq'})
    by_location = by_location.set_index('location_id').sort_index()

    # pass down hierarchy
    inherited = parent_inheritance(by_location, hierarchy)
    return inherited.squeeze()
Example #16
0
def obesity(hierarchy: pd.DataFrame) -> pd.DataFrame:
    """Return the 2019 obesity covariate by location, filled down ``hierarchy``.

    The covariate's rows are averaged within each location_id, passed
    through parent_inheritance and squeezed before being returned.
    """
    estimates = db_queries.get_covariate_estimates(
        covariate_id=455,  # Prevalence of obesity (age-standardized)
        year_id=2019,
        gbd_round_id=6,
        decomp_step='iterative',
    )

    # just averaging sexes here...
    location_means = estimates.groupby('location_id')['mean_value'].mean()
    by_location = location_means.rename('obesity').to_frame()

    # pass down hierarchy
    inherited = parent_inheritance(by_location, hierarchy)
    return inherited.squeeze()
Example #17
0
 def get_asfr(self):
     '''Pulls the age-specific fertility rate (covariate 13) used in the
     live birth calculation.

     Returns location_id, year_id, age_group_id, sex_id and asfr, with asfr
     forced to zero for every age_group_id outside 7-15.'''
     estimates = get_covariate_estimates(covariate_id=13,
                                         location_id=self.most_detailed_locs,
                                         sex_id=2,
                                         age_group_id=self.most_detailed_ages,
                                         year_id=self.year_id,
                                         status='best')
     estimates = estimates.rename(columns={'mean_value': 'asfr'})
     asfr = estimates[['location_id', 'year_id', 'age_group_id', 'sex_id',
                       'asfr']]
     # The most detailed age groups were pulled; age groups outside of our
     # maternal age range get an asfr of zero.
     outside_maternal_ages = ~asfr.age_group_id.isin(range(7, 16))
     asfr.loc[outside_maternal_ages, 'asfr'] = 0.
     return asfr
Example #18
0
 def get_asfr(self):
     '''Pulls the age-specific fertility rate (covariate 13) used in the
     live birth calculation.

     Returns location_id, year_id, age_group_id, sex_id and asfr, with asfr
     forced to zero for every age_group_id outside 7-15.'''
     estimates = get_covariate_estimates(covariate_id=13,
                                         location_id=self.most_detailed_locs,
                                         sex_id=2,
                                         age_group_id=self.most_detailed_ages,
                                         year_id=self.year_id,
                                         status='best',
                                         decomp_step=self.decomp_step)
     estimates = estimates.rename(columns={'mean_value': 'asfr'})
     asfr = estimates[['location_id', 'year_id', 'age_group_id', 'sex_id',
                       'asfr']]
     # The maternal age range is 10 to 54 years old, which corresponds to
     # age_group_ids 7 to 15 (inclusive). Every age group outside of that
     # range is set to zero.
     outside_maternal_ages = ~asfr.age_group_id.isin(list(range(7, 16)))
     asfr.loc[outside_maternal_ages, 'asfr'] = 0.
     return asfr
Example #19
0
def get_asfr(df, decomp_step):
    """
    Pull ASFR (covariate 13, sex_id 2) for the demographics present in ``df``.

    The Stata script which applied the maternal adjustment is no longer being
    used, so the adjustment is now done in Python. It is merely the
    sample size * asfr, then cases / sample size; this function supplies the
    asfr values.

    Returns:
        DataFrame with location_id, year_start, age_group_id, sex_id and
        mean_value.
    """
    asfr = get_covariate_estimates(
        covariate_id=13,
        decomp_step=decomp_step,
        sex_id=2,
        age_group_id=df['age_group_id'].unique().tolist(),
        location_id=df['location_id'].unique().tolist(),
        year_id=df['year_start'].unique().tolist())

    keep_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id',
                 'mean_value']
    return asfr[keep_cols].rename(columns={'year_id': 'year_start'})
def get_5_year_haqi_cf(min_treat=10, max_treat=75):
    """
    Build 5-year-band correction factors from the HAQi covariate (1099).

    The mean_raw values are later divided by these factors to adjust the
    estimates.

    Parameters:
        min_treat: float
            minimum access; the floor of the CF. The CF is a proportion in
            (0, 1], so a value above 1 is read as a percent: the default 10
            gives a lowest possible CF of 0.1. Values of 1 or below are used
            as given.
        max_treat: float or int
            maximum access; the cap on the covariate. If 75 then any loc/year
            with a covariate above 75 will have a CF of 1 and the data will
            be unchanged

    Returns:
        DataFrame with location_id, year_start, year_end and the band-mean
        haqi_cf.
    """
    # Previously min_treat was plugged into the formula on the covariate's
    # 0-100 scale, which produced CFs between 1 and 10 and made both
    # assertions at the end fail. Put the floor on the CF's 0-1 scale.
    cf_floor = min_treat / 100.0 if min_treat > 1 else min_treat

    # get a dataframe of haqi covariate estimates
    df = get_covariate_estimates(covariate_id=1099)
    df = df.rename(columns={'year_id': 'year_start'})

    # set the max value
    df.loc[df.mean_value > max_treat, 'mean_value'] = max_treat

    # get min df present in the data
    min_df = df.mean_value.min()

    # make the correction: the observed minimum maps to cf_floor, the cap
    # maps to 1
    df['haqi_cf'] = \
        cf_floor + (1 - cf_floor) * ((df['mean_value'] - min_df) / (max_treat - min_df))

    # drop the early years so year binner doesn't break
    df['year_end'] = df['year_start']
    df = df[df.year_start > 1987].copy()
    df = hosp_prep.year_binner(df)

    # Take the average of each 5 year band
    df = df.groupby(['location_id', 'year_start', 'year_end']).agg({
        'haqi_cf':
        'mean'
    }).reset_index()

    assert df.haqi_cf.max() <= 1, "The largest haqi CF is too big"
    assert df.haqi_cf.min() >= cf_floor, "The smallest haqi CF is too small"

    return df
Example #21
0
def get_covariates(covariate_id, covariate_model_id, location_set_version_id,
                   gbd_round_id, decomp_step_id, db_connection,
                   standard_location_set_version_id):
    """
    Pull one covariate model version as a modelling-ready data frame.

    The estimates are limited to the locations returned by get_location_info.
    The result has the columns age, sex, year, location_id and name, plus the
    covariate values in a column named after the covariate's short name
    (taken from the first row).
    """
    logger.info('Getting covariate estimates for covariate_id {} and '
                'covariate_model_id {} and decomp step {}.'.format(
                    covariate_id, covariate_model_id, decomp_step_id))
    loc_df = get_location_info(
        location_set_version_id=location_set_version_id,
        standard_location_set_version_id=standard_location_set_version_id,
        db_connection=db_connection)
    valid_locations = loc_df.location_id.values.tolist()

    estimates = get_covariate_estimates(
        covariate_id=covariate_id,
        model_version_id=covariate_model_id,
        location_set_version_id=location_set_version_id,
        decomp_step=decomp_step_from_decomp_step_id(decomp_step_id),
        gbd_round_id=gbd_round_id)
    estimates = estimates.loc[estimates.location_id.isin(valid_locations)]

    source_cols = ['covariate_name_short', 'age_group_id', 'sex_id',
                   'year_id', 'location_id', 'mean_value']
    short_names = {
        'age_group_id': 'age',
        'sex_id': 'sex',
        'year_id': 'year',
        'covariate_name_short': 'name'
    }
    output_order = ['mean_value', 'age', 'sex', 'year', 'location_id', 'name']
    estimates = estimates[source_cols].rename(columns=short_names)
    estimates = estimates[output_order]

    # the value column takes the covariate's own short name
    value_name = estimates['name'].values[0]
    return estimates.rename(columns={'mean_value': value_name})
Example #22
0
def get_live_births():
    """
    Pull live births (covariate 1106) and append a combined location 44849.

    Rows for locations 4841 and 4871 (Telangana and Andhra Pradesh) are
    summed into an extra "Old Andhra Pradesh" location. The result is sorted
    by location, year and sex and gains the convenience columns ``sex``
    (male/female/both), ``year`` and ``births``.
    """
    births = get_covariate_estimates(covariate_id=1106)

    id_cols = ['model_version_id', 'covariate_id', 'covariate_name_short',
               'location_id', 'location_name', 'year_id', 'age_group_id',
               'age_group_name', 'sex_id']
    value_cols = ['mean_value', 'lower_value', 'upper_value']

    # Relabel the two states as one location, then sum their values
    is_split_state = births['location_id'].isin([4841, 4871])
    combined = births.loc[is_split_state].copy(deep=True)
    combined['location_id'] = 44849
    combined['location_name'] = "Old Andhra Pradesh"
    combined = combined.groupby(id_cols)[value_cols].sum().reset_index()

    births = pd.concat([births, combined]).reset_index(drop=True)
    births = births.sort_values(['location_id', 'year_id', 'sex_id'])

    for sex_id, label in ((1, "male"), (2, "female"), (3, "both")):
        births.loc[births['sex_id'] == sex_id, 'sex'] = label
    births['year'] = births['year_id']
    births['births'] = births['mean_value']

    return births
def generate_covariate_data():
    """Return covariate 881 for age_group_id 22 in 2016.

    The result is an independent copy holding ``location_id`` and the
    covariate's mean value under the name ``sdi_value``.
    """
    estimates = get_covariate_estimates(covariate_id=881)
    wanted = (estimates.age_group_id == 22) & (estimates.year_id == 2016)
    subset = estimates[wanted].rename(columns={'mean_value': 'sdi_value'})
    return subset[['location_id', 'sdi_value']].copy(deep=True)
Example #24
0
    def execute(self):
        """Submit one adjustment job per most-detailed location, then a save job.

        An empty output directory is created for the target ME, the location
        metadata and covariate 46 are exported as CSVs for the jobs to read,
        one Stata job per location is submitted with qsub, and finally a save
        job is submitted that holds on all of them.

        Returns:
            The name of the save job.
        """
        src_meid = 2929
        src_mvid = 326309
        trg_meid = 18777

        # always start from an empty output directory
        out_dir = os.path.join(base, str(trg_meid))
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)
        data_dir = base  # where the loc_meta data will go

        # export data for individual job import to reduce hits to database
        # location_set_id 22 has the same locations as 35
        loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=5)
        detailed_locs = loc_meta.loc[loc_meta.most_detailed == 1,
                                     'location_id'].tolist()
        meta_cols = [
            'location_id', 'super_region_id', 'super_region_name', 'region_id',
            'region_name', 'ihme_loc_id'
        ]
        loc_meta[meta_cols].to_csv(
            os.path.join(data_dir, 'location_metadata.csv'),
            index=False,
            encoding='utf8')

        iod_salt = get_covariate_estimates(covariate_id=46, gbd_round_id=5)
        iod_salt.to_csv(os.path.join(data_dir, 'iod_salt_cov.csv'),
                        index=False,
                        encoding='utf8')

        adjust_template = ('qsub -hold_jid {hj} -pe multi_slot 2'
                           ' -cwd -P proj_custom_models'
                           ' -o {o}'
                           ' -e {e}'
                           ' -N {jn}'
                           ' stata_shell.sh'
                           ' prep_cretin_dismod_output_for_upload.do'
                           ' {arg1} {arg2} {arg3} {arg4}')
        submitted = []
        for loc in detailed_locs:
            job_name = "adjust_iodID_{}".format(loc)
            submitted.append(job_name)
            adjust_call = adjust_template.format(hj=self.hold,
                                                 o=output_path,
                                                 e=error_path,
                                                 jn=job_name,
                                                 arg1=out_dir,
                                                 arg2=data_dir,
                                                 arg3=loc,
                                                 arg4=src_mvid)
            subprocess.call(adjust_call, shell=True)

        # Save the results. The hold list prefixes every job name with a
        # comma, so it begins with a comma.
        save_hold = ''.join("," + name for name in submitted)
        iod_adj_description = "Adjustments made on meid {}, mvid {}".format(
            src_meid, src_mvid)

        save_params = [
            str(trg_meid), "--description",
            "\'{}\'".format(iod_adj_description), "--input_dir", out_dir,
            "--best", "--sexes", "1", "2", "--meas_ids", "5", "--file_pattern",
            "{measure_id}_{location_id}_{year_id}_{sex_id}.csv"
        ]

        save_job_name = 'iod_adj_save_{}'.format(trg_meid)
        save_template = ('qsub -hold_jid {hj} -pe multi_slot 13'
                         ' -cwd -P proj_custom_models'
                         ' -o {o}'
                         ' -e {e}'
                         ' -N {jn}'
                         ' python_shell.sh'
                         ' save.py'
                         ' {arg1}')
        save_call = save_template.format(hj=save_hold,
                                         o=output_path,
                                         e=error_path,
                                         jn=save_job_name,
                                         arg1=' '.join(save_params))
        subprocess.call(save_call, shell=True)
        return save_job_name
Example #25
0
def fix_maternal_denominators(df, return_only_maternal=False):
    """
    Divide the estimates of maternal causes by ASFR.

    ASFR is reshaped to the age_start/age_end/year_start/year_end layout of
    ``df`` and merged on. For every row whose cause_id sits under cause 366
    in the cause hierarchy, ``product``, ``upper_product`` and
    ``lower_product`` are divided by the ASFR ``mean_value``. Rows where the
    resulting ``product`` is null or infinite get 0 in all three columns.

    Args:
        df: hospital data with age_start, age_end, year_start, year_end,
            location_id, sex_id, cause_id and the three product columns.
        return_only_maternal: when truthy, return only the adjusted maternal
            rows; otherwise return the non-maternal rows followed by the
            adjusted maternal rows.

    Returns:
        DataFrame without the helper columns ``mean_value`` and ``cause_id``.

    Raises:
        AssertionError: if a merge changes the row count, no ASFR value could
            be attached, or either the maternal or the non-maternal subset is
            empty.
    """
    asfr = get_covariate_estimates(QUERY)

    # keep age/location/year and the critical mean_value
    asfr = asfr[['location_id', 'year_id', 'age_group_id', 'sex_id',
                 'mean_value']].drop_duplicates()

    # map age_start and age_end onto asfr
    age_group = query("QUERY")
    pre_asfr = asfr.shape[0]
    asfr = asfr.merge(age_group, how='left', on='age_group_id')
    assert pre_asfr == asfr.shape[0],\
        "The merge duplicated rows unexpectedly"
    asfr = asfr.drop('age_group_id', axis=1)
    asfr = asfr.rename(columns={'age_group_years_start': 'age_start',
                                'age_group_years_end': 'age_end'})
    # create year_start and year_end
    asfr['year_start'] = asfr['year_id']
    asfr['year_end'] = asfr['year_id']
    asfr = asfr.drop('year_id', axis=1)

    # all the mean_values in asfr where age_end is less than one are 0, so we
    # can make up an asfr group for age start = 0 and age_end = 1
    asfr.loc[asfr['age_end'] < 1, 'age_end'] = 1
    asfr.loc[asfr['age_start'] < 1, 'age_start'] = 0

    ends_after_one = asfr['age_end'] > 1
    asfr.loc[ends_after_one, 'age_end'] = \
        asfr.loc[ends_after_one, 'age_end'] - 1

    # one more change, asfr has the max age end as 125 (now 124), and we want
    # it to be 99 so asfr age_start and age_end match our hospital data
    asfr.loc[asfr['age_end'] == 124, 'age_end'] = 99

    # and in case we created duplicated rows by doing this:
    asfr = asfr.drop_duplicates()

    # MERGE ASFR ONTO HOSP
    pre_shape = df.shape[0]
    df = df.merge(asfr, how='left', on=['age_start', 'age_end', 'year_start',
                                        'year_end', 'location_id', 'sex_id'])
    assert df.mean_value.isnull().sum() != df.shape[0],\
        "The merge failed to attach any mean_values"
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # GET MATERNAL CAUSES
    causes = get_cause_metadata(QUERY)
    maternal_causes = causes[causes.path_to_top_parent.str.contains("366")]
    maternal_list = list(maternal_causes['cause_id'].unique())

    # Split the data. The maternal rows are copied explicitly because they
    # are modified below.
    is_maternal = df['cause_id'].isin(maternal_list)
    maternal_df = df[is_maternal].copy()
    assert maternal_df.shape[0] != 0,\
        "The maternal dataframe is empty"

    df = df[~is_maternal]
    assert df.shape[0] != 0,\
        "The hospital dataframe is empty"

    # maternal_df holds maternal causes only, so the division is applied to
    # all of its rows at once instead of cause by cause.
    rate_cols = ['product', 'upper_product', 'lower_product']
    for col in rate_cols:
        maternal_df[col] = maternal_df[col] / maternal_df['mean_value']

    # some mean_values were zero, this is effectively an age/sex restriction:
    # rows whose product came out null or infinite get a rate of 0
    undefined = maternal_df['product'].isnull() | \
        np.isinf(maternal_df['product'])
    maternal_df.loc[undefined, rate_cols] = 0

    if return_only_maternal:
        return maternal_df.drop(['mean_value', 'cause_id'], axis=1)

    # bring data back together and drop the ASFR info
    df = pd.concat([df, maternal_df])
    return df.drop(['mean_value', 'cause_id'], axis=1)
Example #26
0
def fix_maternal_denominators(df, return_only_maternal=False):
    """Divide maternal-cause estimates by the age-specific fertility rate.

    Steps:
      1. Pull ASFR (covariate 13) and reshape its age/year columns so they
         line up with the hospital data's ``age_start``/``age_end`` and
         ``year_start``/``year_end``.
      2. Left-merge ASFR's ``mean_value`` onto ``df``.
      3. For rows whose ``cause_id`` is a maternal cause (cause set 9,
         "366" in ``path_to_top_parent``), divide ``product``,
         ``upper_product`` and ``lower_product`` by ``mean_value``.
      4. Where the divided ``product`` is null or infinite, set all three
         columns to 0.
      5. Drop the helper columns ``mean_value`` and ``cause_id``.

    Args:
        df: DataFrame holding at least the columns ``age_start``,
            ``age_end``, ``year_start``, ``year_end``, ``location_id``,
            ``sex_id``, ``cause_id``, ``product``, ``upper_product`` and
            ``lower_product``. It is not modified in place.
        return_only_maternal: When true, return only the maternal rows;
            otherwise return the non-maternal rows followed by the adjusted
            maternal rows.

    Returns:
        DataFrame without the ``mean_value`` and ``cause_id`` columns.

    Raises:
        AssertionError: If either merge changes the row count, if no
            ``mean_value`` could be attached, or if the maternal or the
            non-maternal subset is empty.
    """
    value_cols = ['product', 'upper_product', 'lower_product']

    # GET ASFR
    # has age/location/year
    asfr = get_covariate_estimates(covariate_id=13)

    # keep age/location/year and the critical mean_value.  drop_duplicates
    # returns a new frame, so nothing below mutates a slice of the raw pull.
    asfr = asfr[[
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'mean_value'
    ]].drop_duplicates()

    # map age_start and age_end onto asfr
    age_group = query("QUERY")
    pre_asfr = asfr.shape[0]
    asfr = asfr.merge(age_group, how='left', on='age_group_id')
    assert pre_asfr == asfr.shape[0],\
        "The merge duplicated rows unexpectedly"
    asfr = asfr.drop('age_group_id', axis=1).rename(columns={
        'age_group_years_start': 'age_start',
        'age_group_years_end': 'age_end'
    })

    # create year_start and year_end
    asfr['year_start'] = asfr['year_id']
    asfr['year_end'] = asfr['year_id']
    asfr = asfr.drop('year_id', axis=1)

    # NOTE: do NOT simply subtract 1 from every age_end.  asfr has several
    # under-one age groups while the hospital data has a single 0-1 group,
    # and a blanket subtraction would turn an age_end of 1 into 0.

    # all the mean_values in asfr where age_end is less than one are 0, so
    # we can make up an asfr group for age_start = 0 and age_end = 1
    asfr.loc[asfr['age_end'] < 1, 'age_end'] = 1
    asfr.loc[asfr['age_start'] < 1, 'age_start'] = 0

    # IMPORTANT: the hospital data has age_end as 14, 19, 24, ... while asfr
    # has age_end as 15, 20, 25, ...
    older = asfr['age_end'] > 1
    asfr.loc[older, 'age_end'] = asfr.loc[older, 'age_end'] - 1

    # asfr has the max age_end as 125 (now 124), and we want it to be 99 so
    # that asfr age_start/age_end match the hospital data
    asfr.loc[asfr['age_end'] == 124, 'age_end'] = 99

    # collapsing the under-one groups may have produced duplicated rows
    asfr = asfr.drop_duplicates()

    # MERGE ASFR ONTO HOSP
    pre_shape = df.shape[0]
    df = df.merge(asfr,
                  how='left',
                  on=[
                      'age_start', 'age_end', 'year_start', 'year_end',
                      'location_id', 'sex_id'
                  ])
    assert df.mean_value.isnull().sum() != df.shape[0],\
        "The merge failed to attach any mean_values"
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # GET MATERNAL CAUSES
    causes = get_cause_metadata(cause_set_id=9)
    # NOTE(review): str.contains does a substring (regex) match, so any path
    # that merely contains the characters "366" also matches -- confirm that
    # no unrelated cause id embeds "366".
    condition = causes.path_to_top_parent.str.contains("366")
    maternal_causes = causes[condition]
    maternal_list = list(maternal_causes['cause_id'].unique())

    # split the data into maternal and non-maternal rows.  The maternal part
    # is copied explicitly because it is modified below.
    is_maternal = df['cause_id'].isin(maternal_list)
    maternal_df = df[is_maternal].copy()
    assert maternal_df.shape[0] != 0,\
        "The maternal dataframe is empty"

    df = df[~is_maternal]
    assert df.shape[0] != 0,\
        "The hospital dataframe is empty"

    # every row of maternal_df belongs to exactly one maternal cause, so a
    # single vectorised division covers what a per-cause loop would do
    for col in value_cols:
        maternal_df[col] = maternal_df[col] / maternal_df['mean_value']

    # a missing mean_value (or 0 / 0) gives null, and a zero mean_value gives
    # inf; this is effectively an age/sex restriction, so assign a rate of 0
    unusable = (maternal_df['product'].isnull()
                | np.isinf(maternal_df['product']))
    maternal_df.loc[unusable, value_cols] = 0

    if return_only_maternal:
        return maternal_df.drop(['mean_value', 'cause_id'], axis=1)

    # bring data back together, then drop the ASFR info
    df = pd.concat([df, maternal_df])
    return df.drop(['mean_value', 'cause_id'], axis=1)