Esempio n. 1
0
def get_epi_inc(dim, cv, years, codes, ndraws):
    """Load EN incidence draws and attach under-1 population fractions.

    Steps visible in this block:
      1. Read the age-id lookup CSV from ``code_dir``.
      2. Pull incidence draws through ``SuperGopher`` for the locations and
         sexes in ``dim`` and the requested ``years``.
      3. Replace the ``age`` column with ``age_group_id`` via the lookup.
      4. Copy the ``age_group_id == 2`` rows twice, relabelled 3 and 4.
      5. Build ``pop_fraction`` = population / total population of the
         collapsed age bucket (all ages < 1 collapse to 0).
      6. Left-merge those fractions onto the draws.

    Args:
        dim: object exposing ``index_dim.get_level(name)``.
        cv: not used in the visible part of this function.
        years: passed to ``SuperGopher.content`` as ``year_id``.
        codes: not used in the visible part of this function.
        ndraws: not used in the visible part of this function.

    NOTE(review): the visible body stops after building ``cols`` and returns
    nothing -- the definition looks truncated; confirm against the full file.
    """
    # Age lookup; 'age_start' is renamed so it joins on the draws' 'age'.
    age_ids = pd.read_csv(
        os.path.join(code_dir, "convert_to_new_age_ids.csv")
    ).rename(columns={'age_start': 'age'})

    # EN incidence -- not split into short-term / long-term because all
    # incidence is captured in short-term.
    estim_sg = SuperGopher({'file_pattern': 'FILEPATH'},
                           os.path.join('FILEPATH'))
    estim_df = estim_sg.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=years,
        sex_id=dim.index_dim.get_level("sex_id"))

    # Swap the raw 'age' column for 'age_group_id'.
    estim_df = pd.merge(estim_df, age_ids, on='age')
    estim_df.drop('age', inplace=True, axis=1)

    # Triplicate age group 2 so the under-1 incidence can be redistributed
    # across age groups 2, 3 and 4 using population fractions.
    # .loc replaces the removed .ix indexer, and .copy() makes the
    # relabelling an assignment on an independent frame rather than a slice.
    under1_mask = estim_df['age_group_id'] == 2
    todupe1 = estim_df.loc[under1_mask].copy()
    todupe1['age_group_id'] = 3
    todupe2 = estim_df.loc[under1_mask].copy()
    todupe2['age_group_id'] = 4
    # pd.concat is the supported replacement for DataFrame.append.
    estim_df = pd.concat([estim_df, todupe1, todupe2])

    # Population comes from a flat file so the database is not queried on
    # every call.
    pops = pd.read_stata(os.path.join(root_j_dir, "FILEPATH"))

    # MAKE POPULATION FRACTIONS
    fullpops = pd.merge(pops, age_ids, on='age_group_id')
    fullpops['collapsed_age'] = fullpops['age']
    fullpops.loc[fullpops.age < 1, 'collapsed_age'] = 0

    # Total population per collapsed age bucket (all ages < 1 summed).
    pops = fullpops.copy()
    pops.loc[pops.age < 1, 'age'] = 0
    pops = pops[['location_id', 'year_id', 'sex_id', 'age', 'population']]
    pops = pops.groupby(['location_id', 'year_id', 'sex_id',
                         'age']).sum().reset_index()
    pops = pops.rename(columns={
        'population': 'total_pop',
        'age': 'collapsed_age'
    })

    popfracts = pd.merge(
        fullpops,
        pops,
        on=['location_id', 'year_id', 'collapsed_age', 'sex_id'])
    popfracts['pop_fraction'] = (
        popfracts['population'] / popfracts['total_pop'])
    popfracts = popfracts[[
        'age_group_id', 'location_id', 'year_id', 'sex_id', 'pop_fraction'
    ]]

    # Redistribute incidence under 1 (all other pop fractions should be 1).
    estim_df = pd.merge(
        estim_df,
        popfracts,
        how='left',
        on=['location_id', 'year_id', 'sex_id', 'age_group_id'])

    # str(i) replaces the Python-2-only backtick repr syntax.
    # NOTE(review): range(0, 999) yields draw_0..draw_998 (999 names); other
    # code in this file uses range(1000) -- confirm whether draw_999 is
    # intentionally excluded.
    cols = ["draw_" + str(i) for i in range(0, 999)]
Esempio n. 2
0
def compute_global_ratios(year_id, drawcols):
    """Compute per-age YLL ratios between residual causes and their targets.

    Reads global (``location_id=1``) YLL draws (``measure_id=4``) for both
    sexes from the best CoDCorrect output version, then for every
    ``input_cause_id`` group in the module-level ``rkey`` frame computes

        mean-over-draws(YLLs of the residual cause)
        / mean-over-draws(YLLs of its ``ratio_cause_id`` causes)

    after summing within ``age_group_id`` / ``year_id`` (i.e. across sexes).
    Infinite and missing ratios are set to 0.

    Args:
        year_id: year filter passed to ``SuperGopher.content``.
        drawcols: list of draw column names to sum and average.

    Returns:
        DataFrame with ``age_group_id``, ``year_id``, the ratio column,
        ``cause_id`` and ``sex_id``; the same ratios are emitted once with
        ``sex_id=1`` and once with ``sex_id=2``.

    Raises:
        ValueError: from ``pd.concat`` if ``rkey`` yields no groups.
    """
    eng = ezfuncs.get_engine(conn_def="cod")
    ccv = pd.read_sql("""
        SELECT output_version_id FROM cod.output_version
        WHERE code_version=4 AND is_best=1""", eng).squeeze()
    sg = SuperGopher({
        'file_pattern': '{measure_id}_{location_id}.h5',
        'h5_tablename': 'draws'},
        'filepath/codcorrect/{ccv}/draws'.format(ccv=ccv))
    ylls = sg.content(location_id=1, year_id=year_id, sex_id=[1, 2],
                      measure_id=4)

    # Level at which inputs are aggregated; invariant across causes.
    group_cols = ['age_group_id', 'year_id']

    ratios = []
    for resid_cid, yldmap in rkey.groupby('input_cause_id'):
        # YLLs of the residual cause and of the causes it is compared to.
        these_ylls = ylls[ylls.cause_id == resid_cid]
        ratio_ylls = ylls[ylls.cause_id.isin(yldmap.ratio_cause_id.unique())]

        # Sum within age/year, then average across draws.
        these_ylls = these_ylls.groupby(group_cols)[drawcols].sum()
        these_ylls = these_ylls.mean(axis=1)
        ratio_ylls = ratio_ylls.groupby(group_cols)[drawcols].sum()
        ratio_ylls = ratio_ylls.mean(axis=1)

        # Division by zero gives inf (x/0) or NaN (0/0, or an age/year
        # present on only one side); both are treated as a ratio of 0.
        ratio = (these_ylls / ratio_ylls).reset_index()
        ratio = ratio.replace(np.inf, 0)
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        ratio = ratio.replace(np.nan, 0)

        ratio["cause_id"] = resid_cid
        ratios.append(ratio)

    df = pd.concat(ratios)

    # The ratios were computed across sexes; emit one copy per sex.
    # pd.concat is the supported replacement for DataFrame.append.
    df_male = df.copy()
    df_male["sex_id"] = 1
    df_female = df.copy()
    df_female["sex_id"] = 2
    return pd.concat([df_male, df_female])
Esempio n. 3
0
def get_annual_inc(dim, cv, years, codes, ndraws):
    """Load annual inpatient + outpatient incidence and pass it to ``injize``.

    Both sources are read through ``SuperGopher`` for the locations and sexes
    in ``dim``, filtered to the first ``year_id`` in ``dim``, stacked, stripped
    of the ``inpatient`` column and handed to ``injize`` with ``measure=6``,
    ``fixage=True`` and ``shock=True``.

    Args:
        dim: object exposing ``index_dim.get_level(name)``.
        cv: forwarded to ``injize``.
        years: forwarded to ``injize`` as ``years``.
        codes: forwarded to ``injize`` as ``codes``.
        ndraws: forwarded to ``injize`` as ``ndraws``.

    Returns:
        Whatever ``injize`` returns for the combined frame.
    """
    # Only the first year in the dimensions is kept.
    year_id = dim.index_dim.get_level("year_id")[0]

    # Inpatient incidence.
    annual_inp = SuperGopher({'file_pattern': 'FILEPATH'},
                             os.path.join('FILEPATH'))
    annual_inp_df = annual_inp.content(
        location_id=dim.index_dim.get_level("location_id"),
        sex_id=dim.index_dim.get_level("sex_id"))
    annual_inp_df = annual_inp_df.loc[annual_inp_df.year_id == year_id]

    # Outpatient incidence.
    annual_otp = SuperGopher({'file_pattern': 'FILEPATH'},
                             os.path.join('FILEPATH'))
    annual_otp_df = annual_otp.content(
        location_id=dim.index_dim.get_level("location_id"),
        sex_id=dim.index_dim.get_level("sex_id"))
    annual_otp_df = annual_otp_df.loc[annual_otp_df.year_id == year_id]

    # Stack outpatient then inpatient rows (same order as before).
    # pd.concat is the supported replacement for DataFrame.append.
    annual_df = pd.concat([annual_otp_df, annual_inp_df])
    # Drop the flag that distinguished the two sources.
    annual_df.drop(['inpatient'], inplace=True, axis=1)

    annual_df = injize(annual_df,
                       dim,
                       cv,
                       measure=6,
                       years=years,
                       fixage=True,
                       codes=codes,
                       shock=True,
                       ndraws=ndraws)
    return annual_df
Esempio n. 4
0
def get_epi_prev(dim, cv, years, codes, ndraws):
    """Load short- and long-term EN prevalence and pass it to ``injize``.

    Both sources are read through ``SuperGopher`` for the locations and sexes
    in ``dim`` and the requested ``years``. Each is tagged with a ``term``
    column; long-term rows for three ecodes are excluded. The stacked frame
    loses its ``prob_draw_`` and ``inpatient`` columns and is handed to
    ``injize`` with ``measure=5``, ``fixage=True`` and ``shock=False``.

    Args:
        dim: object exposing ``index_dim.get_level(name)``.
        cv: forwarded to ``injize``.
        years: ``year_id`` filter for both reads; also forwarded to ``injize``.
        codes: forwarded to ``injize`` as ``codes``.
        ndraws: forwarded to ``injize`` as ``ndraws``.

    Returns:
        Whatever ``injize`` returns for the combined frame.
    """
    # Short-term EN prevalence.
    estim_st = SuperGopher({'file_pattern': 'FILEPATH'},
                           os.path.join("FILEPATH"))
    estim_st_df = estim_st.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=years,
        sex_id=dim.index_dim.get_level("sex_id"))
    estim_st_df["term"] = "short-term"

    # Long-term EN prevalence.
    estim_lt = SuperGopher({'file_pattern': 'FILEPATH'},
                           os.path.join("FILEPATH"))
    estim_lt_df = estim_lt.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=years,
        sex_id=dim.index_dim.get_level("sex_id"))
    estim_lt_df["term"] = "long-term"

    # These ecodes are excluded from the long-term results only.
    excluded_ecodes = [
        "inj_war_warterror",
        "inj_war_execution",
        "inj_disaster",
    ]
    estim_lt_df = estim_lt_df.loc[~estim_lt_df.ecode.isin(excluded_ecodes)]

    # Stack short-term then long-term rows.
    # pd.concat is the supported replacement for DataFrame.append.
    estim_df = pd.concat([estim_st_df, estim_lt_df])
    # These columns are unnecessary downstream.
    estim_df.drop(['prob_draw_', 'inpatient'], axis=1, inplace=True)

    # Injury-ize based on the GBD requirements.
    estim_df = injize(estim_df,
                      dim,
                      cv,
                      measure=5,
                      years=years,
                      fixage=True,
                      codes=codes,
                      shock=False,
                      ndraws=ndraws)
    return estim_df
Esempio n. 5
0
def get_annual_prev(dim, cv, years, codes, ndraws):
    """Load short- and long-term annual prevalence and pass it to ``injize``.

    Both sources are read through ``SuperGopher`` for the locations, years
    and sexes in ``dim``. The stacked frame loses its ``inpatient``,
    ``prob_draw_`` and ``term`` columns and is handed to ``injize`` with
    ``measure=5``, ``fixage=True`` and ``shock=True``.

    Args:
        dim: object exposing ``index_dim.get_level(name)``.
        cv: forwarded to ``injize``.
        years: forwarded to ``injize`` as ``years`` (the reads themselves use
            the ``year_id`` level of ``dim``).
        codes: forwarded to ``injize`` as ``codes``.
        ndraws: forwarded to ``injize`` as ``ndraws``.

    Returns:
        Whatever ``injize`` returns for the combined frame.
    """
    # Short-term annual results.
    annual_st = SuperGopher({'file_pattern': 'FILEPATH'},
                            os.path.join('FILEPATH'))
    annual_st_df = annual_st.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=dim.index_dim.get_level("year_id"),
        sex_id=dim.index_dim.get_level("sex_id"))
    annual_st_df["term"] = "short-term"

    # Long-term annual results.
    annual_lt = SuperGopher({'file_pattern': 'FILEPATH'},
                            os.path.join('FILEPATH'))
    annual_lt_df = annual_lt.content(
        location_id=dim.index_dim.get_level("location_id"),
        year_id=dim.index_dim.get_level("year_id"),
        sex_id=dim.index_dim.get_level("sex_id"))
    annual_lt_df["term"] = "long-term"

    # Stack short-term then long-term rows.
    # pd.concat is the supported replacement for DataFrame.append.
    annual_df = pd.concat([annual_st_df, annual_lt_df])
    # The 'term' label set above is dropped again here along with the other
    # columns that are not passed on.
    annual_df.drop(['inpatient', 'prob_draw_', 'term'], axis=1, inplace=True)

    # Injury-ize based on the GBD requirements.
    annual_df = injize(annual_df,
                       dim,
                       cv,
                       measure=5,
                       years=years,
                       fixage=True,
                       codes=codes,
                       shock=True,
                       ndraws=ndraws)
    return annual_df
Esempio n. 6
0
    def _get_short_term_EN_annual(self, dim):
        """Read annual short-term EN YLD draws and shape them to ``dim``.

        Reads the per-location/year/sex ``.dta`` files, attaches cause and
        rei information from ``self.como_version``, converts the ``age``
        column into ``age_group_id``, applies ``transform_metric``, sums over
        everything except the demographic/cause/rei keys, fills the missing
        index columns and age groups through the gbdizer, zero-fills NaNs
        and finally resamples via ``self.resample_if_needed``.
        """
        # Rounded-age label -> age_group_id.
        age_label_to_group = {
            '0.0': 2, '0.01': 3, '0.1': 4, '1.0': 5,
            '5.0': 6, '10.0': 7, '15.0': 8, '20.0': 9,
            '25.0': 10, '30.0': 11, '35.0': 12, '40.0': 13,
            '45.0': 14, '50.0': 15, '55.0': 16, '60.0': 17,
            '65.0': 18, '70.0': 19, '75.0': 20, '80.0': 30,
            '85.0': 31, '90.0': 32, '95.0': 235,
        }
        collapse_keys = [
            "location_id", "year_id", "age_group_id", "sex_id", "cause_id",
            "rei_id"
        ]
        index_dim = dim.index_dim

        # Non-interpolated values straight from the draw files.
        reader = SuperGopher(
            {'file_pattern': '{location_id}/ylds_{year_id}_{sex_id}.dta'},
            os.path.join("filepath", "FILEPATH"))
        result = reader.content(
            location_id=index_dim.get_level("location_id"),
            year_id=index_dim.get_level("year_id"),
            sex_id=index_dim.get_level("sex_id"))

        # Attach cause and rei metadata (inner joins).
        result = result.merge(
            self.como_version.cause_list, left_on="ecode", right_on="acause")
        result = result.merge(
            self.como_version.ncode_hierarchy, left_on="ncode", right_on="rei")

        # Stringify the rounded age so it can be looked up, then map it to
        # the integer age_group_id.
        age_labels = result["age"].round(2).astype(str)
        result["age"] = age_labels.replace(age_label_to_group).astype(int)
        result.rename(columns={"age": "age_group_id"}, inplace=True)

        # Transform to rate.
        result = transform_metric(result, 3, 1)

        # Collapse inpatient by summing within the key columns.
        result = result.groupby(collapse_keys).sum().reset_index()

        # Fill demographics.
        filler = gbdize.GBDizeDataFrame(dim)
        result = filler.add_missing_index_cols(result)
        result = filler.gbdize_any_by_dim(result, "age_group_id")
        result.fillna(0, inplace=True)

        # Resample if necessary.
        return self.resample_if_needed(result, dim, filler)
Esempio n. 7
0
    def _get_short_term_EN_estimation(self, dim):
        """Read estimation-year short-term EN YLD draws and shape them to ``dim``.

        Reads the per-location/year/sex ``.dta`` files for the estimation
        years (plus 2005, which serves as the interpolation rank reference),
        attaches cause and rei information from ``self.como_version``,
        converts ``age`` into ``age_group_id``, applies ``transform_metric``,
        sums within the demographic/cause/rei keys, fills missing index
        columns and age groups, interpolates missing years when the gbdizer
        reports any, keeps only the years requested in ``dim`` and finally
        resamples via ``self.resample_if_needed``.
        """
        # Rounded-age label -> age_group_id.
        age_label_to_group = {
            '0.0': 2, '0.01': 3, '0.1': 4, '1.0': 5,
            '5.0': 6, '10.0': 7, '15.0': 8, '20.0': 9,
            '25.0': 10, '30.0': 11, '35.0': 12, '40.0': 13,
            '45.0': 14, '50.0': 15, '55.0': 16, '60.0': 17,
            '65.0': 18, '70.0': 19, '75.0': 20, '80.0': 30,
            '85.0': 31, '90.0': 32, '95.0': 235,
        }
        collapse_keys = [
            "location_id", "year_id", "age_group_id", "sex_id", "cause_id",
            "rei_id"
        ]
        draw_names = ["draw_{}".format(i) for i in range(1000)]
        index_dim = dim.index_dim

        # Non-interpolated values straight from the draw files.
        reader = SuperGopher(
            {'file_pattern': '{location_id}/ylds_{year_id}_{sex_id}.dta'},
            os.path.join("filepath", "03_outputs/01_draws/ylds"))

        # Requested years capped to the estimation years; 2005 is always
        # included and de-duplicated through the set.
        capped_years = cap_val(index_dim.levels.year_id,
                               [1990, 1995, 2000, 2005, 2010, 2016])
        years = list(set(capped_years + [2005]))

        result = reader.content(
            location_id=index_dim.get_level("location_id"),
            year_id=years,
            sex_id=index_dim.get_level("sex_id"))

        # Attach cause and rei metadata (inner joins).
        result = result.merge(
            self.como_version.cause_list, left_on="ecode", right_on="acause")
        result = result.merge(
            self.como_version.ncode_hierarchy, left_on="ncode", right_on="rei")

        # Stringify the rounded age so it can be looked up, then map it to
        # the integer age_group_id.
        age_labels = result["age"].round(2).astype(str)
        result["age"] = age_labels.replace(age_label_to_group).astype(int)
        result.rename(columns={"age": "age_group_id"}, inplace=True)

        # Transform to rate.
        result = transform_metric(result, 3, 1)

        # Collapse inpatient by summing within the key columns.
        result = result.groupby(collapse_keys).sum().reset_index()

        # Fill demographics.
        filler = gbdize.GBDizeDataFrame(dim)
        result = filler.add_missing_index_cols(result)
        result = filler.gbdize_any_by_dim(result, "age_group_id")
        result.fillna(0, inplace=True)

        # Interpolate any missing years, ranking by the 2005 rows.
        if filler.missing_values(result, "year_id"):
            rank_reference = result[result["year_id"] == 2005]
            result = filler.fill_year_by_interpolating(
                df=result,
                rank_df=rank_reference,
                data_cols=draw_names)

        # Keep only the years that were actually requested.
        requested_years = index_dim.get_level("year_id")
        result = result[result.year_id.isin(requested_years)]

        # Resample if necessary.
        return self.resample_if_needed(result, dim, filler)
Esempio n. 8
0
    def read_inputs(self):
        """Get como draws for a single modelable_entity/model_version.

        For every entry of ``self.dimensions_q`` the draws are read through
        ``self.super_gopher`` (created lazily from ``self.meid_data_dir``),
        gbdized and resampled. Dimensions whose result does not match
        ``total_cardinality`` are queued and filled in a second pass by
        interpolating over years from the full-resolution reference draws.

        Returns:
            DataFrame of all collected draws, de-duplicated, with a
            ``modelable_entity_id`` column set to ``self.meid`` and a fresh
            integer index.

        Raises:
            ValueError: from ``pd.concat`` if no draws were collected at all.
        """
        print('Reading draws for (meid, mvid): ({}, {})'.format(
            self.meid, self.mvid))
        if self.super_gopher is None:
            self.super_gopher = SuperGopher.auto(self.meid_data_dir)

        # Draw columns used when interpolating; invariant across dimensions.
        data_cols = ["draw_{}".format(i) for i in range(1000)]

        all_draws = []
        reference_frames = []
        missing_dim_q = []
        for dimensions in self.dimensions_q:

            gbdizer = gbdize.GBDizeDataFrame(dimensions)
            try:
                draws = self.super_gopher.content(
                    location_id=dimensions.index_dim.get_level("location_id"),
                    year_id=dimensions.index_dim.get_level("year_id"),
                    sex_id=dimensions.index_dim.get_level("sex_id"),
                    measure_id=dimensions.index_dim.get_level("measure_id"),
                    age_group_id=dimensions.index_dim.get_level(
                        "age_group_id"))
            except InvalidFilter:
                # Nothing stored for this filter: continue with an empty
                # frame so the dimensions get queued for interpolation.
                draws = pd.DataFrame(columns=dimensions.index_names)

            if not draws.empty:
                # gbdize. aka fill in missing dimensions
                draws = self.gbdize_dimensions(draws, gbdizer)
                # keep a copy of all 1000 draws for interpolation
                reference_frames.append(draws)

                # resample
                draws = self.resample_if_needed(draws, dimensions, gbdizer)

            if len(draws) != dimensions.total_cardinality:
                missing = self.missing_dimensions(draws, dimensions)
                missing_dim_q.append(missing)

            all_draws.append(draws)

        # prep for interpolation of missing demographics
        if len(reference_frames) > 0:
            reference_draws = pd.concat(reference_frames)
        else:
            # NOTE(review): `dimensions` here is the last value of the loop
            # variable above; it is undefined if self.dimensions_q is empty.
            reference_draws = pd.DataFrame(columns=dimensions.index_names)
        missing_dim_q = list(flatten(missing_dim_q))

        for dimensions in missing_dim_q:

            gbdizer = gbdize.GBDizeDataFrame(dimensions)
            interp_draws, rank_df = self.get_interpolation_draws(
                reference_draws, dimensions)

            if not interp_draws.empty:
                # gbdize. aka fill in missing dimensions
                interp_draws = self.gbdize_dimensions(interp_draws, gbdizer)
                rank_df = self.gbdize_dimensions(rank_df, gbdizer)

                # case where years are stored as floats, breaks interpolate
                interp_draws['year_id'] = interp_draws['year_id'].astype(int)
                try:
                    interp_draws = gbdizer.fill_year_by_interpolating(
                        interp_draws, rank_df, data_cols)
                except MissingGBDemographics:
                    print(
                        "(meid: {meid}, mvid: {mvid}) "
                        " Could not interpolate for years: {years}, "
                        "measure: {meas} "
                        "location_id: {loc} "
                        "sex_id: {sex}".format(
                            meid=self.meid,
                            mvid=self.mvid,
                            years=dimensions.index_dim.get_level("year_id"),
                            meas=dimensions.index_dim.get_level("measure_id"),
                            loc=dimensions.index_dim.get_level("location_id"),
                            sex=dimensions.index_dim.get_level("sex_id")))
                    # Interpolation failed: fall back to gbdizing the year
                    # dimension instead.
                    interp_draws = self.gbdize_dimensions(
                        interp_draws, gbdizer, "year_id")

                # Add the interpolated draws to the reference so later
                # dimensions can use them. pd.concat is the supported
                # replacement for DataFrame.append.
                reference_draws = pd.concat(
                    [reference_draws, interp_draws], ignore_index=True)

                draws = interp_draws.loc[interp_draws['year_id'].isin(
                    dimensions.index_dim.get_level('year_id'))]

                # resample
                draws = self.resample_if_needed(draws, dimensions, gbdizer)
                all_draws.append(draws)

            # if dimensions overlap, drop duplicates from reference draws
            reference_draws.drop_duplicates(subset=dimensions.index_names,
                                            inplace=True)

        # concatenate all the results
        draws = pd.concat(all_draws)
        # in case dimensions overlap, drop duplicates
        draws.drop_duplicates(inplace=True)
        draws['modelable_entity_id'] = self.meid

        return draws.reset_index(drop=True)