Beispiel #1
0
    def get_data(self, model_version_id, id_template_df):
        """Pull 'epi' model results for measure 15 and shape them for later use.

        The results are fetched with ``get_model_results`` (no csv is read,
        despite the progress message, which labels the data as CSMR), reduced
        to the id columns plus mean/lower/upper, cleaned with
        ``self.drop_zeros_nulls``, and inner-merged onto ``id_template_df``.
        Standard errors are then derived via ``self.calc_se_from_ui`` and
        ``self.calc_aggregate_se``.

        Args:
            model_version_id: model version passed to ``get_model_results``.
            id_template_df: frame merged on location_id, year_id,
                age_group_id and sex_id.

        Returns:
            The prepared frame, indexed by ``self._data_key``.

        Raises:
            NoNonZeroValues: if nothing is left after cleaning and merging.
        """
        print("Loading CSMR csv...")

        # Age groups 2-20 plus 30-33.
        wanted_age_groups = list(range(2, 21)) + [30, 31, 32, 33]
        raw_results = get_model_results('epi',
                                        model_version_id=model_version_id,
                                        age_group_id=wanted_age_groups,
                                        measure_id=15)

        kept_columns = [
            'location_id', 'year_id', 'sex_id', 'age_group_id',
            'mean', 'lower', 'upper',
        ]
        estimates = raw_results[kept_columns].copy()
        print(estimates.head(5))

        estimates = self.drop_zeros_nulls(estimates, "mean", "lower", "upper")
        merge_keys = ["location_id", "year_id", "age_group_id", "sex_id"]
        estimates = estimates.merge(id_template_df, on=merge_keys)
        if estimates.empty:
            raise NoNonZeroValues

        estimates = self.calc_se_from_ui(estimates, "mean", "lower", "upper")
        estimates = self.calc_aggregate_se(estimates, self._data_key, "mean",
                                           "se")
        return estimates.set_index(self._data_key)
Beispiel #2
0
 def __get_model__(self, pass_id):
     """Load 'epi' model results for ``pass_id`` into ``self.results``.

     A standard-error column ``se`` is added, computed as
     (mean - lower) / 1.96, and the stored frame is then reduced to the
     id columns plus ``mean`` and ``se``.
     """
     self.results = get_model_results('epi', model_version_id=pass_id)

     # Distance from the mean down to the lower bound, scaled by 1.96.
     lower_gap = self.results['mean'] - self.results['lower']
     self.results['se'] = lower_gap / 1.96

     # Keep only the columns of interest.
     wanted_columns = [
         'location_id', 'age_group_id', 'sex_id', 'mean', 'se', 'year_id'
     ]
     self.results = self.results[wanted_columns]
Beispiel #3
0
 def _get_results(self):
     """Fetch model results for this object's team, id, round, version,
     locations and years.

     Returns:
         The frame returned by ``get_model_results``.

     Raises:
         AssertionError: if the query returns an empty frame.
     """
     df = get_model_results(self.gbd_team,
                            gbd_id=self.gbd_id,
                            gbd_round_id=self.metadata.gbd_round_id,
                            model_version_id=self.model_version_id,
                            location_id=self.metadata.location_ids,
                            year_id=self.metadata.year_id)
     # Raised explicitly instead of via ``assert`` so the check is not
     # stripped when Python runs with -O. The exception type and message are
     # unchanged so existing handlers keep working.
     # NOTE(review): the message hard-codes "round 5" while the query uses
     # self.metadata.gbd_round_id - confirm whether it should be dynamic.
     if df.empty:
         raise AssertionError("No round 5 data found for this model.")
     return df
def enhanced_get_model_results(model_version_id, measure_id=5):
    """Return sex-specific 'epi' model results stacked with both-sex results.

    Sex-specific rows are pulled with ``get_model_results`` for location set
    22 and a fixed list of age groups; both-sex rows are derived from them by
    ``both_sex_model_results`` and appended underneath.

    Args:
        model_version_id: model version to query.
        measure_id: measure to query (default 5).

    Returns:
        The concatenation of the sex-specific and both-sex frames.
    """
    # Age groups 2-20 plus 22, 30-33 and 235.
    age_group_ids = list(range(2, 21)) + [22, 30, 31, 32, 33, 235]

    sex_specific = get_model_results('epi',
                                     model_version_id=model_version_id,
                                     measure_id=measure_id,
                                     age_group_id=age_group_ids,
                                     location_set_id=22)
    both_sex = both_sex_model_results(sex_specific)
    return pd.concat([sex_specific, both_sex])
def main():
    """Plot location-level model results against covariates and save PDFs.

    Reads team, model version, measure, age group and output path from
    ``parse_arguments()``, pulls year-2016 / GBD-round-4 results with
    ``get_model_results``, merges them with the location and covariate
    frames, and calls ``graph_data`` / ``output_pdf``:

    * for team 'cod': once, on age-standardised death rates scaled by
      100000, labelled with the sex found in the results;
    * for any other team: once for males (sex_id 1) and once for females
      (sex_id 2).

    A legend is generated afterwards and the output file names are printed.

    A result set that does not contain exactly 195 rows produces a warning
    rather than an error. The original check (``assert len_error``) could
    never fail, so warning keeps previously successful runs working while
    still surfacing the problem.

    Raises:
        AssertionError: if ``get_model_results`` returns no rows.
        ValueError: for a non-'cod' team when measure_id is not 5, 6 or 18.
    """
    # Local import: only needed for the row-count sanity check below.
    import warnings

    gbd_team, mvid, measure_id, age_group_id, path = parse_arguments()
    if path[-1] != '/':
        path += '/'

    location_df, location_id_list = generate_location_data()
    covariate_df = generate_covariate_data()

    # Fall back to a team-specific default age group when none was given.
    if not age_group_id:
        age_group_id = -1 if gbd_team == 'cod' else 27

    model_results = get_model_results(gbd_team,
                                      model_version_id=mvid,
                                      age_group_id=age_group_id,
                                      measure_id=measure_id,
                                      gbd_round_id=4,
                                      year_id=2016,
                                      location_id=location_id_list)

    # Explicit raise (not ``assert``) so the check survives ``python -O``.
    if model_results.empty:
        raise AssertionError('No gbd round 4 data found for this model '
                             'version or age group id')

    label_df = query_cause_data(mvid, gbd_team)
    loc_cov_df = merge_dataframes(location_df, covariate_df)
    cause = label_df.iloc[0, 1]

    # Defined once, before branching, so both branches can use it.
    len_error = ('Model version id {} does not return results for all 195 '
                 'countries'.format(mvid))

    # File names produced by output_pdf, reported at the end.
    output_names = []

    if gbd_team == 'cod':
        model_results = generate_asr(
            model_results[['location_id', 'year_id', 'age_group_id',
                           'sex_id', 'mean_death_rate']],
            ['location_id', 'year_id', 'age_group_id', 'sex_id'],
            ['mean_death_rate'],
            get_age_weights()
        )

        sex = get_unique_values(model_results, 'sex_id')[0]
        sex = 'males' if sex == 1 else 'females'
        measure = 'deaths'

        cod_df = merge_dataframes(loc_cov_df, model_results)
        # NOTE(review): positional selection - presumably the location and
        # covariate columns plus mean_death_rate; confirm against the
        # column order produced by merge_dataframes.
        cod_df = cod_df.iloc[:, [1, 2, 3, 7]].copy(deep=True)
        cod_df['death_rate_x_100000'] = cod_df['mean_death_rate'] * 100000
        cod_df.drop(labels='mean_death_rate', axis=1, inplace=True)

        if len(cod_df) != 195:
            warnings.warn(len_error)

        graph_data(cod_df, cause, sex, measure)
        output_names.append(
            output_pdf(mvid, cause, sex, measure, age_group_id, path))

    else:
        measure_names = {5: 'prevalence', 6: 'incidence', 18: 'proportion'}
        if measure_id not in measure_names:
            raise ValueError(
                'Unsupported measure_id {!r}; expected one of 5, 6, 18'
                .format(measure_id))
        measure = measure_names[measure_id]

        male_epi_results = model_results[model_results.sex_id == 1]
        epi_xy_df = merge_dataframes(loc_cov_df, male_epi_results)
        epi_xy_df = prune_epi_dataframe(epi_xy_df)

        female_epi_results = model_results[model_results.sex_id == 2]
        epi_xx_df = merge_dataframes(loc_cov_df, female_epi_results)
        epi_xx_df = prune_epi_dataframe(epi_xx_df)

        if len(epi_xx_df) != 195 or len(epi_xy_df) != 195:
            warnings.warn(len_error)

        graph_data(epi_xy_df, cause, 'males', measure)
        output_names.append(
            output_pdf(mvid, cause, 'males', measure, age_group_id, path))

        graph_data(epi_xx_df, cause, 'females', measure)
        output_names.append(
            output_pdf(mvid, cause, 'females', measure, age_group_id, path))

    plt.clf()
    generate_legend(path)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Success!\n-------------')
    for file_name in output_names:
        print('File name: {}'.format(file_name))
    print('File location: {}'.format(path))
    print('-------------')
    print('legend.pdf also saved to {}'.format(path))
Beispiel #6
0
def hiv_adjust(unadjusted_df,
               ind_grouping,
               identifiers_list,
               hiv_rr=hiv_adjustment_rr):
    """Add an HIV-prevalence-based adjustment to mean/lower/upper estimates.

    For every ``group_id`` in ``ind_grouping`` whose ``ind_bin_high`` is
    below 0.1, a population-weighted HIV prevalence is computed over the GBD
    age groups overlapping the group's age range, then multiplied by
    ``cases_proportional`` and ``hiv_rr``. The product is added to the
    ``mean``, ``lower`` and ``upper`` columns of ``unadjusted_df``. Groups
    with no computed adjustment are left unchanged.

    Args:
        unadjusted_df: frame with ``group_id``, ``mean``, ``lower`` and
            ``upper`` columns.
        ind_grouping: frame with ``location_id``, ``year_start``,
            ``year_end``, ``ind_bin_high``, ``sex``, ``age_start``,
            ``age_end``, ``group_id`` and ``cases_proportional`` columns.
        identifiers_list: not used by this function; kept for interface
            compatibility.
        hiv_rr: multiplier applied to the prevalence-based term.

    Returns:
        ``unadjusted_df`` left-merged with the adjustment, with the
        adjustment already added to mean/lower/upper and its helper column
        dropped.

    Raises:
        KeyError: if ``ind_grouping['sex']`` holds a value other than
            Male/male/Female/female/Both/both.

    Side effects (kept from the original implementation):
        ``ind_grouping`` gets float ``year_start``/``year_end`` and a new
        ``year_id`` column; ``unadjusted_df['group_id']`` is cast to int.
    """
    # Get all unique locations that we have data for
    unique_locs = ind_grouping.location_id.unique().tolist()
    age_groups = [
        1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 31, 32,
        33
    ]

    # This program only considers individuals with a CD4 count < 200
    hiv_meid = 9322
    # Get relevant HIV prevalence for all age groups
    hiv_prev = get_model_results('epi',
                                 gbd_id=hiv_meid,
                                 measure_id=5,
                                 location_id=unique_locs,
                                 age_group_id=age_groups,
                                 sex_id=[1, 2],
                                 status='best',
                                 year_id=-1)
    # Subset to useful columns only
    hiv_prev = hiv_prev[[
        'year_id', 'age_group_id', 'location_id', 'sex_id', 'mean'
    ]].copy()
    hiv_prev = hiv_prev.rename(columns={'mean': 'hiv_prev'})

    # Get populations for all listed locations, for the years that came back
    gbd_years = hiv_prev.year_id.unique().tolist()
    pops = get_population(age_group_id=age_groups,
                          location_id=unique_locs,
                          year_id=gbd_years,
                          sex_id=[1, 2])
    pops = pops[[
        'year_id', 'age_group_id', 'location_id', 'sex_id', 'population'
    ]]

    # Join prevalence and population
    join_on = ['year_id', 'age_group_id', 'location_id', 'sex_id']
    hiv_prev_pops = pd.merge(left=hiv_prev,
                             right=pops,
                             how='inner',
                             on=join_on)

    # Build sex_id = 3 (both sexes) rows: summed population and
    # population-weighted prevalence of the male and female rows.
    male_only = hiv_prev_pops[hiv_prev_pops['sex_id'] == 1].copy()
    female_only = hiv_prev_pops[hiv_prev_pops['sex_id'] == 2].copy()
    both_sexes = pd.merge(left=male_only,
                          right=female_only,
                          on=['age_group_id', 'location_id', 'year_id'],
                          suffixes=('_male', '_female'),
                          how='inner')
    both_sexes['population'] = (both_sexes['population_male'] +
                                both_sexes['population_female'])
    both_sexes['hiv_prev'] = (
        (both_sexes['hiv_prev_male'] * both_sexes['population_male'] +
         both_sexes['hiv_prev_female'] * both_sexes['population_female']) /
        both_sexes['population'])
    # Reassign instead of ``inplace=1``: pandas validates ``inplace`` as a
    # bool and rejects an int.
    both_sexes = both_sexes.drop(labels=[
        'hiv_prev_male', 'hiv_prev_female', 'population_male',
        'population_female', 'sex_id_male', 'sex_id_female'
    ],
                                 axis=1)
    both_sexes['sex_id'] = 3
    hiv_prev_pops = pd.concat([hiv_prev_pops, both_sexes])
    hiv_prev_pops['sex_id'] = hiv_prev_pops['sex_id'].apply(float)

    # Get the upper and lower age bounds from the GBD database.
    # ``age_groups`` is the fixed list above, so formatting it into the
    # query does not expose it to external input.
    q = """SELECT
               age_group_id,age_group_years_start,age_group_years_end
           FROM
               shared.age_group
           WHERE
               age_group_id IN {}""".format(tuple(age_groups))
    # For the next line, you'll need an 'epi' definition in your .ODBC file
    age_groups_df = query(q, conn_def='epi')
    # Merge back onto the HIV/population df
    hiv_prev_pops = pd.merge(left=hiv_prev_pops,
                             right=age_groups_df,
                             on='age_group_id')

    # Get the middle year to join on (written to the caller's frame, as
    # in the original implementation)
    ind_grouping['year_start'] = ind_grouping['year_start'].apply(float)
    ind_grouping['year_end'] = ind_grouping['year_end'].apply(float)

    ind_grouping['year_id'] = np.round(
        ((ind_grouping['year_start'] + ind_grouping['year_end']) / 2), 0)
    # Mid-years before 1980 are moved up to 1980.
    ind_grouping['year_id'] = ind_grouping['year_id'].apply(int).clip(
        lower=1980)

    # The HIV adjustment calculation should be done only for people with 0mm
    # indurations. Copy so the columns added below go onto an independent
    # frame rather than a slice of the caller's data.
    ind_grouping = ind_grouping.loc[ind_grouping['ind_bin_high'] < .1].copy()

    # Get the sex_id from the sex
    sex_id_dict = {
        'Male': 1,
        'male': 1,
        'Female': 2,
        'female': 2,
        'Both': 3,
        'both': 3
    }
    ind_grouping['sex_id'] = ind_grouping['sex'].apply(
        lambda x: sex_id_dict[x])

    # Merge with the HIV population data on sex, location, and year (NOT
    # age), so every study row is paired with every GBD age group.
    joined = pd.merge(left=ind_grouping,
                      right=hiv_prev_pops,
                      on=['sex_id', 'location_id', 'year_id'],
                      how='inner')

    # Keep only rows where the GBD age group range and the data age range
    # intersect. First, set age_start and age_end back to floats.
    joined['age_start'] = joined['age_start'].apply(float)
    joined['age_end'] = joined['age_end'].apply(float)

    overlaps = ((joined['age_group_years_start'] <= joined['age_end'])
                & (joined['age_group_years_end'] > joined['age_start']))
    joined = joined[overlaps].copy()

    # Clip each GBD age group to the study's actual age range. Rows with a
    # missing bound were removed by the overlap filter above, so the
    # element-wise max/min matches the former row-by-row computation and
    # also works on an empty frame.
    joined['age_group_start_adj'] = np.maximum(
        joined['age_group_years_start'], joined['age_start'])
    # Subtract 1 from age_group_years_end to reflect our use of demographer
    #  notation (that is, using age 4 to represent 4 years, 0 days to
    #  4 years, 364.99.. days)
    joined['age_group_end_adj'] = np.minimum(
        joined['age_group_years_end'] - 1, joined['age_end'])

    # Population scaled by the fraction of the age group that lies inside
    # the study range; the +1 again reflects demographer notation.
    joined['pop_adj'] = (
        joined['population'] *
        (joined['age_group_end_adj'] + 1 - joined['age_group_start_adj']) /
        (joined['age_group_years_end'] - joined['age_group_years_start']))
    joined['hiv_prev_count'] = joined['hiv_prev'] * joined['pop_adj']

    # Subset only to identifiers + pop_adj and hiv_prev_count
    group_identifiers = ['group_id']
    to_group = joined[
        group_identifiers +
        ['hiv_prev_count', 'pop_adj', 'cases_proportional']].copy()
    # Group by identifiers and sum hiv_prev_count (num.) and pop_adj (denom.)
    # NOTE(review): cases_proportional is summed over every joined row of a
    # group as well, i.e. once per overlapping GBD age group - confirm that
    # this is intended rather than taking it once per group.
    summed = to_group.groupby(by=group_identifiers).sum().reset_index(
        drop=False)
    # Divide combined numerator by combined denominator to get total
    # prevalence
    summed['hiv_prev_weighted_avg'] = (summed['hiv_prev_count'] /
                                       summed['pop_adj'])
    summed = summed.drop(labels=['hiv_prev_count', 'pop_adj'], axis=1)

    # Create the adjustment:
    # Adjustment = HIV prevalence in this population * proportion of 0mm in
    #  study * proportion of HIV patients who return 0mm results when they
    #  actually have latent TB
    summed['hiv_adjustment'] = summed['cases_proportional'] * summed[
        'hiv_prev_weighted_avg'] * hiv_rr
    summed = summed[['group_id', 'hiv_adjustment']].copy()
    summed['group_id'] = summed['group_id'].apply(lambda x: int(float(x)))
    unadjusted_df['group_id'] = unadjusted_df['group_id'].apply(
        lambda x: int(float(x)))

    # Merge onto the results df, using group_id as the unique identifier
    adjusted_df = pd.merge(left=unadjusted_df,
                           right=summed,
                           on=['group_id'],
                           how='left')
    # Groups without a computed adjustment get zero
    adjusted_df['hiv_adjustment'] = adjusted_df['hiv_adjustment'].fillna(0)
    # Add the adjustment to the mean, lower, and upper
    for i in ['mean', 'lower', 'upper']:
        adjusted_df[i] = adjusted_df[i] + adjusted_df['hiv_adjustment']

    # Drop the adjustment column and return
    adjusted_df = adjusted_df.drop(labels=['hiv_adjustment'], axis=1)
    return adjusted_df
Beispiel #7
0
    def get_dismod_model(self):
        """Run ``get_model_results(QUERY)`` and keep the result on
        ``self.dismod_model``."""
        model_results = get_model_results(QUERY)
        self.dismod_model = model_results
Beispiel #8
0
 def get_dismod_model(self):
     """Fetch 'epi' model results for ``self.dismod_model_num`` and store
     them on ``self.dismod_model``.

     The query is limited to the locations present in ``self.df`` and to
     sex_id 1 and 2; year_id and age_group_id are passed as -1.
     """
     model_version = self.dismod_model_num
     locations_in_data = self.df.location_id.unique().tolist()
     self.dismod_model = get_model_results(
         'epi',
         model_version_id=model_version,
         location_id=locations_in_data,
         year_id=-1,
         sex_id=[1, 2],
         age_group_id=-1)
Beispiel #9
0
def _load_measure_estimates(label, measure_id, filepath, gbd_id, location_id,
                            year_ids, age_ids, sex_ids):
    """Return estimates for one measure: from an Excel file when ``filepath``
    is given, otherwise from ``get_model_results`` (best model, round 4)."""
    if filepath:
        print("Using file for {}".format(label))
        return pd.read_excel(filepath)

    print("querying get_model_results for {}...".format(label))
    return get_model_results('epi',
                             gbd_id=gbd_id,
                             measure_id=measure_id,
                             location_id=location_id,
                             year_id=year_ids,
                             age_group_id=age_ids,
                             sex_id=sex_ids,
                             status='best',
                             gbd_round_id=4)


def combined_get_model_results(gbd_id=None,
                               location_id='all',
                               prev_filepath=None,
                               inc_filepath=None,
                               model_version_id=263738):
    """Assemble the prevalence- and incidence-based inputs for EMR work.

    Prevalence (measure 5) and incidence (measure 6) estimates are read from
    the given Excel files when provided, otherwise queried with
    ``get_model_results``. Custom CSMR is read from a csv, all-cause
    mortality comes from ``get_envelope`` and predicted EMR from
    ``get_emr_pred``.

    Args:
        gbd_id: id passed to ``get_model_results``.
        location_id: locations passed to ``get_model_results``. Previously
            this argument was ignored and 'all' was always queried; the
            default is unchanged. It does not filter data read from the
            Excel files, the CSMR csv or the envelope.
        prev_filepath: optional Excel file used in place of the prevalence
            query.
        inc_filepath: optional Excel file used in place of the incidence
            query.
        model_version_id: model version passed to ``get_emr_pred``.

    Returns:
        Tuple ``(merge_prev, merge_inc)``: prevalence left-merged with CSMR,
        and incidence left-merged with CSMR, ACMR and predicted EMR plus
        constant remission columns.
    """
    age_ids = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
               19, 20, 30, 31, 32, 235]
    year_ids = [1990, 1995, 2000, 2005, 2010, 2017]
    sex_ids = [1, 2]
    merge_keys = ['age_group_id', 'sex_id', 'year_id', 'location_id']

    # Get prevalence and incidence data
    prev = _load_measure_estimates('prev', 5, prev_filepath, gbd_id,
                                   location_id, year_ids, age_ids, sex_ids)
    inc = _load_measure_estimates('inc', 6, inc_filepath, gbd_id,
                                  location_id, year_ids, age_ids, sex_ids)

    prev = prev.rename(columns={'mean': 'prev_mean',
                                'lower': 'prev_lower',
                                'upper': 'prev_upper',
                                'standard_error': 'prev_se'})
    inc = inc.rename(columns={'mean': 'inc_mean',
                              'lower': 'inc_lower',
                              'upper': 'inc_upper',
                              'standard_error': 'inc_se'})

    # Load custom (HIV-neg + HIV-pos) csmr
    print("loading custom csmr data...")
    csmr = pd.read_csv("FILEPATH")
    csmr = csmr.rename(columns={'mean': 'csmr_mean',
                                'lower': 'csmr_lower',
                                'upper': 'csmr_upper',
                                'standard_error': 'csmr_se'})
    csmr = csmr[['age_group_id',
                 'location_id',
                 'year_id',
                 'sex_id',
                 'csmr_mean',
                 'csmr_se',
                 'csmr_lower',
                 'csmr_upper']].copy()

    # Get acmr data
    # NOTE(review): the envelope is pulled for gbd_round_id=5 while the
    # model results above use round 4 - confirm this mix is intended.
    print("querying get_envelope for acmr...")
    acmr = get_envelope(age_group_id=age_ids,
                        location_id='all',
                        year_id=year_ids,
                        sex_id=sex_ids,
                        gbd_round_id=5,
                        with_shock=1,
                        with_hiv=1,
                        rates=1)
    # Standard error taken as the interval width divided by 2 * 1.96
    acmr['acmr_se'] = (acmr["upper"] - acmr["lower"]) / (2 * 1.96)
    acmr = acmr.rename(columns={'mean': 'acmr_mean',
                                'lower': 'acmr_lower',
                                'upper': 'acmr_upper'})

    # Get emr-predicted data
    emrpred = get_emr_pred(model_version_id)

    # Merge data required for incidence-based emr calculation
    merge_inc = pd.merge(left=inc, right=csmr, on=merge_keys, how='left')
    merge_inc = pd.merge(left=merge_inc, right=acmr, on=merge_keys,
                         how='left')
    # Predicted EMR is joined without location_id
    merge_inc = pd.merge(left=merge_inc,
                         right=emrpred,
                         on=['age_group_id', 'sex_id', 'year_id'],
                         how='left')

    # Remission is fixed at 2 with bounds 1.8-2.2; the standard error is
    # (2.2 - 1.8) / (2 * 1.96).
    merge_inc['rem_mean'] = 2
    merge_inc['rem_se'] = .1020408
    # Restore the left-hand location_id if the last merge suffixed it
    merge_inc = merge_inc.rename(columns={'location_id_x': 'location_id'})

    # Merge data required for prevalence-based emr calculation
    merge_prev = pd.merge(left=prev, right=csmr, on=merge_keys, how='left')

    return (merge_prev, merge_inc)