Example 1
def prep_cause_to_weight_cause_map(cause_set_id,
                                   gbd_round_id,
                                   weight_causes,
                                   level_of_analysis='cause_id'):
    """Get the right distribution to use based on those available.

    Defaults to the most detailed parent cause of each cause_id in the given
    hierarchy that is in the weight causes list, unless specific exceptions
    are coded.
    """

    if level_of_analysis == 'cause_id':

        weight_cause_map = prep_child_to_available_parent_map(
            cause_set_id, gbd_round_id, weight_causes)
        weight_cause_map = weight_cause_map.rename(
            columns={'parent_cause_id': 'weight_cause_id'})

        causes = db_queries.get_cause_metadata(cause_set_id=cause_set_id,
                                               gbd_round_id=gbd_round_id)
        acauses = causes[['cause_id',
                          'acause']].set_index('cause_id').to_dict()['acause']

        paths = causes[[
            'cause_id', 'path_to_top_parent'
        ]].set_index('cause_id').to_dict()['path_to_top_parent']
        weight_cause_map['path_to_top_parent'] = \
            weight_cause_map['cause_id'].map(paths)

        weight_cause_map.loc[weight_cause_map['cause_id'] == 843,
                             'weight_cause_id'] = 344

        weight_cause_map.loc[weight_cause_map['cause_id'].isin([743, 919]),
                             'weight_cause_id'] = 294

        weight_cause_map.loc[
            weight_cause_map['path_to_top_parent'].str.contains(',366,'),
            'weight_cause_id'] = 366

        weight_cause_map.loc[
            weight_cause_map['cause_id'].isin([855, 854, 851]),
            'weight_cause_id'] = 730

        weight_cause_map.loc[weight_cause_map['cause_id'] == 940,
                             'weight_cause_id'] = 716
        weight_cause_map['acause'] = weight_cause_map['cause_id'].map(acauses)
        weight_cause_map['weight_acause'] = \
            weight_cause_map['weight_cause_id'].map(acauses)
    else:

        weight_cause_map = pd.DataFrame({
            level_of_analysis:
            weight_causes,
            'weight_{}'.format(level_of_analysis):
            weight_causes
        })

    return weight_cause_map[[
        level_of_analysis, 'weight_{}'.format(level_of_analysis)
    ]]
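
A minimal usage sketch of the function above; the cause set, round, and weight-cause IDs below are illustrative placeholders rather than values from the source, and a working db_queries connection is assumed.

# hypothetical call: map every cause in the chosen hierarchy to the most
# detailed ancestor that appears in the weight-cause list
weight_causes = [294, 366, 730]  # placeholder IDs for causes that have weights
cause_map = prep_cause_to_weight_cause_map(cause_set_id=4,
                                           gbd_round_id=6,
                                           weight_causes=weight_causes)
print(cause_map.head())  # one row per cause_id with its assigned weight_cause_id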
Example 2
def split_out_by_cause_type(df):
    causes = get_cause_metadata(cause_set_id=4)

    original_shape = df.shape[0]
    original_deaths = df['best'].sum()

    war_causes = [945]
    war_shock_causes = list(
        causes[causes['parent_id'].isin(war_causes)]['cause_id'])
    war_df = df[df['cause_id'].isin(war_shock_causes + war_causes)]

    codem_causes_no_detail = [302, 345, 408, 703]
    codem_shock_causes = list(
        causes[causes['parent_id'].isin(codem_causes_no_detail)]['cause_id'])
    codem_shock_causes += [
        335, 357, 387, 695, 699, 703, 707, 711, 727, 842, 854, 724, 689, 341,
        693
    ]
    codem_df = df[df['cause_id'].isin(codem_shock_causes +
                                      codem_causes_no_detail)]

    non_codem_causes = [729]
    non_codem_shock_causes = list(
        causes[causes['parent_id'].isin(non_codem_causes)]['cause_id'])
    non_codem_df = df[df['cause_id'].isin(non_codem_shock_causes +
                                          non_codem_causes)]

    assert original_shape == (war_df.shape[0] + codem_df.shape[0] +
                              non_codem_df.shape[0])
    assert np.isclose(
        original_deaths,
        (war_df.best.sum() + codem_df.best.sum() + non_codem_df.best.sum()))

    return war_df, codem_df, non_codem_df
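
The two asserts at the end are a partition check: the war, CODEm, and non-CODEm subsets must jointly account for every row and every death in the input. A self-contained toy version of that pattern, with made-up numbers:

import numpy as np
import pandas as pd

toy = pd.DataFrame({'cause_id': [1, 2, 3], 'best': [10.0, 20.0, 5.0]})
part_a = toy[toy['cause_id'].isin([1])]
part_b = toy[toy['cause_id'].isin([2, 3])]
# rows and totals are conserved across the partition
assert toy.shape[0] == part_a.shape[0] + part_b.shape[0]
assert np.isclose(toy['best'].sum(), part_a['best'].sum() + part_b['best'].sum())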
Example 3
def select_maternal_data(df):
    """
    Function that filters out non-maternal data. Meant to be run at the start
    of this process.  If we are only interested in adjusting the maternal denom,
    then we don't need non-maternal data.

    Parameters:
        df: Pandas DataFrame
            Must have 'bundle_id' as a column
    """

    assert "bundle_id" in df.columns, "'bundle_id' must be a column."

    # get causes
    causes_5 = get_cause_metadata(
        cause_set_id=9,
        gbd_round_id=5)  # round 5 was updated to keep bundles 79 and 646
    causes_4 = get_cause_metadata(cause_set_id=9, gbd_round_id=4)

    causes = pd.concat([causes_4, causes_5])
    causes.drop_duplicates(inplace=True)

    # create conditional mask that selects maternal causes
    condition = causes.path_to_top_parent.str.contains("366")

    # subset to just the causes that meet the condition
    maternal_causes = causes[condition]

    # make list of maternal causes
    maternal_list = list(maternal_causes['cause_id'].unique())

    # get bundle to cause map
    bundle_cause = query("QUERY", conn_def=DATABASE)

    # merge cause_id onto data
    df = df.merge(bundle_cause, how='left', on='bundle_id')

    # keep only maternal causes in df
    df = df[df['cause_id'].isin(maternal_list)]

    # drop cause_id
    df.drop('cause_id', axis=1, inplace=True)

    # drop the denominator bundle
    df = df[df['bundle_id'] != 1010]

    return df
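
The maternal filter above keys off path_to_top_parent, a comma-separated string of ancestor cause_ids, so any descendant of cause 366 matches. A self-contained toy sketch of that idea (the tiny hierarchy is made up); note that Example 1 wraps the id in commas (',366,') to avoid accidental substring matches.

import pandas as pd

# made-up hierarchy: 367 descends from the maternal parent (366), 500 does not
causes = pd.DataFrame({
    'cause_id': [366, 367, 500],
    'path_to_top_parent': ['294,366', '294,366,367', '294,500'],
})
condition = causes.path_to_top_parent.str.contains("366")
print(causes.loc[condition, 'cause_id'].tolist())  # [366, 367]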
Example 4
 def _get_hierarchy(self) -> pd.DataFrame:
     hierarchy_cols = [
         constants.Columns.CAUSE_ID, constants.Columns.ACAUSE,
         constants.Columns.LEVEL, constants.Columns.PARENT_ID,
         constants.Columns.SORT_ORDER, constants.Columns.MOST_DETAILED,
         constants.Columns.IS_ESTIMATE
     ]
     hierarchy = get_cause_metadata(
         cause_set_version_id=self.set_version_id,
         gbd_round_id=self.gbd_round_id)
     self._validate_hierachy(hierarchy)
     return hierarchy[hierarchy_cols]
Example 5
def get_cause_ids(cause_set):
    """Fetch lists of source and target causes given the oldCorrect cause set."""
    cause_df = get_cause_metadata(cause_set)

    detail_bool = cause_df['most_detailed'] == 1
    sources_bool = (cause_df['parent_id'] == 952) & detail_bool
    targets_bool = (cause_df['parent_id'] == 953) & detail_bool
    
    sources = cause_df.loc[sources_bool, 'cause_id'].unique().tolist()
    targets = cause_df.loc[targets_bool, 'cause_id'].unique().tolist()    
    return sources, targets
Example 6
def run_cod_age_sex_splitting(db):
    # CHECK COMPLETENESS
    cause_set_version = 269
    cm = get_cause_metadata(cause_set_version_id=cause_set_version)
    possible_causes = cm['cause_id'].unique().tolist()
    for cause_id in db['cause_id'].unique().tolist():
        assert cause_id in possible_causes, "Cause ID {} not in hierarchy".format(
            cause_id)
    loc_meta = get_location_metadata(gbd_round_id=5, location_set_id=21)
    possible_locs = loc_meta['location_id'].tolist()
    db = db.loc[db['location_id'].isin(possible_locs), :]
    db = db.loc[db['best'] > 0, :]
    db['hi_best_ratio'] = db['high'] / db['best']
    db['lo_best_ratio'] = db['low'] / db['best']
    db = db.reset_index(drop=True)
    db['unique_join'] = db.index
    db_merge_later = db.loc[:,
                            ['unique_join', 'hi_best_ratio', 'lo_best_ratio']]
    db = db.drop(labels=['high', 'low', 'hi_best_ratio', 'lo_best_ratio'],
                 axis=1)
    id_cols = [
        i for i in db.columns if i not in ['best', 'age_group_id', 'sex_id']
    ]
    cause_set_version_id = query("""SELECT cause_set_version_id
                                    FROM ADDRESS
                                    WHERE gbd_round_id=5 AND cause_set_id=4;""",
                                 conn_def='epi').iloc[0, 0]
    pop_run_id = get_population(gbd_round_id=5,
                                status="recent")['run_id'].iloc[0]
    splitter = AgeSexSplitter(cause_set_version_id=cause_set_version_id,
                              pop_run_id=pop_run_id,
                              distribution_set_version_id=29,
                              id_cols=['unique_join'],
                              value_column='best')
    split_db = splitter.get_computed_dataframe(df=db,
                                               location_meta_df=loc_meta)
    split_db = pd.merge(left=split_db,
                        right=db_merge_later,
                        on=['unique_join'],
                        how='left')
    split_db['low'] = split_db['best'] * split_db['lo_best_ratio']
    split_db['high'] = split_db['best'] * split_db['hi_best_ratio']
    split_db = split_db.drop(
        labels=['unique_join', 'lo_best_ratio', 'hi_best_ratio'], axis=1)
    return split_db
Example 7
def format_gbd_results(int_cause):

    rd = pd.read_csv(
        f"/ihme/cod/prep/mcod/process_data/{int_cause}/rdp/2019_03_07/redistributed_deaths.csv"
    )
    rd[[x for x in list(rd)
        if "inj" in x]] = rd[[x for x in list(rd) if "inj" in x]].fillna(0)
    rd = rd.groupby(['location_id', 'sex_id', 'year_id', 'age_group_id'],
                    as_index=False)[[x for x in list(rd) if "inj" in x]].sum()
    rd = pd.melt(rd,
                 id_vars=['location_id', 'sex_id', 'year_id', 'age_group_id'],
                 value_vars=[x for x in list(rd) if "inj" in x],
                 var_name="acause",
                 value_name=int_cause)
    rd = rd.loc[rd[f"{int_cause}"] != 0]

    causes = get_cause_metadata(gbd_round_id=6, cause_set_id=3)
    injuries = causes.loc[(causes.acause.str.contains("inj"))
                          & (causes.most_detailed == 1)]
    inj_dict = injuries.set_index("acause")["cause_id"].to_dict()
    rd["cause_id"] = rd[["acause"]].apply(lambda x: x.map(inj_dict))

    restricted_targets = [729, 945]
    # should have been dropped last year (not most detailed/is yld only)
    restricted_targets += [704, 941]
    # x59 only unintentional
    if int_cause == "x59":
        restricted_targets += [721, 723, 725, 726, 727, 854, 941]
    rd = rd.loc[~(rd["cause_id"].isin(restricted_targets))]
    rd = get_country_names(rd)
    # make this right after dropping restricted targets
    rd = rd.groupby(
        ['location_id', 'sex_id', 'year_id', 'age_group_id', 'cause_id'],
        as_index=False)[f"{int_cause}"].sum()
    rd["prop"] = rd.groupby(
        ["age_group_id", "sex_id", "location_id", "year_id"],
        as_index=False)[f"{int_cause}"].transform(
            lambda x: x / float(x.sum(axis=0)))

    return rd
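
The prop column above rescales each injury cause's deaths to a share of its age/sex/location/year group using groupby plus transform. A self-contained toy version of that pattern, with made-up numbers:

import pandas as pd

toy = pd.DataFrame({
    'age_group_id': [10, 10, 10, 11],
    'deaths': [2.0, 3.0, 5.0, 4.0],
})
# each row is divided by the total of its group
toy['prop'] = toy.groupby('age_group_id')['deaths'].transform(lambda x: x / x.sum())
print(toy['prop'].tolist())  # [0.2, 0.3, 0.5, 1.0]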
Example 8
 def get_cause_ids(self, cause_level):
     """
     Returns a list of cause ids at a certain level of cause hierarchy,
     or, alternatively, all most detailed causes.
     In order for cause decomposition to work properly these causes
     must satisfy the classic GBD 'mutually exclusive and collectively
     exhaustive' rules for cause lists.
     """
     ch = get_cause_metadata(cause_set_id=self.cause_set_id,
                             gbd_round_id=self.gbd_round_id,
                             decomp_step=self.decomp_step)
     validations.validate_cause_level(cause_level, ch)
     if cause_level == "most_detailed":
         cause_ids = ch.loc[ch[cause_level] == 1,
                            "cause_id"].unique().tolist()
     else:
         cause_ids = ch[((ch["level"] == cause_level) | (
             (ch["level"] < cause_level) &
             (ch["most_detailed"] == 1)))]["cause_id"].unique().tolist()
     print(f"Found {len(cause_ids)} cause_ids, cause_set_id "
           f"{self.cause_set_id} at cause level {cause_level}")
     return cause_ids
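
The else branch keeps every cause sitting exactly at the requested level, plus any most-detailed cause that terminates above that level, which is what keeps the returned list mutually exclusive and collectively exhaustive. A toy illustration with a made-up hierarchy:

import pandas as pd

ch = pd.DataFrame({
    'cause_id':      [1, 2, 3, 4, 5],
    'level':         [1, 2, 2, 3, 3],
    'most_detailed': [0, 1, 0, 1, 1],
})
cause_level = 3
keep = ch[(ch['level'] == cause_level) |
          ((ch['level'] < cause_level) & (ch['most_detailed'] == 1))]
# [2, 4, 5]: the level-3 causes plus cause 2, which has no children below it
print(keep['cause_id'].unique().tolist())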
Example 9
def run_cod_age_sex_splitting(df, conn_def, cause_set_version_id, pop_run_id):
    cause_metadata = get_cause_metadata(
        cause_set_version_id=cause_set_version_id)
    possible_causes = cause_metadata['cause_id'].unique().tolist()
    for cause_id in df['cause_id'].unique().tolist():
        assert cause_id in possible_causes, "Cause ID {} not in hierarchy".format(
            cause_id)
    loc_meta = get_location_metadata(gbd_round_id=6, location_set_id=21)
    possible_locs = loc_meta['location_id'].tolist()
    df = df.loc[df['location_id'].isin(possible_locs), :]
    df = df.loc[df['best'] > 0, :]
    df['hi_best_ratio'] = df['high'] / df['best']
    df['lo_best_ratio'] = df['low'] / df['best']

    df = df.reset_index(drop=True)
    df['unique_join'] = df.index
    df_merge_later = df.loc[:,
                            ['unique_join', 'hi_best_ratio', 'lo_best_ratio']]
    df = df.drop(labels=['high', 'low', 'hi_best_ratio', 'lo_best_ratio'],
                 axis=1)
    splitter = AgeSexSplitter(cause_set_version_id=cause_set_version_id,
                              pop_run_id=pop_run_id,
                              distribution_set_version_id=62,
                              id_cols=['unique_join'],
                              value_column='best')
    split_df = splitter.get_computed_dataframe(df=df,
                                               location_meta_df=loc_meta)
    split_df = pd.merge(left=split_df,
                        right=df_merge_later,
                        on=['unique_join'],
                        how='left')
    split_df['low'] = split_df['best'] * split_df['lo_best_ratio']
    split_df['high'] = split_df['best'] * split_df['hi_best_ratio']
    split_df = split_df.drop(
        labels=['unique_join', 'lo_best_ratio', 'hi_best_ratio'], axis=1)
    return split_df
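
Both splitting functions above carry uncertainty through the split with the same bookkeeping: store high/best and low/best ratios per input row, split only 'best', then rebuild 'high' and 'low' by multiplying the split values by the stored ratios. A stripped-down sketch of that step, with the age/sex splitter faked by a simple 60/40 split:

import pandas as pd

df = pd.DataFrame({'best': [100.0], 'low': [80.0], 'high': [130.0]})
df['lo_best_ratio'] = df['low'] / df['best']   # 0.8
df['hi_best_ratio'] = df['high'] / df['best']  # 1.3

# pretend the splitter turned the single input row into two rows of 'best'
split = pd.DataFrame({'best': [60.0, 40.0],
                      'lo_best_ratio': df['lo_best_ratio'].iloc[0],
                      'hi_best_ratio': df['hi_best_ratio'].iloc[0]})
split['low'] = split['best'] * split['lo_best_ratio']
split['high'] = split['best'] * split['hi_best_ratio']
print(split[['low', 'best', 'high']])  # each row keeps the 0.8 / 1.3 ratios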
Example 10
def fix_maternal_denominators(df, return_only_maternal=False):

    asfr = get_covariate_estimates(QUERY)

    # keep age/location/year and the critical mean_value
    asfr = asfr[['location_id', 'year_id', 'age_group_id', 'sex_id',
                 'mean_value']]
    asfr.drop_duplicates(inplace=True)

    # map age_start and age_end onto asfr
    age_group = query("QUERY")
    pre_asfr = asfr.shape[0]
    asfr = asfr.merge(age_group, how='left', on='age_group_id')
    assert pre_asfr == asfr.shape[0], \
        "The merge duplicated rows unexpectedly"
    asfr.drop('age_group_id', axis=1, inplace=True)
    asfr.rename(columns={'age_group_years_start': 'age_start',
                         'age_group_years_end': 'age_end'},
                inplace=True)
    # create year_start and year_end
    asfr['year_start'] = asfr['year_id']
    asfr['year_end'] = asfr['year_id']
    asfr.drop('year_id', axis=1, inplace=True)

    # all the mean_values in asfr where age_end is less than one are 0, so we
    # can make up an asfr group for age start = 0 and age_end = 1
    asfr.loc[asfr['age_end'] < 1, 'age_end'] = 1
    asfr.loc[asfr['age_start'] < 1, 'age_start'] = 0

    asfr.loc[asfr['age_end'] > 1,
             'age_end'] = asfr.loc[asfr['age_end'] > 1, 'age_end'] - 1

    # one more change, asfr has the max age end as 125 (now 124), and we want
    # it to be 99
    asfr.loc[asfr['age_end'] == 124, 'age_end'] = 99  # now asfr age_start
    # and age_end match our hospital data

    # and in case we created duplicated rows by doing this:
    asfr.drop_duplicates(inplace=True)

    # MERGE ASFR ONTO HOSP
    pre_shape = df.shape[0]
    df = df.merge(asfr, how='left', on=['age_start', 'age_end', 'year_start',
                                        'year_end', 'location_id', 'sex_id'])
    assert df.mean_value.isnull().sum() != df.shape[0], \
        "The merge failed to attach any mean_values"
    assert pre_shape == df.shape[0], \
        "The merge duplicated rows unexpectedly"

    # GET MATERNAL CAUSES
    # query causes
    causes = get_cause_metadata(QUERY)
    condition = causes.path_to_top_parent.str.contains("366")
    
    maternal_causes = causes[condition]

    # make list of maternal causes
    maternal_list = list(maternal_causes['cause_id'].unique())

    # keep only the rows whose cause is in the maternal list
    maternal_df = df[df['cause_id'].isin(maternal_list)]
    assert maternal_df.shape[0] != 0, \
        "The maternal dataframe is empty"

    # keep only the rows whose cause is NOT in the maternal list
    df = df[~df['cause_id'].isin(maternal_list)]
    assert df.shape[0] != 0, \
        "The hospital dataframe is empty"
    for cause in maternal_list:
        
        maternal_df.loc[maternal_df['cause_id'] == cause, 'product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        maternal_df.loc[maternal_df['cause_id'] == cause, 'upper_product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'upper_product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        maternal_df.loc[maternal_df['cause_id'] == cause, 'lower_product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'lower_product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        # some mean_values were zero, this is effectively an age/sex restriction
        # assign these a rate of 0
        maternal_df.loc[(maternal_df['product'].isnull()) &
                        (maternal_df['cause_id'] == cause),
                        ['product', 'upper_product', 'lower_product']] = 0

        # assign infinite values to 0
        maternal_df.loc[(np.isinf(maternal_df['product'])) &
                        (maternal_df['cause_id'] == cause),
                        ['product', 'upper_product', 'lower_product']] = 0


    if return_only_maternal:
        maternal_df.drop(['mean_value', 'cause_id'], axis=1, inplace=True)
        return maternal_df
    else:
        df = pd.concat([df, maternal_df])  # bring data back together

        # DROP ASFR info
        df.drop(['mean_value', 'cause_id'], axis=1, inplace=True)

        return df
Example 11
def write_bundles(df,
                  write_location="test",
                  write_fixed_maternal=False,
                  extra_filename=""):

    assert write_location == 'test' or write_location == 'work', (
        "parameter "
        "write_location needs to be either 'test' or 'work', "
        "you put {}").format(write_location)

    if write_location == 'work':
        warnings.warn("""
                      write_location is set to work.
                      """)
        time.sleep(5)

    # drop bundle 'total_maternal', we don't want to write it
    df = df[df.bundle_id != 1010]

    # get injuries bundle_ids so we can keep injury corrected data later
    pc_injuries = pd.read_csv("FILEPATH")
    inj_bid_list = pc_injuries['Level1-Bundle ID'].unique()

    # CAUSE INFORMATION
    # get cause_id so we can write to an acause
    # have to go through cause_id to get to a relationship between BID &
    # acause
    cause_id_info = query("QUERY")
    # get acause
    acause_info = query("QUERY")
    # merge acause, bid, cause_id info together
    acause_info = acause_info.merge(cause_id_info, how="left", on="cause_id")

    # REI INFORMATION
    # get rei_id so we can write to a rei
    rei_id_info = query("QUERY")
    # get rei
    rei_info = query("QUERY")
    # merge rei, bid, rei_id together into one dataframe
    rei_info = rei_info.merge(rei_id_info, how="left", on="rei_id")

    # COMBINE REI AND ACAUSE
    # rename acause to match
    acause_info.rename(columns={
        'cause_id': 'cause_rei_id',
        'acause': 'acause_rei'
    },
                       inplace=True)
    # rename rei to match
    rei_info.rename(columns={
        'rei_id': 'cause_rei_id',
        'rei': 'acause_rei'
    },
                    inplace=True)

    # concat rei and acause together
    folder_info = pd.concat([acause_info, rei_info])

    # drop rows that don't have bundle_ids
    folder_info = folder_info.dropna(subset=['bundle_id'])

    # drop cause_rei_id, because we don't need it for getting data into
    # folders
    folder_info.drop("cause_rei_id", axis=1, inplace=True)

    # drop duplicates, just in case there are any
    folder_info.drop_duplicates(inplace=True)

    # MERGE ACAUSE/REI COMBO COLUMN ONTO DATA BY BUNDLE ID
    # there are NO null acause_rei entries!
    df = df.merge(folder_info, how="left", on="bundle_id")

    if write_fixed_maternal:
        # this is basically just a double check that we're only writing
        # data for maternal causes
        # GET MATERNAL CAUSES
        causes = get_cause_metadata(cause_set_id=9)
        condition = causes.path_to_top_parent.str.contains("366")

        # subset just causes that meet the condition sdf
        maternal_causes = causes[condition]

        # make list of maternal causes
        maternal_list = list(maternal_causes['acause'].unique())

        # keep only maternal causes
        df = df[df['acause_rei'].isin(maternal_list)]
        # drop the denominator bundle
        df = df[df['bundle_id'] != 1010]

    start = time.time()
    bundle_ids = df['bundle_id'].unique()

    # prevalence, incidence should be lower case
    df['measure'] = df['measure'].str.lower()

    readme = pd.read_excel("FILEPATH")

    columns_before = df.columns

    ordered = [
        'seq', 'input_type', 'underlying_nid', 'nid', 'source_type',
        'bundle_id', 'bundle_name', 'location_id', 'location_name', 'sex',
        'year_start', 'year_end', 'age_start', 'age_end', 'measure', 'mean_0',
        'lower_0', 'upper_0', 'mean_1', 'lower_1', 'upper_1',
        'correction_factor_1', 'mean_2', 'lower_2', 'upper_2',
        'correction_factor_2', 'mean_3', 'lower_3', 'upper_3',
        'correction_factor_3', 'mean_inj', 'lower_inj', 'upper_inj',
        'correction_factor_inj', 'standard_error', 'cases',
        'effective_sample_size', 'sample_size', 'unit_type',
        'unit_value_as_published', 'uncertainty_type',
        'uncertainty_type_value', 'representative_name', 'urbanicity_type',
        'recall_type', 'recall_type_value', 'sampling_type', 'response_rate',
        'design_effect', 'extractor', 'is_outlier', 'acause_rei'
    ]
    df = df[ordered]
    columns_after = df.columns
    assert set(columns_after) == set(columns_before),\
        "the columns {} were added/lost while changing column order"\
        .format(set(columns_after).symmetric_difference(set(columns_before)))

    # adjust min age_end to 0.999 instead of 1
    df.loc[df['age_start'] == 0, 'age_end'] = 0.999

    print("BEGINNING WRITING, THE START TIME IS {}".format(
        time.strftime('%X %x %Z')))
    failed_bundles = []  # initialize empty list to append to in this for loop
    counter = 0  # initialize counter to report how close we are to done
    length = len(bundle_ids)
    for bundle in bundle_ids:
        counter += 1
        completeness = float(counter) / length * 100
        print r"{}% done".format(completeness)
        # subset bundle data
        df_b = df[df['bundle_id'] == bundle].copy()

        # drop columns based on measure - inc/prev/injury
        # if the measure is prev: keep all 3 correction factors
        # if measure is inc and not an inj: keep 2 correction factors
        # if measure is inc and an inj: keep only injury correction factor
        df_b_measure = df_b.measure.unique()[0]
        if df_b.bundle_id.isin(inj_bid_list).all():
            df_b.drop([
                'mean_1', 'upper_1', 'lower_1', 'correction_factor_1',
                'mean_2', 'upper_2', 'lower_2', 'correction_factor_2',
                'mean_3', 'upper_3', 'lower_3', 'correction_factor_3'
            ],
                      axis=1,
                      inplace=True)
        if ((df_b_measure == 'incidence')
                and not (df_b.bundle_id.isin(inj_bid_list).all())):

            df_b.drop([
                'mean_inj', 'upper_inj', 'lower_inj', 'correction_factor_inj'
            ],
                      axis=1,
                      inplace=True)
        if df_b_measure == 'prevalence':
            df_b.drop([
                'mean_inj', 'upper_inj', 'lower_inj', 'correction_factor_inj'
            ],
                      axis=1,
                      inplace=True)

        acause_rei = str(df_b.acause_rei.unique()[0])
        df_b.drop('acause_rei', axis=1, inplace=True)

        if write_location == 'test':
            writedir = ("FILEPATH")
        elif write_location == 'work':
            writedir = ("FILEPATH")

        if not os.path.isdir(writedir):
            os.makedirs(writedir)  # make the directory if it does not exist

        # write for modelers
        # make path
        vers_id = "v8"  # last one was v6, should have been v7
        date = datetime.datetime.today().strftime("%Y_%m_%d")  # YYYY-MM-DD
        if write_fixed_maternal:
            extra_filename = "_adjusted_denominator"
        bundle_path = "{}{}_{}_{}{}.xlsx".\
            format(writedir, int(bundle), vers_id, date, extra_filename)
        print r"Now writing at {}".format(bundle_path)

        # try to write to modelers' folders
        try:
            writer = pd.ExcelWriter(bundle_path, engine='xlsxwriter')
            df_b.to_excel(writer, sheet_name="extraction", index=False)
            readme.to_excel(writer, sheet_name='README', index=False)
            writer.save()
        except Exception:
            failed_bundles.append(bundle)  # if it fails for any reason
            # make note of it
    end = time.time()

    text = open("FILEPATH")
    text.write("function: write_bundles " + "\n" + "start time: " +
               str(start) + "\n" + " end time: " + str(end) + "\n" +
               " run time: " + str((end - start) / 60.0) + " minutes")
    text.close()

    print("DONE WRITING, THE CURRENT TIME IS {}".format(
        time.strftime('%X %x %Z')))
    return failed_bundles
Example 12
def get_cause_hierarchy(cause_set_id=4):
    '''Return the current cause hierarchy for the given cause set.'''
    causes_df = get_cause_metadata(cause_set_id=cause_set_id)
    return causes_df
Example 13
def fix_maternal_denominators(df, return_only_maternal=False):

    # At this point, data will have bundle_id and cause_id on it,
    #   but it has not been collapsed to those levels. It is at the
    #   baby seq level, but as of 4-24-2017 data will be at bundle level.
    # 2) acquire asfr from the database
    # 3) attach age_start and age_end to asfr, and create year_start and
    #    year_end out of year_id
    # 4) attach asfr to the hospital data
    # 5) where cause_id is a maternal cause, do the division
    # 6) then drop all the asfr info, namely 'mean_value'

    # GET ASFR
    # has age/location/year
    asfr = get_covariate_estimates(covariate_id=13)

    # keep age/location/year and the critical mean_value
    asfr = asfr[[
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'mean_value'
    ]]
    asfr.drop_duplicates(inplace=True)

    # map age_start and age_end onto asfr
    age_group = query("QUERY")
    pre_asfr = asfr.shape[0]
    asfr = asfr.merge(age_group, how='left', on='age_group_id')
    assert pre_asfr == asfr.shape[0], \
        "The merge duplicated rows unexpectedly"
    asfr.drop('age_group_id', axis=1, inplace=True)
    asfr.rename(columns={
        'age_group_years_start': 'age_start',
        'age_group_years_end': 'age_end'
    },
                inplace=True)
    # create year_start and year_end
    asfr['year_start'] = asfr['year_id']
    asfr['year_end'] = asfr['year_id']
    asfr.drop('year_id', axis=1, inplace=True)

    # The below commented-out line of code was very wrong: asfr has three
    # under-one-year-old age groups, but our data is just 0-1 years old.
    # Additionally, this would turn age ends like 1 into zero, which is wrong.
    # asfr['age_end'] = asfr['age_end'] - 1

    # all the mean_values in asfr where age_end is less than one are 0, so we
    # can make up an asfr group for age start = 0 and age_end = 1
    asfr.loc[asfr['age_end'] < 1, 'age_end'] = 1
    asfr.loc[asfr['age_start'] < 1, 'age_start'] = 0

    # THIS IS SO IMPORTANT, our data has
    # age_end as 14, 19, 24, while asfr has age_end as 15, 20, 25 ...
    asfr.loc[asfr['age_end'] > 1,
             'age_end'] = asfr.loc[asfr['age_end'] > 1, 'age_end'] - 1

    # one more change, asfr has the max age end as 125 (now 124), and we want
    # it to be 99
    asfr.loc[asfr['age_end'] == 124, 'age_end'] = 99  # now asfr age_start
    # and age_end match our hospital data

    # and in case we created duplicated rows by doing this:
    asfr.drop_duplicates(inplace=True)

    # MERGE ASFR ONTO HOSP
    pre_shape = df.shape[0]
    df = df.merge(asfr,
                  how='left',
                  on=[
                      'age_start', 'age_end', 'year_start', 'year_end',
                      'location_id', 'sex_id'
                  ])
    assert df.mean_value.isnull().sum() != df.shape[0], \
        "The merge failed to attach any mean_values"
    assert pre_shape == df.shape[0], \
        "The merge duplicated rows unexpectedly"

    # GET MATERNAL CAUSES
    # query causes
    causes = get_cause_metadata(cause_set_id=9)
    condition = causes.path_to_top_parent.str.contains("366")  # 366 happens
    # to always be in the third level

    # subset to just the causes that meet the condition
    maternal_causes = causes[condition]

    # make list of maternal causes
    maternal_list = list(maternal_causes['cause_id'].unique())

    # subset out parts of data that have asfr info
    # loop over cause_ids that are in maternal_list
    # divide 'mean' by 'mean_value' and overwrite mean, upper, or lower,
    # as relevant.
    # keep only the rows whose cause is in the maternal list
    maternal_df = df[df['cause_id'].isin(maternal_list)]
    assert maternal_df.shape[0] != 0, \
        "The maternal dataframe is empty"

    # keep only the rows whose cause is NOT in the maternal list
    df = df[~df['cause_id'].isin(maternal_list)]
    assert df.shape[0] != 0, \
        "The hospital dataframe is empty"
    for cause in maternal_list:
        # the line breaks are weird looking but this is just assigning a value
        # to the result of division
        maternal_df.loc[maternal_df['cause_id'] == cause, 'product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        maternal_df.loc[maternal_df['cause_id'] == cause, 'upper_product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'upper_product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        maternal_df.loc[maternal_df['cause_id'] == cause, 'lower_product'] =\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'lower_product'] /\
            maternal_df.loc[maternal_df['cause_id'] == cause, 'mean_value']
        # some mean_values were zero, this is effectively an age/sex restriction
        # assign these a rate of 0
        maternal_df.loc[(maternal_df['product'].isnull()) &
                        (maternal_df['cause_id'] == cause),
                        ['product', 'upper_product', 'lower_product']] = 0

        # assign infinite values to 0
        maternal_df.loc[(np.isinf(maternal_df['product'])) &
                        (maternal_df['cause_id'] == cause),
                        ['product', 'upper_product', 'lower_product']] = 0

    if return_only_maternal:
        maternal_df.drop(['mean_value', 'cause_id'], axis=1, inplace=True)
        return maternal_df
    else:
        df = pd.concat([df, maternal_df])  # bring data back together

        # DROP ASFR info
        df.drop(['mean_value', 'cause_id'], axis=1, inplace=True)

        return df
Example 14
def prep_child_to_available_parent_map(cause_set_id,
                                       gbd_round_id,
                                       available_cause_ids,
                                       as_dict=False):
    """Prep a mapping of cause_id to the most detailed available parent.

    For a given cause hierarchy, and a list of "available" causes, return
    a mapping from each cause in the hierarchy to the most detailed available
    cause that is "available" and in that cause's path_to_top_parent.
    "available": icg_id/cause_id are present in the weights
    dataframe

    Arguments:
        cause_set_id (int): from shared.cause_set in the database
        gbd_round_id (int): from shared.gbd_round in the database
            together, cause_set_id and gbd_round_id determine the
            active cause set version id to use from
            shared.cause_hierarchy_history
        available_cause_ids (list of ints): all must be cause ids in
            shared.cause
        as_dict (bool): If False, returns a dataframe instead of a dict

    Returns:
        cause_map (dict): a dictionary from cause_id to available_cause_id
        or, if as_dict == False: a dataframe ['cause_id', 'parent_cause_id']

    This function isn't used if the data that is being split is at the
    icg_id level.
    """
    causes = db_queries.get_cause_metadata(cause_set_id=cause_set_id,
                                           gbd_round_id=gbd_round_id)
    cause_levels = causes.path_to_top_parent.str.split(',').apply(pd.Series, 1)
    cause_tree = pd.concat(
        [causes[['cause_id', 'path_to_top_parent', 'level']], cause_levels],
        axis=1)
    cause_tree = cause_tree.drop(['path_to_top_parent', 'level'], axis=1)
    cause_tree = cause_tree.set_index(['cause_id']).stack().reset_index()
    cause_tree = cause_tree.rename(columns={
        'level_1': 'par_level',
        0: 'parent_cause_id'
    })
    cause_tree['parent_cause_id'] = cause_tree['parent_cause_id'].astype(int)

    cause_availability = {c: 1 for c in available_cause_ids}
    cause_tree['available'] = \
        cause_tree['parent_cause_id'].map(cause_availability).fillna(0)

    available_cause_map = cause_tree.query('available == 1')
    available_cause_map['max_level_available'] = \
        available_cause_map.groupby('cause_id')['par_level'].transform(max)
    available_cause_map = available_cause_map.query(
        'par_level == max_level_available')
    available_cause_map = available_cause_map[['cause_id', 'parent_cause_id']]
    assert not available_cause_map[['cause_id']].duplicated().any()
    missing = set(causes.cause_id) - set(available_cause_map['cause_id'])
    if len(missing) > 0:
        raise AssertionError(
            "Was not able to find parent in given available cause "
            "ids list for these cause ids: \n{}".format(missing))

    if as_dict:
        available_cause_map = available_cause_map.set_index('cause_id')
        available_cause_map = available_cause_map.to_dict()['parent_cause_id']
    return available_cause_map
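
# Toy demonstration (not from the original source) of the explode step used in
# prep_child_to_available_parent_map above: path_to_top_parent is split into
# one row per (cause, ancestor) pair, and the deepest "available" ancestor per
# cause is later kept. str.split(expand=True) is equivalent to the
# apply(pd.Series) call above; the two-cause hierarchy below is made up, and
# pandas is assumed to already be imported as pd in this module.
_toy_causes = pd.DataFrame({
    'cause_id': [367, 500],
    'path_to_top_parent': ['294,366,367', '294,500'],
})
_toy_levels = _toy_causes.path_to_top_parent.str.split(',', expand=True)
_toy_tree = pd.concat([_toy_causes[['cause_id']], _toy_levels], axis=1)
_toy_tree = _toy_tree.set_index('cause_id').stack().reset_index()
_toy_tree.columns = ['cause_id', 'par_level', 'parent_cause_id']
_toy_tree['parent_cause_id'] = _toy_tree['parent_cause_id'].astype(int)
# cause 367 -> ancestors 294, 366, 367 at par_level 0, 1, 2
# cause 500 -> ancestors 294, 500 at par_level 0, 1
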
def aggregate_to_bundle(df, write_maternal_denom=False,
                        adjust_maternal_denom=False):
    """
    Takes a dataframe aggregated to baby sequela level and returns
    the df aggregated to 5 year bands at the bundle level
    """
    ##############################
    # PULL POPULATION
    #
    # This is preparation for aggregating to 5 year bands
    ##############################

    # Get rid of all corrected values and correction factors. We want to do
    # this because we are going to aggregate to 5 year bands and the bundle_id
    # level of analysis. We intend to make cells where the correction factor
    # is over 50 NULL. It doesn't make sense to do that before aggregating,
    # because any rows with nulls in them would be lost entirely. It doesn't
    # make sense to apply corrections and NOT delete cells that break the
    # over-50 rule before aggregation either, because we would lose the
    # correction factors during the groupby. Also, it would not be robust to
    # aggregate 16 columns. Therefore, it's best to apply corrections after
    # aggregating.

    # Technically, the groupby would get rid of these for us. Honestly, this
    # just makes things easier to work with up to that point.
    df.drop(['mean_1', 'mean_2', 'mean_3', 'mean_inj',
             'upper_1', 'upper_2', 'upper_3', 'upper_inj',
             'lower_1', 'lower_2', 'lower_3', 'lower_inj',
             'correction_factor_1', 'correction_factor_2',
             'correction_factor_3', 'correction_factor_inj'],
            axis=1, inplace=True)

    if write_maternal_denom:
        df = df[df['bundle_id'] == 1010]
    # drop the non-maternal data right away
    if adjust_maternal_denom:
        # GET MATERNAL CAUSES
        causes = get_cause_metadata(cause_set_id=9)
        condition = causes.path_to_top_parent.str.contains("366")  # 366 happens
        # to always be in the third level
        # subset to just the causes that meet the condition
        maternal_causes = causes[condition]
        # make list of maternal causes
        maternal_list = list(maternal_causes['cause_id'].unique())
        # get bundle to cause map
        bundle_cause = query("QUERY")
        # merge cause_id onto data
        df = df.merge(bundle_cause, how='left', on='bundle_id')
        # keep only maternal causes
        df = df[df['cause_id'].isin(maternal_list)]
        # drop cause_id
        df.drop('cause_id', axis=1, inplace=True)
        # drop the denominator bundle
        df = df[df['bundle_id'] != 1010]
    # pull age_group to age_start/age_end map

    ##############################
    # GET POPULATION
    ##############################
    age_group = query("QUERY")
    # correct age groups
    age_group.loc[age_group['age_group_years_end'] > 1, 'age_group_years_end'] = \
        age_group.loc[age_group['age_group_years_end'] > 1,
                      'age_group_years_end'] - 1

    # df of unique ages from hospital data to merge onto age_group map
    df_ages = pd.DataFrame([df.age_start.unique(), df.age_end.unique()]).transpose()
    df_ages.columns = ['age_group_years_start', 'age_group_years_end']
    df_ages = df_ages.merge(age_group, how='left', on=['age_group_years_start',
                                                       'age_group_years_end'])

    # this is the correct terminal age group (even though we use max age = 99)
    df_ages.loc[df_ages['age_group_years_start'] == 95, 'age_group_id'] = 235

    # there are two age_group_ids for age_start=0 and age_start=1
    df_ages = df_ages[df_ages.age_group_id != 161]

    # create age/year/location lists to use for pulling population
    age_list = list(df_ages.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())

    # pull population and merge on age_start and age_end
    pop = get_population(age_group_id=age_list, location_id=loc_list,
                         sex_id=[1, 2], year_id=year_list)

    # attach age_start and age_end to population information
    pop = pop.merge(age_group, how='left', on='age_group_id')
    pop.drop(['process_version_map_id', 'age_group_id'], axis=1, inplace=True)

    # rename pop columns to match hospital data columns
    pop.rename(columns={'age_group_years_start': 'age_start',
                        'age_group_years_end': 'age_end',
                        'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']

    # correct terminal age group to match our data
    pop.loc[pop['age_end'] == 124, 'age_end'] = 99

    demography = ['location_id', 'year_start', 'year_end', 'age_start',
                  'age_end', 'sex_id']

    ##############################
    # MAKE DATA SQUARE
    ##############################

    # create a series of sorted, non-zero mean values to make sure
    # the func doesn't alter anything
    check_mean = df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().\
        reset_index(drop=True)

    print("Starting number of rows: {}".format(df.shape[0]))
    # square the dataset
    df = hosp_prep.make_zeroes(df, level_of_analysis='bundle_id',
            cols_to_square=['mean_0', 'upper_0', 'lower_0'],
            icd_len=5)
    # assert the sorted means are identical
    assert (check_mean == df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().\
        reset_index(drop=True)).all()


    # delete rows where restrictions should be applied
    # create df where baby sequela are missing
    missing_nfc = df[df.nonfatal_cause_name.isnull()].copy()
    df = df[df.nonfatal_cause_name.notnull()]
    for col in ['mean_0', 'upper_0', 'lower_0']:
        df = hosp_prep.apply_restrictions(df, col)
        missing_nfc = hosp_prep.apply_bundle_restrictions(missing_nfc, col)
        df = df[df[col].notnull()]

    df = pd.concat([df, missing_nfc])
    print("Square number of rows: {}".format(df.shape[0]))
    # don't create the parent injury dupes until after the data is totally
    # square so that the denominators will match
    # df = get_parent_injuries(df)

    # check_parent_injuries(df, 'mean_0')

    pre_shape = df.shape[0]  # store for before comparison
    # then merge population onto the hospital data

    df = df.merge(pop, how='left', on=demography)  # attach pop info to hosp
    assert pre_shape == df.shape[0], "number of rows don't match after merge"

    ##############################
    # RATE SPACE TO COUNTS
    ##############################

    # go from rate space to additive counts
    # because of careful merging we just need to multiply across columns
    df['hosp_count'] = df['mean_0'] * df['population']
    df['upper_hosp_count'] = df['upper_0'] * df['population']
    df['lower_hosp_count'] = df['lower_0'] * df['population']

    # merge on "denominator" from file that was made back in
    # create_cause_fractions. This adds the denominator column, which is the
    # number of admissions in a demographic group
    df = df.merge(pd.read_csv("FILEPATH"),
                  how='left', on=["age_start", "age_end", "sex_id",
                                  "year_start", "year_end", "location_id"])


    # 5 year bins
    df = hosp_prep.year_binner(df)

    # add 5 year NIDs onto data
    df = hosp_prep.five_year_nids(df)


    ##################################################################
    ##################################################################
    #                         THE COLLAPSE                           #
    ##################################################################
    ##################################################################
    # The final collapse to 5 year bands, 5 year nids and bundle ID
    # this is doing a few things at once.  One is that we need to aggregate
    # to 5 year bands.  Another is aggregating to the Bundle_id level of
    # analysis.  Up to this point we were at the nonfatal_cause_name AKA
    # baby sequelae level of analysis.

    # WE NEED TWO COLLAPSES. One for data that doesn't have a sample_size value,
    # and another for the data that does.  This is because:
    # https://goo.gl/e66OZ4 and https://goo.gl/Fb78xi

    # make df of data where there is full coverage (i.e., the UK)
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    # make condition mask that indicates rows that have full coverage
    has_full_coverage = df.source.isin(full_coverage_sources)

    covered_df = df[has_full_coverage]

    # drop "denominator" from covered_df
    covered_df = covered_df.drop("denominator", axis=1)

    # drop this data from the main dataframe
    df = df[~has_full_coverage]

    assert (df.loc[df.denominator.isnull(), 'mean_0'] == 0).all(), ("mean_0"
        " should be 0")
    assert (df.loc[df.denominator.isnull(), 'lower_0'] == 0).all(), ("lower_0"
        " should be 0")
    assert (df.loc[df.denominator.isnull(), 'upper_0'] == 0).all(), ("upper_0"
        " should be 0")

    df = df[df.denominator.notnull()]

    # df already has sample size
    df.drop("sample_size", axis=1, inplace=True)
    # check if cases are lost in the groupby

    # rename "denominator" to "sample_size" in df (not covered_df)
    df.rename(columns={"denominator": "sample_size"}, inplace=True)

    pre_cases = df['hosp_count'].sum()
    # can use the same group columns for both dataframes
    groups = ['location_id', 'year_start', 'year_end',
              'age_start', 'age_end', 'sex_id', 'nid',
              'representative_id',
              'bundle_id']

    # sample_size has some null values from being made square, but it was just
    # population, so we're using pop instead. so remember,
    # population == sample_size for covered_df
    covered_df = covered_df.groupby(groups)\
        .agg({'hosp_count':'sum',
              'population': 'sum'}).reset_index()


    # add "sample_size" to the aggregate function
    df = df.groupby(groups).agg({'hosp_count': 'sum',
                                 'upper_hosp_count': 'sum',
                                 'lower_hosp_count': 'sum',
                                 'population': 'sum',
                                 'sample_size': 'sum'}).reset_index()
    assert round(pre_cases, 0) == round(df['hosp_count'].sum(), 0),\
        ("some cases were lost. "
         "From {} to {}".format(pre_cases, df['hosp_count'].sum()))

    # set sample size to np.nan when mean/upper/lower are greater than 0
    df.loc[(df['hosp_count'] > 0) & (df['lower_hosp_count'] > 0) &
           (df['upper_hosp_count'] > 0), 'sample_size'] = np.nan
    ##############################
    # COUNTS TO RATE SPACE
    ##############################
    # REMAKE mean and uncertainty
    # for the main df:

    df['mean_0'] = df['hosp_count'] / df['population']
    df['lower_0'] = df['lower_hosp_count'] / df['population']
    df['upper_0'] = df['upper_hosp_count'] / df['population']
    df.drop(['hosp_count', 'lower_hosp_count', 'upper_hosp_count'], axis=1,
            inplace=True)

    # add parent injuries
    # NOTE get_parent_injuries is run before covered_df is concatenated with
    # df. It happens to not make a difference at the moment, because there are
    # no injuries in covered_df, but it could in the future.
    df = get_parent_injuries(df)
    for col in ['mean_0', 'lower_0', 'upper_0']:
        hosp_prep.check_parent_injuries(df, col_to_sum=col)

    # this drops the population that was merged on for converting to counts.
    df.drop('population', axis=1, inplace=True)  # don't need pop anymore

    # for the covered df:
    covered_df['mean_0'] = covered_df['hosp_count'] / covered_df['population']

    covered_df.rename(columns={"population": "sample_size"}, inplace=True)

    # drop columns
    covered_df.drop(['hosp_count'], axis=1, inplace=True)

    ###############################
    # RE-ATTACH
    ###############################
    # bring covered_df and df together.
    # where we have full coverage, lower and upper should be null
    # mean_0 will never be null

    df = pd.concat([df, covered_df], ignore_index=True)

    # assert what we just said will be true in the comments above:
    assert df.loc[has_full_coverage, 'lower_0'].isnull().all(), ("where we have"
        " full coverage, lower_0 should be null")
    assert df.loc[has_full_coverage, 'upper_0'].isnull().all(), ("where we have"
        " full coverage, upper_0 should be null")
    assert df.mean_0.notnull().all(), ("mean_0 should never be null")

    # NOTE, remember, sample_size will still have null values, and that's okay.
    # We need to keep sample_size from here on out.

    if "population" in df.columns:
        print "population was still in columns"
        df.drop("population", axis=1, inplace=True)


    ########################################

    # map measure onto data, just need this for ELMO reqs.
    clean_maps = pd.read_csv("FILEPATH")
    clean_maps = clean_maps[['bundle_id', 'bid_measure']]
    clean_maps.drop_duplicates(inplace=True)
    clean_maps.rename(columns={'bid_measure': 'measure'}, inplace=True)
    # remove null bundles from map
    clean_maps = clean_maps[clean_maps.bundle_id.notnull()]

    pre_shape = df.shape[0]  # store for comparison after merge

    # merge measure onto hosp data using bundle_id
    df = df.merge(clean_maps, how='left', on='bundle_id')
    assert pre_shape == df.shape[0], "number of rows don't match after merge."

    # get injuries bids so we can check for missing measures
    pc_injuries = pd.read_csv("FILEPATH")
    inj_bids = pc_injuries['Level1-Bundle ID'].unique()

    # some injuries bids didn't get measures!
    assert set(df[df.measure.isnull()].bundle_id).issubset(set(inj_bids)),\
        ("We expect that all null measures belong to injuries, but that is"
         "not the case. Something went wrong!")

    # fix any injuries that are missing measure, all inj are inc:
    df.loc[(df.measure.isnull()) & (df.bundle_id.isin(inj_bids)), 'measure'] = 'inc'

    assert df.measure.isnull().sum() == 0, ("There are null values and we "
        "expect none")

    # read in correction factors (again)
    correction_factors = pd.read_csv("FILEPATH")

    correction_factors.drop("outpatient", axis=1, inplace=True)

    # rename columns to match df
    correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)

    # merge corr factors onto data
    df = df.merge(correction_factors, how='left', on=['age_start', 'sex_id',
                  'bundle_id'])
    assert pre_shape == df.shape[0] , ("You unexpectedly added rows while "
        "merging on the correction factors. Don't do that!")

    # if a Bundle ID doesn't have a corr factor from marketscan use 1
    # df.update(df[['a','b','c']].fillna(0))  # test code
    # http://stackoverflow.com/questions/36556256/how-do-i-fill-na-values-in-multiple-columns-in-pandas
    df.update(df[['indv_cf', 'incidence', 'prevalence', 'injury_cf']].fillna(1))

    # rename correction factors to match what we told people they would be
    df.rename(columns={'indv_cf': 'correction_factor_1',
                       'incidence': 'correction_factor_2',
                       'prevalence': 'correction_factor_3',
                       'injury_cf': 'correction_factor_inj'},
                       inplace=True)

    # NOTE we apply every correction factor to all data, even if it is not
    # relevant.  E.g., not all data is injuries, so not all data needs
    # correction_factor_inj.  It's simply easier to apply all of them, and then
    # while writing to modelers' folders, drop the irrelevant columns.

    # make mean_1, lower_1, upper_1
    df['mean_1'] = df.correction_factor_1 * df.mean_0
    df['lower_1'] = df.correction_factor_1 * df.lower_0
    df['upper_1'] = df.correction_factor_1 * df.upper_0

    # make mean_2, lower_2, upper_2
    df['mean_2'] = df.correction_factor_2 * df.mean_0
    df['lower_2'] = df.correction_factor_2 * df.lower_0
    df['upper_2'] = df.correction_factor_2 * df.upper_0

    # make mean_3, lower_3, upper_3
    df['mean_3'] = df.correction_factor_3 * df.mean_0
    df['lower_3'] = df.correction_factor_3 * df.lower_0
    df['upper_3'] = df.correction_factor_3 * df.upper_0

    # make injury mean, lower, upper
    df['mean_inj'] = df.correction_factor_inj * df.mean_0
    df['lower_inj'] = df.correction_factor_inj * df.lower_0
    df['upper_inj'] = df.correction_factor_inj * df.upper_0

    # assert what we just said will be true in the comments above:
    levels=["1", "2", "3", "inj"]
    for level in levels:
        assert df.loc[has_full_coverage, 'lower_{}'.format(level)].isnull().all(), ("broke on level {}".format(level))
        assert df.loc[has_full_coverage, 'upper_{}'.format(level)].isnull().all(), ("broke on level {}".format(level))
        assert df["mean_{}".format(level)].notnull().all(), ("broke on level {}".format(level))

    def factor_applier(df, levels=["1", "2", "3", "inj"]):
        for level in levels:
            # apply this level's correction factor to mean, lower, and upper
            df['test_mean_' + level] = df['correction_factor_' + level] * df['mean_0']
            df['test_lower_' + level] = df['correction_factor_' + level] * df['lower_0']
            df['test_upper_' + level] = df['correction_factor_' + level] * df['upper_0']
        return df
    df = factor_applier(df)

    levels=["1", "2", "3", "inj"]
    for level in levels:
        assert (df.loc[df["mean_" + level].notnull(), "mean_" + level] == df.loc[df["test_mean_" + level].notnull(), "test_mean_" + level]).all(), ("different on level {}".format(level))
        assert (df.loc[df["upper_" + level].notnull(), "upper_" + level] == df.loc[df["test_upper_" + level].notnull(), "test_upper_" + level]).all(), ("different on level {}".format(level))
        assert (df.loc[df["lower_" + level].notnull(), "lower_" + level] == df.loc[df["test_lower_" + level].notnull(), "test_lower_" + level]).all(), ("different on level {}".format(level))
    # drop test cols for now until we run this for awhile without
    # tripping the assert
    test_cols = df.columns[df.columns.str.startswith("test_")]
    df.drop(test_cols, axis=1, inplace=True)

    # RULE = if correction factor is greater than 50, make the data null
    # EXCEPTIONS are made for these bundles, which are capped at 100:
        # Preterm: 80, 81, 82, 500
        # Encephalopathy: 338
        # Sepsis: 92
        # Hemoloytic: 458
        # PAD/PUD: 345
        # Cirrhosis: 131

    # list of bundles which can have correction factors above 50
    cf_exceptions = [345, 80, 81, 82, 500, 338, 92, 458, 131]

    # NOTE when checking the number of nulls, consider the nulls that are caused
    # by the sample_size split

    def mean_capper(df, exceptions, levels=["1", "2", "3"]):
        exception_condition = df.bundle_id.isin(exceptions)
        for level in levels:
            df.loc[(~exception_condition) & (df['correction_factor_' + level] > 50),
                   ['mean_' + level, 'lower_' + level, 'upper_' + level]] = np.nan
            df.loc[(exception_condition) & (df['correction_factor_' + level] > 100),
                   ['mean_' + level, 'lower_' + level, 'upper_' + level]] = np.nan

        return df
    # create df to test the function (pass a copy so the original df is untouched)
    df_test_capper = mean_capper(df.copy(), cf_exceptions)


    # make boolean mask that says if a bundle is in the exceptions list
    exception_condition = df.bundle_id.isin(cf_exceptions)

    # DOESN'T DO ANYTHING, don't really need to apply here
    df.loc[(~exception_condition) & (df.correction_factor_1 > 50),
           ['mean_1', 'lower_1', 'upper_1']] = np.nan
    # DID DO SOMETHING, affects 6% of prev rows, 0.01% of inc rows
    df.loc[(~exception_condition) & (df.correction_factor_2 > 50),
           ['mean_2', 'lower_2', 'upper_2']] = np.nan
    # DOES A LOT, affects 57% of prevalence rows
    df.loc[(~exception_condition) & (df.correction_factor_3 > 50),
           ['mean_3', 'lower_3', 'upper_3']] = np.nan

    df.loc[(exception_condition) & (df.correction_factor_1 > 100),
           ['mean_1', 'lower_1', 'upper_1']] = np.nan
    df.loc[(exception_condition) & (df.correction_factor_2 > 100),
           ['mean_2', 'lower_2', 'upper_2']] = np.nan
    df.loc[(exception_condition) & (df.correction_factor_3 > 100),
           ['mean_3', 'lower_3', 'upper_3']] = np.nan


    #####################################################
    # CHECK that lower < mean < upper
    #####################################################
    # loop over every level of correction
    # can't compare null values, null comparisons always eval to False
    for i in ["0", "1", "2", "3", 'inj']:
        # lower < mean
        assert (df.loc[df['lower_'+i].notnull(), 'lower_'+i] <=
                df.loc[df["lower_"+i].notnull(), 'mean_'+i]).all(),\
            "lower_{} should be less than mean_{}".format(i, i)
        # mean < upper
        assert (df.loc[df["upper_"+i].notnull(), 'mean_'+i] <=
                df.loc[df["upper_"+i].notnull(), 'upper_'+i]).all(),\
            "mean_{} should be less than upper_{}".format(i, i)

    # compare the results between test df and proper df
    for uncertainty in ["mean", "upper", "lower"]:
        for level in ["1", "2", "3"]:
            # compare the sum of nulls rows between dfs
            assert df[uncertainty + "_" + level].isnull().sum() ==\
                df_test_capper[uncertainty + "_" + level].isnull().sum(),\
                "The new capping function is producing different results"

    # write the maternal denominator data, this is for the future when we work
    # in parallel
    if write_maternal_denom:
        def write_maternal_denom(df):
            mat_df = df[df.bundle_id==1010].copy()
            mat_df = mat_df.query("sex_id == 2 & age_start >=10 & age_end <=54")

            if mat_df.shape[0] == 0:
                return
            # NOTE sample size is dropped here, and we make a new one in the
            # following code
            mat_df = mat_df[['location_id', 'year_start', 'year_end',
                             'age_start', 'age_end', 'sex_id',
                             'mean_0', 'mean_1', 'mean_2', 'mean_3']].copy()

            bounds = ['upper_0', 'upper_1', 'upper_2', 'upper_3',
                        'lower_0', 'lower_1', 'lower_2', 'lower_3']
            for uncertainty in bounds:
                mat_df[uncertainty] = np.nan

            # PREP FOR POP ####################################################
            # the data were aggregated to year bands, so there is no single
            # year to merge population onto; use a representative mid-band year
            mat_df['year_id'] = mat_df.year_start + 2  # makes 2000, 2005, 2010

            # map age_start/age_end onto GBD age_group_id so population can be pulled
            age_group = query("QUERY")
            # shift age_group_years_end down by one so it matches the
            # inclusive age_end convention used in the hospital data
            age_group.loc[age_group['age_group_years_end'] > 1,
                          'age_group_years_end'] =\
                age_group.loc[age_group['age_group_years_end'] > 1,
                              'age_group_years_end'] - 1

            # df of unique ages from hospital data to merge onto age_group map
            mat_df_ages = pd.DataFrame([mat_df.age_start.unique(),
                                       mat_df.age_end.unique()]).transpose()
            mat_df_ages.columns = ['age_group_years_start',
                                   'age_group_years_end']
            mat_df_ages = mat_df_ages.merge(age_group, how='left',
                                            on=['age_group_years_start',
                                                'age_group_years_end'])

            # this is the correct terminal age group (even though we use max
            # age = 99)
            mat_df_ages.loc[mat_df_ages['age_group_years_start'] == 95,
                            'age_group_id'] = 235

            # there are two age_group_ids for age_start=0 and age_start=1
            mat_df_ages = mat_df_ages[mat_df_ages.age_group_id != 161]

            # create age/year/location lists to use for pulling population
            age_list = list(mat_df_ages.age_group_id.unique())
            loc_list = list(mat_df.location_id.unique())
            year_list = list(mat_df.year_id.unique())

            # GET POP ########################################################
            # pull population and merge on age_start and age_end
            pop = get_population(age_group_id=age_list, location_id=loc_list,
                                 sex_id=[1, 2], year_id=year_list)

            # FORMAT POP ####################################################
            # attach age_start and age_end to population information
            pop = pop.merge(age_group, how='left', on='age_group_id')
            pop.drop(['process_version_map_id', 'age_group_id'], axis=1,
                     inplace=True)

            # rename pop columns to match hospital data columns
            pop.rename(columns={'age_group_years_start': 'age_start',
                                'age_group_years_end': 'age_end'}, inplace=True)

            # correct terminal age group to match our data
            pop.loc[pop['age_end'] == 124, 'age_end'] = 99

            # MERGE POP ######################################################
            demography = ['location_id', 'year_id', 'age_start',
                          'age_end', 'sex_id']

            pre_shape = mat_df.shape[0]  # store for before comparison
            # then merge population onto the hospital data

            # attach pop info to hosp
            mat_df = mat_df.merge(pop, how='left', on=demography)
            assert pre_shape == mat_df.shape[0], ("number of rows don't "
                "match after merge")


            # MAKE SAMPLE SIZE  ##############################################
            mat_df['sample_size'] = mat_df.population * mat_df.mean_0
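            # (mean_0 is presumably the uncorrected rate for the maternal
            # denominator bundle, so population * mean_0 approximates the
            # event count used as the sample size)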

            # DROP intermediate columns
            mat_df.drop(['population', 'year_id'], axis=1, inplace=True)


            mat_df.to_hdf("FILEPATH", key='df', mode="w")
            # backup copy to _archive
            mat_df.to_hdf("FILEPATH", key='df', mode='w')
        _write_maternal_denom(df)


    if adjust_maternal_denom:
        # local helper, renamed so it does not shadow the adjust_maternal_denom flag
        def _adjust_maternal_denom(df):

            # drop sample_size: the UTLA data already carried one, but a
            # consistent value is needed for every row, so drop it here and
            # let the merge below bring in the one built from the maternal
            # denominators
            df.drop('sample_size', axis=1, inplace=True)
            df = df.query("sex_id == 2 & age_start >=10 & age_end <=54")

            # read in maternal denoms, this is needed when our process is
            # parallelized
            denom = pd.read_hdf("FILEPATH", key="df")
            # denom.drop('bundle_id', axis=1, inplace=True)
            denom_cols = sorted(denom.filter(regex="[0-9]$").columns)
            for col in denom_cols:
                denom.rename(columns={col: col + "_denominator"}, inplace=True)
            pre = df.shape[0]
            df = df.merge(denom, how='left', on=['location_id', 'year_start',
                                                 'year_end', 'age_start',
                                                 'age_end', 'sex_id'])
            assert pre == df.shape[0], ("shape should not have changed "
                "during merge")
            df = df[(df['mean_0'] > 0) | (df['mean_0_denominator'].notnull())]

            assert df.mean_0_denominator.isnull().sum() == 0, ("shouldn't be "
                "any null values in this column")
            # regex to find the columns that start with l, m or u and end with
            #  a digit
            num_cols = sorted(df.filter(regex="^[lmu].*[0-9]$").columns)
            denom_cols =\
                sorted(df.columns[df.columns.str.endswith("denominator")])

            # divide each bundle's values by the bundle 1010 denominators to
            # get the adjusted rate; sorting keeps numerator and denominator
            # columns paired by prefix and level
            for num_col, denom_col in zip(num_cols, denom_cols):
                df[num_col] = df[num_col] / df[denom_col]
            # drop the denominator columns
            df.drop(denom_cols, axis=1, inplace=True)
            # drop rows where sample_size is 0; they would cause division by
            # zero downstream
            df = df[df['sample_size'] != 0]
            # RETURN ONLY THE MATERNAL DATA
            return df
        df = _adjust_maternal_denom(df)

    return df
Example No. 16
0
            '../save_custom_results.py "{}" "{}" "{}" "{}" "{}" "{}" "{}" "{}"'
            .format(process_vers,
                    'FILEPATH{}/single_year/draws'.format(process_vers),
                    'draws_{year_id}_366.h5', 1990, 2019, 'sdg',
                    'MMR {}'.format(process_vers), 4))
    subprocess.call(call, shell=True)


if __name__ == '__main__':

    print("Initiating script.")
    decomp_step, gbd_round_id, conn_def = sys.argv[1:4]
    gbd_round_id = int(gbd_round_id)

    print("Getting cause metdata")
    cause_df = get_cause_metadata(8, gbd_round_id=gbd_round_id)

    print("Getting causes")
    # only most-detailed and root cause
    causes = cause_df.loc[(cause_df.most_detailed == 1) |
                          (cause_df.level == 0)].cause_id.unique().tolist()
    codcorrect_vers = get_best_codcorrect_vers(decomp_step, gbd_round_id)

    print("setting process version")
    process_vers = Uploader(conn_def, codcorrect_vers, decomp_step,
                            int(gbd_round_id)).prep_upload()
    process_vers = 14774  # NOTE: hard-coded override of the version returned by prep_upload() above
    mmr_out_dir, arc_out_dir = set_out_dirs(process_vers)

    print("Launching save_birth_estimates")
    launch_save_birth_estimate_job(gbd_round_id, decomp_step, process_vers)
Example No. 17
0

# Data from XX, XX, and XX were ICD-10 coded,
# data from XX and XX were ICD-9 coded and
# data from XX contained both ICD-9 and ICD-10 coded deaths
df.groupby("location_name", as_index=False).agg(
    {"code_system_id": "unique"}).to_csv(
    "/home/j/temp/agesak/thesis/tables/icd_systems.csv", index=False)

# number of injuries related deaths - need total # deaths for each source
# df.groupby("location_name", as_index=False).agg({"deaths": "sum"})

# Deaths where an injuries-related ICD code was the
# underlying cause of death were mapped to one of XX
# most-detailed GBD injuries causes.
causes = get_cause_metadata(gbd_round_id=6, cause_set_id=3)
injuries = causes.loc[causes.acause.str.contains("inj")]
len(injuries.query("most_detailed==1"))
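# (this count supplies the "XX" most-detailed GBD injuries causes in the sentence above)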

# Sentence: Of the XX million deaths available in these records,
# XX% were injuries-related, with XX% of these injuries deaths being garbage coded.
# get just the injuries-related deaths.

# Part 1: Of the XX million deaths available in these records,
# could pick any int cause here
df = get_mcause_data(
    phase='format_map', sub_dirs="sepsis",
    source=["TWN_MOH", "MEX_INEGI", "BRA_SIM",
            "USA_NVSS", "COL_DANE", "ITA_ISTAT"],
    verbose=True, **{"force_rerun": True, "block_rerun": False})
# total deaths
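# A hedged sketch of the intended total (assumes the mapped data frame keeps a
# "deaths" column, as in the commented-out groupby above):
total_deaths = df["deaths"].sum()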
Example No. 18
0
def _launch_cod_splits(source_cause_id, target_cause_ids, target_meids,
                       prop_meas_id, gbd_round_id, decomp_step, output_dir,
                       project):
    """
    Split the given source_cause_id given target_meid proportions, saved
    to the target_cause_ids in output_dir.

    Arguments:
        source_cause_id (int): cause_id for the draws to be split
        target_cause_ids (intlist): list of cause ids that you want the new
            outputted subcauses to be identified by
        target_meids (intlist): list of proportion models' modelable_entity_ids
            that you want the source_cause_id to be split by, to make the
            target_cause_ids. target_cause_ids and target_meids must be
            specified in the same order.
        prop_meas_id (int): The measure_id that identifies the proportion
            in the target_meids to use for the split.
        gbd_round_id (int): the gbd_round_id for models being split.
        decomp_step (str): Specifies which decomposition step the returned
            estimates should be from. If using interpolate for GBD round 6 and
            above, must specify one of 'step1', 'step2', 'step3', 'step4',
            'step5', or 'iterative'.
        output_dir (str): directory where you want final results stored
        project (str): The SGE project to launch split_cod_model subjobs
            to using SplitCodSwarm.

    Returns:
        A list of tuples with each location_id paired with either 0, or an
                error message. This is then parsed in the central function
                draw_ops.split_cod_model into errors or success messages
    """

    # set up years, sex restrictions, most detailed locations, etc.

    if gbd_round_id >= 6:
        cause_set_id = COMPUTATION_CAUSE_SET_ID
    else:
        cause_set_id = REPORTING_CAUSE_SET_ID
    causes = get_cause_metadata(
        cause_set_id=cause_set_id,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step).query("cause_id==@source_cause_id")
    sex_ids = []
    if causes['male'].item() != 0:
        sex_ids.append(1)
    if causes['female'].item() != 0:
        sex_ids.append(2)
    if not sex_ids:
        raise ValueError(
            "Source_cause_id {} is restricted for both males and females, "
            "according to cause metadata".format(source_cause_id))

    most_detailed_locs = list(
        get_location_metadata(35,
                              gbd_round_id=gbd_round_id,
                              decomp_step=decomp_step).query(
                                  'most_detailed==1').location_id.unique())
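    # pair each proportion model with the cause it produces; relies on
    # target_meids and target_cause_ids being passed in the same order
    # (see docstring)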
    meid_cause_map = dict(zip(target_meids, target_cause_ids))

    # run interpolating/extrapolating
    intermediate_dir = os.path.join(output_dir,
                                    'intermediate_{}'.format(source_cause_id))
    if not os.path.exists(intermediate_dir):
        makedirs_safely(intermediate_dir)

    swarm = SplitCoDSwarm(source_id=source_cause_id,
                          proportion_ids=target_meids,
                          proportion_measure_id=prop_meas_id,
                          sex_ids=sex_ids,
                          gbd_round_id=gbd_round_id,
                          decomp_step=decomp_step,
                          intermediate_dir=intermediate_dir,
                          outdir=output_dir,
                          project=project)
    swarm.add_interpolate_tasks()
    exit_code = swarm.run()
    if exit_code != 0:
        raise RuntimeError(
            "Interpolating CoD years failed. Check logs in {}.".format(
                output_dir))

    # run splitting
    for cid in target_cause_ids:
        cid_dir = os.path.join(output_dir, str(cid))
        if not os.path.exists(cid_dir):
            makedirs_safely(cid_dir)
    file_list = glob.glob(os.path.join(intermediate_dir, '*.h5'))

    # read in draws for source cause
    source = _get_draws(source_cause_id, gbd_round_id, decomp_step, sex_ids)
    # create a temporary directory to store all the draws from the source cause
    tmpdir = tempfile.TemporaryDirectory(dir=output_dir)
    # save source cause draws to temporary directory
    source.to_hdf(
        os.path.join(tmpdir.name, 'source_cause_draws.h5'),
        key='draws',
        mode='w',
        format='table',
        data_columns=['location_id', 'year_id', 'sex_id', 'age_group_id'])
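    # format='table' with data_columns presumably lets each split worker query
    # only its own location's rows from the shared HDF file instead of loading
    # every draw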

    run_splits = functools.partial(_parallel_merge_split, meid_cause_map,
                                   file_list, output_dir, tmpdir)
    pool = Pool(30)
    res = pool.map(run_splits, most_detailed_locs)
    pool.close()
    pool.join()
    # clean up tempdir
    tmpdir.cleanup()
    return res
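
# A minimal usage sketch (not part of the original module): every ID, path,
# and project name below is a hypothetical placeholder chosen for
# illustration; only the call signature and the return shape follow the
# docstring above.
if __name__ == '__main__':
    results = _launch_cod_splits(
        source_cause_id=294,            # hypothetical parent cause to split
        target_cause_ids=[295, 296],    # hypothetical child causes to create
        target_meids=[1234, 5678],      # hypothetical proportion models, same order as the causes
        prop_meas_id=18,                # measure_id of the proportion estimates
        gbd_round_id=6,
        decomp_step='step4',
        output_dir='FILEPATH',
        project='proj_example')         # hypothetical SGE project
    # per the docstring, each element is (location_id, 0 or an error message)
    failed = [loc for loc, status in results if status != 0]
    if failed:
        print("Split failed for locations: {}".format(failed))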