Code example #1
def apply_envelope_only(df,
                        env_path,
                        return_only_inj=False,
                        apply_age_sex_restrictions=True,
                        want_to_drop_data=True,
                        create_hosp_denom=False,
                        apply_env_subroutine=False,
                        fix_norway_subnat=False):
    """
    Function that converts hospital data into rates.  Takes hospital data in
    count space, at the Baby Sequelae level, computes cause fractions, attaches
    the hospital utilization envelope, and multiplies the envelope and the
    cause fractions.  Data that represent fully covered populations are not
    made into cause fractions, nor is the envelope applied; instead, their
    counts are divided by population.  Returns a DataFrame in rate space at
    the Baby Sequelae level.

    Arguments:
        df (DataFrame): contains hospital inpatient primary diagnosis data
            that has been mapped to Baby Sequelae.
        env_path (str): path to the hospital utilization envelope file,
            either a CSV or an HDF file readable with key 'df'.
        return_only_inj (bool): switch that makes this function return only
            injuries data.  Should always be off; it's just there for
            testing/convenience.
        apply_age_sex_restrictions (bool): switch that, if True, will apply
            age and sex restrictions as determined by cause_set_id=9.
            Defaults to True.  Turning it off is useful if you want to look
            at all the data that was present in the source.
        want_to_drop_data (bool): if True, runs drop_data(verbose=False)
            from the hosp_prep module.
        create_hosp_denom (bool): if True, create_cause_fraction() will write
            the denominator for later use with imputed zeros.
        apply_env_subroutine (bool): if True, skips cause fraction creation
            and, at the end, renames the product columns to mean/lower/upper.
        fix_norway_subnat (bool): if True, copies Norway's national envelope
            rows onto its subnational location_ids before merging.
    """

    ###############################################
    # DROP DATA
    ###############################################
    if want_to_drop_data:
        df = hosp_prep.drop_data(df, verbose=False)

    if not df.age_group_id.notnull().all():
        warnings.warn("""There shouldn't be any null age_group_id values;
        there could be unsplit ages in the data, or maybe an
        age_start/age_end changed""")

    # Make a list of sources for which we don't want to use the envelope.
    # These values should match the 'source' column of the dataframe
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    if pd.Series(full_coverage_sources).isin(df.source.unique()).any():
        # make dataframe with sources
        full_coverage_df = df[df.source.isin(full_coverage_sources)].copy()

        # get denominator (aka population)
        full_coverage_df = gbd_hosp_prep.get_sample_size(full_coverage_df,
                                                         fix_group237=True)

        full_coverage_df.rename(columns={'population': 'sample_size'},
                                inplace=True)

        # make product
        full_coverage_df['product'] =\
            full_coverage_df.val / full_coverage_df.sample_size

    else:
        # make empty dataframe so that it exists and won't break anything
        full_coverage_df = pd.DataFrame(columns=list(df.columns) + ['product'])

    # drop these sources from dataframe for now
    df = df[~df.source.isin(full_coverage_sources)].copy()

    #########################################
    # CREATE CAUSE FRACTIONS
    #########################################
    if not apply_env_subroutine:
        df = hosp_prep.create_cause_fraction(
            df,
            create_hosp_denom=create_hosp_denom,
            store_diagnostics=True,
            tol=1e-6)

    if apply_age_sex_restrictions:
        df = hosp_prep.apply_restrictions(df, 'cause_fraction')
        if full_coverage_df.shape[0] > 0:
            full_coverage_df = hosp_prep.apply_restrictions(
                full_coverage_df, 'product')
        # drop the restricted values
        df = df[df['cause_fraction'].notnull()]
        full_coverage_df =\
            full_coverage_df[full_coverage_df['product'].notnull()]

    if env_path.endswith("csv"):
        env_df = pd.read_csv(env_path)
    else:
        env_df = pd.read_hdf(env_path, key='df')

    # split the envelope into Norway subnationals
    if fix_norway_subnat:
        # make a df of national and subnational locs
        nor_locs = [
            4910, 4911, 4912, 4913, 4914, 4915, 4916, 4917, 4918, 4919, 4920,
            4921, 4922, 4923, 4926, 4927, 4928, 53432
        ]
        # repeat Norway's national location_id once for each subnational
        nat_locs = np.repeat(90, len(nor_locs))
        nor_subnat_df = pd.DataFrame({
            'sub_location_id': nor_locs,
            'location_id': nat_locs
        })
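        # e.g. the national envelope rows for location_id 90 are copied once
        # for each of the 18 subnational location_ids listed above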

        # merge env on by national loc
        nor_subnat_df = nor_subnat_df.merge(env_df,
                                            how='left',
                                            on='location_id')
        # set subnational loc to proper loc id
        nor_subnat_df.drop('location_id', axis=1, inplace=True)
        nor_subnat_df.rename(columns={'sub_location_id': 'location_id'},
                             inplace=True)
        # append the new locations onto the env
        env_df = pd.concat([env_df, nor_subnat_df], ignore_index=True)
        assert env_df[env_df.location_id.isin(nor_locs)].shape[0] > 5000,\
            "Some of the norway subnational locations {} weren't merged on. {}".\
            format(nor_locs, env_df[env_df.location_id.isin(nor_locs)].location_id.unique())

    env_df.drop(['age_start', 'age_end'], axis=1, inplace=True)

    demography = [
        'location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id'
    ]

    ############################################
    # MERGE ENVELOPE onto data
    ############################################
    pre_shape = df.shape[0]
    df = df.merge(env_df, how='left', on=demography)
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # compute hospitalization rate
    # aka "apply the envelope"
    df['product'] = df['cause_fraction'] * df['mean']
    df['upper_product'] = df['cause_fraction'] * df['upper']
    df['lower_product'] = df['cause_fraction'] * df['lower']

    ############################################
    # RE-ATTACH data that has sample size
    ############################################

    df = pd.concat([df, full_coverage_df]).reset_index(drop=True)

    # if we just want injuries:
    if return_only_inj:
        inj_bundle = pd.read_csv(root + r"FILEPATH"
                                 r"parent_child_injuries_gbd2017.csv")
        inj_bundle = list(inj_bundle['Level1-Bundle ID'])
        df = df[df['bundle_id'].isin(inj_bundle)]

    if apply_env_subroutine:
        df.drop(['cause_fraction', 'mean', 'upper', 'lower'],
                axis=1,
                inplace=True)
        df.rename(columns={
            'product': 'mean',
            'lower_product': 'lower',
            'upper_product': 'upper'
        },
                  inplace=True)
    else:
        df.drop([
            'cause_fraction', 'mean', 'upper', 'lower', 'val', 'numerator',
            'denominator'
        ],
                axis=1,
                inplace=True)

    return df
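
To make the arithmetic in apply_envelope_only concrete, here is a minimal standalone sketch of its two paths, using made-up numbers; the toy frames and values are hypothetical, only the column names follow the function:

import pandas as pd

# envelope path: cause fractions times the utilization envelope
toy = pd.DataFrame({
    'val': [20.0, 80.0],      # admissions for two baby sequelae in one demographic group
    'mean': [0.10, 0.10],     # envelope: hospitalizations per person
})
toy['cause_fraction'] = toy['val'] / toy['val'].sum()   # 0.2 and 0.8
toy['product'] = toy['cause_fraction'] * toy['mean']    # 0.02 and 0.08

# full-coverage path (e.g. UK_HOSPITAL_STATISTICS): counts over population
covered = pd.DataFrame({'val': [50.0], 'sample_size': [10000.0]})
covered['product'] = covered['val'] / covered['sample_size']  # 0.005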
Code example #2
def run_age_sex_splitting(df, split_sources=[], round_id=0, verbose=False):
    """ takes in dataframe of data that you want to split, and a list of sources
    that need splitting, and then uses age_sex_splitting.py to split.
    as of 6/16/2017 this function assumes that the weights you want to use are already
    present. There is some skeletal stuff to deal with multiple rounds for weights
    but it's not currently fleshed out
    """

    # write data used for visualization
    if round_id == 1 or round_id == 2:
        # save pre_split data for viz, attach location name and bundle name and save
        pre_split = df[df.source.isin(split_sources)]
        pre_split = pre_split.merge(query(SQL QUERY),
                                    how='left', on='location_id')
        pre_split.drop(['year_end', 'age_group_unit', 'age_end', 'nid',
                        'facility_id', 'representative_id', 'diagnosis_id',
                        'outcome_id', 'metric_id'], axis=1, inplace=True)
        pre_split.to_csv("FILEPATH", index=False)

    # drop data and apply age/sex restrictions
    df = hosp_prep.drop_data(df, verbose=False)
    df = hosp_prep.apply_restrictions(df, col_to_restrict='val')

    # list of col names to compare for later use
    pre_cols = df.columns
    # we need to fix the terminal ages by source
    lst = []
    for asource in df.source.unique():
        subdf = df[df['source'] == asource].copy()
        old_max_age = subdf.age_end.max()  # store old bad terminal age
        # replace terminal age end so that age_sex_split will recognize this as a "bad" age_group
        subdf.loc[subdf.age_end == old_max_age, 'age_end'] = 124
        lst.append(subdf)
        del subdf
    df = pd.concat(lst)
    del lst

    # make age_end match what is in the shared DB
    df.loc[df['age_end'] > 1, 'age_end'] = df.loc[df['age_end'] > 1, 'age_end'] + 1
    # get age info
    age_group_info = query(SQL QUERY)
    # this age group is wrong.
    age_group_info = age_group_info[age_group_info['age_group_id'] != 161]
    # has same age_start and end as 22, results in duplicates
    age_group_info = age_group_info[age_group_info['age_group_id'] != 27]
    age_group_info.rename(columns={"age_group_years_start": "age_start",
                                   "age_group_years_end": "age_end"},
                                   inplace=True)

    # merge ages on
    pre = df.shape[0]
    df = df.merge(age_group_info, how="left",
                                      on=['age_start', 'age_end'])
    assert pre == df.shape[0], (
        "the number of rows changed after merging on age_group_id, "
        "probably because there are 2 ids that share the same "
        "age_start and age_end")
    # drop age start/end for simplicity
    df.drop(['age_start', 'age_end'], axis=1, inplace=True)

    df_list = []  # initialize empty list to concat split dfs to

    # columns that uniquely identify rows of data
    id_cols = ['source', 'nid', 'nonfatal_cause_name',
               'year_id', 'age_group_id', 'sex_id', 'location_id']
    perfect_ages = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                    18, 19, 20, 28, 30, 31, 32, 235]
    rows = df.shape[0]
    numer = 0
    # for source in split_sources:
    for source in df.source.unique():
        # make df of one source that needs to be split
        splitting_df = df[df.source == source].copy()
        numer += splitting_df.shape[0]
        print(source)
        print("working: {}% done".format(float(numer)/rows * 100))
        if verbose:
            print "{}'s age groups before:".format(source)
            print splitting_df[['age_group_id']].\
                drop_duplicates().sort_values(by='age_group_id')

        # create year_id
        splitting_df['year_id'] = splitting_df['year_start']
        splitting_df.drop(['year_start', 'year_end'], axis=1, inplace=True)
        if verbose:
            print "now splitting {}".format(source)
        # check if data needs to be split, if not append to dflist and move on
        if set(splitting_df.age_group_id).symmetric_difference(set(perfect_ages)) == set():
            df_list.append(splitting_df)
            continue
        # the function from CoD team that does the splitting
        split_df = age_sex_split.split_age_sex(splitting_df,
                                               id_cols,
                                               value_column='val',
                                               level_of_analysis='nonfatal_cause_name',
                                               fix_gbd2016_mistake=False)
        if verbose:
            print(
                "Orig value sum {} - New value sum {} = {} \n".format(
                    splitting_df.val.sum(),
                    split_df.val.sum(),
                    splitting_df.val.sum() - split_df.val.sum()
                )
            )
        pre_val = splitting_df.val.sum()
        post_val = split_df.val.sum()
        assert pre_val - post_val <= (pre_val * .0001),\
        "Too many cases were lost"
        if verbose:
            print "{}'s ages after:".format(source)
            print split_df[['age_group_id']].\
                drop_duplicates().sort_values(by='age_group_id')
        # append split data to our list of dataframes
        df_list.append(split_df)

    # bring the list of split DFs back together
    df = pd.concat(df_list).reset_index(drop=True)

    df = df.merge(age_group_info, how='left', on='age_group_id')

    # change age_end to match the rest of the data
    df.loc[df.age_end > 1, 'age_end'] =\
        df.loc[df.age_end > 1, 'age_end'] - 1

    df.loc[df.age_end == 124, 'age_end'] = 99

    # check that we have the right number of age groups
    assert df[['age_start', 'age_end', 'age_group_id']].drop_duplicates()\
        .shape[0] == 21

    # prepare to reattach split data to main df
    df['year_start'] = df['year_id']
    df['year_end'] = df['year_id']
    df.drop(['year_id', 'age_group_id'], axis=1, inplace=True)
    assert set(df.columns).symmetric_difference(set(pre_cols)) == set()

    # data used for visualization
    if round_id == 1 or round_id == 2:
        df = df.merge(query(SQL QUERY))

        # ? df_for_tableau = df.copy()
        df.drop(['year_end', 'age_group_unit', 'age_end', 'nid',
                       'facility_id', 'representative_id', 'diagnosis_id',
                       'outcome_id', 'metric_id'], axis=1).to_csv(
                       "FILEPATH", index=False)

    return df
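
Two steps in run_age_sex_splitting are easy to misread: the temporary +1 shift of age_end before merging on age_group_id (the shared DB appears to store exclusive age ends; the shift is undone after splitting), and the symmetric-difference check against perfect_ages that decides whether a source needs splitting at all. A small standalone sketch with hypothetical ages:

import pandas as pd

# hospital-style inclusive age bands, e.g. 15-19 and 20-24
ages = pd.DataFrame({'age_start': [15, 20], 'age_end': [19, 24]})

# shift to the shared-DB convention before merging on age_group_id ...
ages.loc[ages['age_end'] > 1, 'age_end'] += 1     # 15-20, 20-25
# ... (merge age_group_id here) ...
# and shift back afterwards so the data matches the rest of the pipeline
ages.loc[ages['age_end'] > 1, 'age_end'] -= 1     # 15-19, 20-24

# a source is skipped (already split) only when its age_group_ids exactly
# match the GBD set; any difference in either direction triggers a split
perfect_ages = {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                18, 19, 20, 28, 30, 31, 32, 235}
source_ages = {28, 5, 6}                          # hypothetical source
needs_split = source_ages.symmetric_difference(perfect_ages) != set()  # True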
Code example #3
def aggregate_to_bundle(df, write_maternal_denom=False,
                        adjust_maternal_denom=False):
    """
    Takes a dataframe aggregated to baby sequela level and returns
    the df aggregated to 5 year bands at the bundle level
    """
    ##############################
    # PULL POPULATION
    #
    # This is preparation for aggregating to 5 year bands
    ##############################

    # get rid of all corrected values and correction factors. we want to do
    # this because we are going to aggregate to 5 year bands and the bundle_id
    # level of analysis. We intend to make cells where the correction factor
    # is over 50 NULL. it doesn't make sense to do that before aggregating,
    # because any rows with nulls in them would be lost entirely.  It doesn't
    # make sense to apply corrections and NOT delete cells that break the
    # over-50 rule before aggregation either, because we would lose the
    # correction factors during the groupby. Also, it would not be robust to
    # aggregate 16 columns. Therefore, it's best to apply corrections after
    # aggregating.

    # Technically, the groupby would get rid of these for us. Honestly, this
    # just makes things easier to work with up to that point.
    df.drop(['mean_1', 'mean_2', 'mean_3', 'mean_inj',
             'upper_1', 'upper_2', 'upper_3', 'upper_inj',
             'lower_1', 'lower_2', 'lower_3', 'lower_inj',
             'correction_factor_1', 'correction_factor_2',
             'correction_factor_3', 'correction_factor_inj'],
            axis=1, inplace=True)

    if write_maternal_denom:
        df = df[df['bundle_id'] == 1010]
    # drop the non maternal data pronto
    if adjust_maternal_denom:
        # GET MATERNAL CAUSES
        causes = get_cause_metadata(cause_set_id=9)
        # cause_id 366 always appears at the third level of path_to_top_parent
        condition = causes.path_to_top_parent.str.contains("366")
        # subset to just the causes that meet the condition
        maternal_causes = causes[condition]
        # make list of maternal causes
        maternal_list = list(maternal_causes['cause_id'].unique())
        # get bundle to cause map
        bundle_cause = query("QUERY")
        # merge cause_id onto data
        df = df.merge(bundle_cause, how='left', on='bundle_id')
        # keep only maternal causes
        df = df[df['cause_id'].isin(maternal_list)]
        # drop cause_id
        df.drop('cause_id', axis=1, inplace=True)
        # drop the denominator bundle
        df = df[df['bundle_id'] != 1010]
    # pull age_group to age_start/age_end map

    ##############################
    # GET POPULATION
    ##############################
    age_group = query("QUERY")
    # correct age groups
    age_group.loc[age_group['age_group_years_end'] > 1,
                  'age_group_years_end'] = \
        age_group.loc[age_group['age_group_years_end'] > 1,
                      'age_group_years_end'] - 1

    # df of unique ages from hospital data to merge onto age_group map
    df_ages = pd.DataFrame([df.age_start.unique(), df.age_end.unique()]).transpose()
    df_ages.columns = ['age_group_years_start', 'age_group_years_end']
    df_ages = df_ages.merge(age_group, how='left', on=['age_group_years_start',
                                                       'age_group_years_end'])

    # this is the correct terminal age group (even though we use max age = 99)
    df_ages.loc[df_ages['age_group_years_start'] == 95, 'age_group_id'] = 235

    # there are two age_group_ids for age_start=0 and age_start=1
    df_ages = df_ages[df_ages.age_group_id != 161]

    # create age/year/location lists to use for pulling population
    age_list = list(df_ages.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())

    # pull population and merge on age_start and age_end
    pop = get_population(age_group_id=age_list, location_id=loc_list,
                         sex_id=[1, 2], year_id=year_list)

    # attach age_start and age_end to population information
    pop = pop.merge(age_group, how='left', on='age_group_id')
    pop.drop(['process_version_map_id', 'age_group_id'], axis=1, inplace=True)

    # rename pop columns to match hospital data columns
    pop.rename(columns={'age_group_years_start': 'age_start',
                        'age_group_years_end': 'age_end',
                        'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']

    # correct terminal age group to match our data
    pop.loc[pop['age_end'] == 124, 'age_end'] = 99

    demography = ['location_id', 'year_start', 'year_end', 'age_start',
                  'age_end', 'sex_id']

    ##############################
    # MAKE DATA SQUARE
    ##############################

    # create a series of sorted, non-zero mean values to make sure
    # the func doesn't alter anything
    check_mean = df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().\
        reset_index(drop=True)

    print("Starting number of rows: {}".format(df.shape[0]))
    # square the dataset
    df = hosp_prep.make_zeroes(df, level_of_analysis='bundle_id',
            cols_to_square=['mean_0', 'upper_0', 'lower_0'],
            icd_len=5)
    # assert the sorted means are identical
    assert (check_mean == df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().\
        reset_index(drop=True)).all()


    # delete rows where restrictions should be applied
    # create df where baby sequela are missing
    missing_nfc = df[df.nonfatal_cause_name.isnull()].copy()
    df = df[df.nonfatal_cause_name.notnull()]
    for col in ['mean_0', 'upper_0', 'lower_0']:
        df = hosp_prep.apply_restrictions(df, col)
        missing_nfc = hosp_prep.apply_bundle_restrictions(missing_nfc, col)
        df = df[df[col].notnull()]

    df = pd.concat([df, missing_nfc])
    print("Square number of rows: {}".format(df.shape[0]))
    # don't create the parent injury dupes until after the data is totally
    # square so that the denominators will match
    # df = get_parent_injuries(df)

    # check_parent_injuries(df, 'mean_0')

    pre_shape = df.shape[0]  # store for before comparison
    # then merge population onto the hospital data

    df = df.merge(pop, how='left', on=demography)  # attach pop info to hosp
    assert pre_shape == df.shape[0], "number of rows don't match after merge"

    ##############################
    # RATE SPACE TO COUNTS
    ##############################

    # go from rate space to additive counts
    # because of careful merging we just need to multiply across columns
    df['hosp_count'] = df['mean_0'] * df['population']
    df['upper_hosp_count'] = df['upper_0'] * df['population']
    df['lower_hosp_count'] = df['lower_0'] * df['population']

    # merge on "denominator" from file that was made back in
    # create_cause_fractions. This adds the denominator column, which is the
    # number of admissions in a demographic group
    df = df.merge(pd.read_csv("FILEPATH"),
                  how='left', on=["age_start", "age_end", "sex_id",
                                  "year_start", "year_end", "location_id"])


    # 5 year bins
    df = hosp_prep.year_binner(df)

    # add 5 year NIDs onto data
    df = hosp_prep.five_year_nids(df)


    ##################################################################
    ##################################################################
    #                         THE COLLAPSE                           #
    ##################################################################
    ##################################################################
    # The final collapse to 5 year bands, 5 year nids and bundle ID
    # this is doing a few things at once.  One is that we need to aggregate
    # to 5 year bands.  Another is aggregating to the Bundle_id level of
    # analysis.  Up to this point we were at the nonfatal_cause_name AKA
    # baby sequelae level of analysis.

    # WE NEED TWO COLLAPSES. One for data that doesn't have a sample_size value,
    # and another for the data that does.  This is because:
    # https://goo.gl/e66OZ4 and https://goo.gl/Fb78xi

    # make df of data where there is full coverage (i.e., the UK)
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    # make condition mask that indicates rows that have full coverage
    has_full_coverage = df.source.isin(full_coverage_sources)

    covered_df = df[has_full_coverage]

    # drop "denominator" from covered_df
    covered_df = covered_df.drop("denominator", axis=1)

    # drop this data from the main dataframe
    df = df[~has_full_coverage]

    assert (df.loc[df.denominator.isnull(), 'mean_0'] == 0).all(), ("mean_0"
        " should be 0")
    assert (df.loc[df.denominator.isnull(), 'lower_0'] == 0).all(), ("lower_0"
        " should be 0")
    assert (df.loc[df.denominator.isnull(), 'upper_0'] == 0).all(), ("upper_0"
        " should be 0")

    df = df[df.denominator.notnull()]

    # df already has sample size
    df.drop("sample_size", axis=1, inplace=True)
    # check if cases are lost in the groupby

    # rename "denominator" to "sample_size" in df (not covered_df)
    df.rename(columns={"denominator": "sample_size"}, inplace=True)

    pre_cases = df['hosp_count'].sum()
    # can use the same group columns for both dataframes
    groups = ['location_id', 'year_start', 'year_end',
              'age_start', 'age_end', 'sex_id', 'nid',
              'representative_id',
              'bundle_id']

    # sample_size has some null values from being made square, but it was just
    # population, so we're using pop instead. so remember,
    # population == sample_size for covered_df
    covered_df = covered_df.groupby(groups)\
        .agg({'hosp_count':'sum',
              'population': 'sum'}).reset_index()


    # add "sample_size" to the aggregate function
    df = df.groupby(groups).agg({'hosp_count': 'sum',
                                 'upper_hosp_count': 'sum',
                                 'lower_hosp_count': 'sum',
                                 'population': 'sum',
                                 'sample_size': 'sum'}).reset_index()
    assert round(pre_cases, 0) == round(df['hosp_count'].sum(), 0),\
        ("some cases were lost. "
         "From {} to {}".format(pre_cases, df['hosp_count'].sum()))

    # set sample size to np.nan when mean/upper/lower are greater than 0
    df.loc[(df['hosp_count'] > 0) &
           (df['lower_hosp_count'] > 0) &
           (df['upper_hosp_count'] > 0), 'sample_size'] = np.nan
    ##############################
    # COUNTS TO RATE SPACE
    ##############################
    # REMAKE mean and uncertainty
    # for the main df:

    df['mean_0'] = df['hosp_count'] / df['population']
    df['lower_0'] = df['lower_hosp_count'] / df['population']
    df['upper_0'] = df['upper_hosp_count'] / df['population']
    df.drop(['hosp_count', 'lower_hosp_count', 'upper_hosp_count'], axis=1,
            inplace=True)

    # add parent injuries
    # NOTE get_parent_injuries is run before covered_df is concatenated with
    # df. It happens not to make a difference at the moment, because there are
    # no injuries in covered_df, but there could be in the future.
    df = get_parent_injuries(df)
    for col in ['mean_0', 'lower_0', 'upper_0']:
        hosp_prep.check_parent_injuries(df, col_to_sum=col)

    # this drops the population that was merged on for converting to counts.
    df.drop('population', axis=1, inplace=True)  # don't need pop anymore

    # for the covered df:
    covered_df['mean_0'] = covered_df['hosp_count'] / covered_df['population']

    covered_df.rename(columns={"population": "sample_size"}, inplace=True)

    # drop columns
    covered_df.drop(['hosp_count'], axis=1, inplace=True)

    ###############################
    # RE-ATTACH
    ###############################
    # bring covered_df and df together.
    # where we have full coverage, lower and upper should be null
    # mean_0 will never be null

    df = pd.concat([df, covered_df], ignore_index=True)

    # rebuild the full-coverage mask so it aligns with the concatenated frame;
    # the old mask was indexed against the pre-collapse df and no longer lines
    # up. pd.concat(ignore_index=True) appends the covered rows at the end.
    has_full_coverage = df.index >= (df.shape[0] - covered_df.shape[0])

    # assert what we just said will be true in the comments above:
    assert df.loc[has_full_coverage, 'lower_0'].isnull().all(), ("where we have"
        " full coverage, lower_0 should be null")
    assert df.loc[has_full_coverage, 'upper_0'].isnull().all(), ("where we have"
        " full coverage, upper_0 should be null")
    assert df.mean_0.notnull().all(), ("mean_0 should never be null")

    # NOTE, remember, sample size will still have null values, and that's okay
    # we need to keep sample size from here on out.

    if "population" in df.columns:
        print "population was still in columns"
        df.drop("population", axis=1, inplace=True)


    ########################################

    # map measure onto data, just need this for ELMO reqs.
    clean_maps = pd.read_csv("FILEPATH")
    clean_maps = clean_maps[['bundle_id', 'bid_measure']]
    clean_maps.drop_duplicates(inplace=True)
    clean_maps.rename(columns={'bid_measure': 'measure'}, inplace=True)
    # remove null bundles from map
    clean_maps = clean_maps[clean_maps.bundle_id.notnull()]

    pre_shape = df.shape[0]  # store for comparison after merge

    # merge measure onto hosp data using bundle_id
    df = df.merge(clean_maps, how='left', on='bundle_id')
    assert pre_shape == df.shape[0], "number of rows don't match after merge."

    # get injuries bids so we can check for missing measures
    pc_injuries = pd.read_csv("FILEPATH")
    inj_bids = pc_injuries['Level1-Bundle ID'].unique()

    # some injuries bids didn't get measures!
    assert set(df[df.measure.isnull()].bundle_id).issubset(set(inj_bids)),\
        ("We expect that all null measures belong to injuries, but that is"
         "not the case. Something went wrong!")

    # fix any injuries that are missing measure, all inj are inc:
    df.loc[(df.measure.isnull())&(df.bundle_id.isin(inj_bids)), 'measure'] = 'inc'

    assert df.measure.isnull().sum() == 0, ("There are null values and we "
        "expect none")

    # read in correction factors (again)
    correction_factors = pd.read_csv("FILEPATH")

    correction_factors.drop("outpatient", axis=1, inplace=True)

    # rename columns to match df
    correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)

    # merge corr factors onto data
    df = df.merge(correction_factors, how='left', on=['age_start', 'sex_id',
                  'bundle_id'])
    assert pre_shape == df.shape[0], ("You unexpectedly added rows while "
        "merging on the correction factors. Don't do that!")

    # if a Bundle ID doesn't have a corr factor from marketscan use 1
    # df.update(df[['a','b','c']].fillna(0))  # test code
    # http://stackoverflow.com/questions/36556256/how-do-i-fill-na-values-in-multiple-columns-in-pandas
    df.update(df[['indv_cf', 'incidence', 'prevalence', 'injury_cf']].fillna(1))

    # rename correction factors to match what we told people they would be
    df.rename(columns={'indv_cf': 'correction_factor_1',
                       'incidence': 'correction_factor_2',
                       'prevalence': 'correction_factor_3',
                       'injury_cf': 'correction_factor_inj'},
                       inplace=True)

    # NOTE we apply every correction factor to all data, even if it is not
    # relevant.  E.g., not all data is injuries, so not all data needs
    # correction_factor_inj.  It is simply easier to apply all of them, and
    # then drop the irrelevant columns while writing to modelers' folders.

    # make mean_1, lower_1, upper_1
    df['mean_1'] = df.correction_factor_1 * df.mean_0
    df['lower_1'] = df.correction_factor_1 * df.lower_0
    df['upper_1'] = df.correction_factor_1 * df.upper_0

    # make mean_2, lower_2, upper_2
    df['mean_2'] = df.correction_factor_2 * df.mean_0
    df['lower_2'] = df.correction_factor_2 * df.lower_0
    df['upper_2'] = df.correction_factor_2 * df.upper_0

    # make mean_3, lower_3, upper_3
    df['mean_3'] = df.correction_factor_3 * df.mean_0
    df['lower_3'] = df.correction_factor_3 * df.lower_0
    df['upper_3'] = df.correction_factor_3 * df.upper_0

    # make injury mean, lower, upper
    df['mean_inj'] = df.correction_factor_inj * df.mean_0
    df['lower_inj'] = df.correction_factor_inj * df.lower_0
    df['upper_inj'] = df.correction_factor_inj * df.upper_0

    # sanity check: full-coverage rows should keep null bounds, and the means
    # should never be null
    levels=["1", "2", "3", "inj"]
    for level in levels:
        assert df.loc[has_full_coverage, 'lower_{}'.format(level)].isnull().all(), ("broke on level {}".format(level))
        assert df.loc[has_full_coverage, 'upper_{}'.format(level)].isnull().all(), ("broke on level {}".format(level))
        assert df["mean_{}".format(level)].notnull().all(), ("broke on level {}".format(level))

    def factor_applier(df, levels=["1", "2", "3", "inj"]):
        for level in levels:
            # apply this level's correction factor to mean/lower/upper
            cf = df["correction_factor_" + level]
            df['test_mean_' + level] = cf * df['mean_0']
            df['test_lower_' + level] = cf * df['lower_0']
            df['test_upper_' + level] = cf * df['upper_0']
        return df
    df = factor_applier(df)

    levels=["1", "2", "3", "inj"]
    for level in levels:
        assert (df.loc[df["mean_" + level].notnull(), "mean_" + level] == df.loc[df["test_mean_" + level].notnull(), "test_mean_" + level]).all(), ("different on level {}".format(level))
        assert (df.loc[df["upper_" + level].notnull(), "upper_" + level] == df.loc[df["test_upper_" + level].notnull(), "test_upper_" + level]).all(), ("different on level {}".format(level))
        assert (df.loc[df["lower_" + level].notnull(), "lower_" + level] == df.loc[df["test_lower_" + level].notnull(), "test_lower_" + level]).all(), ("different on level {}".format(level))
    # drop test cols for now until we run this for awhile without
    # tripping the assert
    test_cols = df.columns[df.columns.str.startswith("test_")]
    df.drop(test_cols, axis=1, inplace=True)

    # RULE = if correction factor is greater than 50, make the data null
    # EXCEPTIONS are made for these bundles, which are capped at 100:
        # Preterm: 80, 81, 82, 500
        # Encephalopathy: 338
        # Sepsis: 92
        # Hemolytic: 458
        # PAD/PUD: 345
        # Cirrhosis: 131

    # list of bundles which can have correction factors above 50
    cf_exceptions = [345, 80, 81, 82, 500, 338, 92, 458, 131]
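    # Worked example (hypothetical numbers): a non-excepted bundle with
    # correction_factor_3 = 60 gets mean_3/lower_3/upper_3 set to NULL,
    # while bundle 345 with the same factor keeps its values, because the
    # excepted bundles are only nulled once the factor exceeds 100.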

    # NOTE when checking the number of nulls, consider the nulls that are caused
    # by the sample_size split

    def mean_capper(df, exceptions, levels=["1", "2", "3"]):
        exception_condition = df.bundle_id.isin(exceptions)
        for level in levels:
            cols = ['mean_' + level, 'lower_' + level, 'upper_' + level]
            df.loc[(~exception_condition) &
                   (df['correction_factor_' + level] > 50), cols] = np.nan
            df.loc[(exception_condition) &
                   (df['correction_factor_' + level] > 100), cols] = np.nan

        return df
    # create a df to test the function against the manual capping below;
    # use a copy so the function doesn't mutate df in place
    df_test_capper = mean_capper(df.copy(), cf_exceptions)


    # boolean mask that says whether a bundle is in the exceptions list
    exception_condition = df.bundle_id.isin(cf_exceptions)

    # CF1 > 50: doesn't do anything, don't really need to apply here
    df.loc[(~exception_condition) & (df.correction_factor_1 > 50),
           ['mean_1', 'lower_1', 'upper_1']] = np.nan
    # CF2 > 50: did do something; affects 6% of prev rows, 0.01% of inc rows
    df.loc[(~exception_condition) & (df.correction_factor_2 > 50),
           ['mean_2', 'lower_2', 'upper_2']] = np.nan
    # CF3 > 50: does a lot; affects 57% of prevalence rows
    df.loc[(~exception_condition) & (df.correction_factor_3 > 50),
           ['mean_3', 'lower_3', 'upper_3']] = np.nan

    df.loc[(exception_condition) & (df.correction_factor_1 > 100),
           ['mean_1', 'lower_1', 'upper_1']] = np.nan
    df.loc[(exception_condition) & (df.correction_factor_2 > 100),
           ['mean_2', 'lower_2', 'upper_2']] = np.nan
    df.loc[(exception_condition) & (df.correction_factor_3 > 100),
           ['mean_3', 'lower_3', 'upper_3']] = np.nan


    #####################################################
    # CHECK that lower < mean < upper
    #####################################################
    # loop over every level of correction
    # can't compare null values, null comparisons always eval to False
    for i in ["0", "1", "2", "3", 'inj']:
        # lower < mean
        assert (df.loc[df['lower_'+i].notnull(), 'lower_'+i] <=
                df.loc[df["lower_"+i].notnull(), 'mean_'+i]).all(),\
            "lower_{} should be less than mean_{}".format(i, i)
        # mean < upper
        assert (df.loc[df["upper_"+i].notnull(), 'mean_'+i] <=
                df.loc[df["upper_"+i].notnull(), 'upper_'+i]).all(),\
            "mean_{} should be less than upper_{}".format(i, i)

    # compare the results between test df and proper df
    for uncertainty in ["mean", "upper", "lower"]:
        for level in ["1", "2", "3"]:
            # compare the sum of nulls rows between dfs
            assert df[uncertainty + "_" + level].isnull().sum() ==\
                df_test_capper[uncertainty + "_" + level].isnull().sum(),\
                "The new capping function is producing different results"

    # write the maternal denominator data, this is for the future when we work
    # in parallel
    if write_maternal_denom:
        def write_maternal_denom(df):
            mat_df = df[df.bundle_id==1010].copy()
            mat_df = mat_df.query("sex_id == 2 & age_start >=10 & age_end <=54")

            if mat_df.shape[0] == 0:
                return
            # NOTE sample size is dropped here, and we make a new one in the
            # following code
            mat_df = mat_df[['location_id', 'year_start', 'year_end',
                             'age_start', 'age_end', 'sex_id',
                             'mean_0', 'mean_1', 'mean_2', 'mean_3']].copy()

            bounds = ['upper_0', 'upper_1', 'upper_2', 'upper_3',
                        'lower_0', 'lower_1', 'lower_2', 'lower_3']
            for uncertainty in bounds:
                mat_df[uncertainty] = np.nan

            # PREP FOR POP ####################################################
            # we don't have years that we can merge on pop to yet, because
            # we aggregated to year bands; another problem with this method
            mat_df['year_id'] = mat_df.year_start + 2  # makes 2000,2005,2010

            # bunch of age things so we can use age_group_id to get pop
            age_group = query("QUERY")
            # correct age groups
            age_group.loc[age_group['age_group_years_end'] > 1,
                          'age_group_years_end'] =\
                age_group.loc[age_group['age_group_years_end'] > 1,
                              'age_group_years_end'] - 1

            # df of unique ages from hospital data to merge onto age_group map
            mat_df_ages = pd.DataFrame([mat_df.age_start.unique(),
                                       mat_df.age_end.unique()]).transpose()
            mat_df_ages.columns = ['age_group_years_start',
                                   'age_group_years_end']
            mat_df_ages = mat_df_ages.merge(age_group, how='left',
                                            on=['age_group_years_start',
                                                'age_group_years_end'])

            # this is the correct terminal age group (even though we use max
            # age = 99)
            mat_df_ages.loc[mat_df_ages['age_group_years_start'] == 95,
                            'age_group_id'] = 235

            # there are two age_group_ids for age_start=0 and age_start=1
            mat_df_ages = mat_df_ages[mat_df_ages.age_group_id != 161]

            # create age/year/location lists to use for pulling population
            age_list = list(mat_df_ages.age_group_id.unique())
            loc_list = list(mat_df.location_id.unique())
            year_list = list(mat_df.year_id.unique())

            # GET POP ########################################################
            # pull population and merge on age_start and age_end
            pop = get_population(age_group_id=age_list, location_id=loc_list,
                                 sex_id=[1, 2], year_id=year_list)

            # FORMAT POP ####################################################
            # attach age_start and age_end to population information
            pop = pop.merge(age_group, how='left', on='age_group_id')
            pop.drop(['process_version_map_id', 'age_group_id'], axis=1,
                     inplace=True)

            # rename pop columns to match hospital data columns
            pop.rename(columns={'age_group_years_start': 'age_start',
                                'age_group_years_end': 'age_end'}, inplace=True)

            # correct terminal age group to match our data
            pop.loc[pop['age_end'] == 124, 'age_end'] = 99

            # MERGE POP ######################################################
            demography = ['location_id', 'year_id', 'age_start',
                          'age_end', 'sex_id']

            pre_shape = mat_df.shape[0]  # store for before comparison
            # then merge population onto the hospital data

            # attach pop info to hosp
            mat_df = mat_df.merge(pop, how='left', on=demography)
            assert pre_shape == mat_df.shape[0], ("number of rows don't "
                "match after merge")


            # MAKE SAMPLE SIZE  ##############################################
            mat_df['sample_size'] = mat_df.population * mat_df.mean_0

            # DROP intermediate columns
            mat_df.drop(['population', 'year_id'], axis=1, inplace=True)


            mat_df.to_hdf("FILEPATH", key='df', mode="w")
            # backup copy to _archive
            mat_df.to_hdf("FILEPATH", key='df', mode='w')
        write_maternal_denom(df)


    if adjust_maternal_denom:
        def adjust_maternal_denom(df):

            # drop sample_size, UTLAs already had it, but we need it for
            # everything, so we have to drop it.
            df.drop('sample_size', axis=1, inplace=True)
            df = df.query("sex_id == 2 & age_start >=10 & age_end <=54")

            # read in maternal denoms, this is needed when our process is
            # parallelized
            denom = pd.read_hdf("FILEPATH", key="df")
            # denom.drop('bundle_id', axis=1, inplace=True)
            denom_cols = sorted(denom.filter(regex="[0-9]$").columns)
            for col in denom_cols:
                denom.rename(columns={col: col + "_denominator"}, inplace=True)
            pre = df.shape[0]
            df = df.merge(denom, how='left', on=['location_id', 'year_start',
                                                 'year_end', 'age_start',
                                                 'age_end', 'sex_id'])
            assert pre == df.shape[0], ("shape should not have changed "
                "during merge")
            #print(df[df.mean_0_denominator.isnull()].shape)
            #print(df[df.mean_0_denominator.isnull()])
            df = df[(df['mean_0'] > 0) | (df['mean_0_denominator'].notnull())]

            assert df.mean_0_denominator.isnull().sum() == 0, ("shouldn't be "
                "any null values in this column")
            # regex to find the columns that start with l, m or u and end with
            #  a digit
            num_cols = sorted(df.filter(regex="^[lmu].*[0-9]$").columns)
            denom_cols =\
                sorted(df.columns[df.columns.str.endswith("denominator")])

            # divide each bundle value by bundle 1010 to get the adjusted rate
            for i in np.arange(0, 12, 1):
                df[num_cols[i]] = df[num_cols[i]] / df[denom_cols[i]]
            # drop the denominator columns
            df.drop(denom_cols, axis=1, inplace=True)
            # can't divide by zero
            df = df[df['sample_size'] != 0]
            # RETURN ONLY THE MATERNAL DATA
            return(df)
        df = adjust_maternal_denom(df)

    return df
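
The heart of aggregate_to_bundle is the rate-to-count-to-rate round trip described in the RATE SPACE TO COUNTS and THE COLLAPSE comments. A minimal standalone sketch, with made-up values, collapsing two single-year rows into one band:

import pandas as pd

# two single-year rows for the same location/sex/bundle that fall into the
# same 5 year band (values are hypothetical)
toy = pd.DataFrame({
    'bundle_id': [10, 10],
    'year_start': [2000, 2001],
    'mean_0': [0.010, 0.020],          # rates
    'population': [1000.0, 1000.0],
})

# rate space to counts: because of careful merging this is just a product
toy['hosp_count'] = toy['mean_0'] * toy['population']   # 10 and 20 admissions

# the collapse: sum counts and population within the band / bundle
toy['year_start'] = 2000               # stand-in for hosp_prep.year_binner
collapsed = toy.groupby(['bundle_id', 'year_start']).agg(
    {'hosp_count': 'sum', 'population': 'sum'}).reset_index()

# counts back to rate space: a population-weighted mean rate
collapsed['mean_0'] = collapsed['hosp_count'] / collapsed['population']
# (10 + 20) / (1000 + 1000) = 0.015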
Code example #4
def prepare_envelope(df,
                     fix_maternal=False,
                     return_only_inj=False,
                     apply_age_sex_restrictions=True,
                     want_to_drop_data=True):
    """
    Function that accepts a hospital dataframe as a parameter, adds the
    envelope to it, and adds meids and bundle_ids to it, duplicating
    baby sequelae in the process (as expected).

    Parameters:
    df: DataFrame
        contains hospital inpatient primary diagnosis data that has been
        mapped to Baby Sequelae
    fix_maternal: Boolean
        switch that runs fix_maternal_denominators, which transforms
        hospital data from 'population' to 'live births' using Age Specific
        Fertility Rates.  Should always be on; it's just an option for
        testing purposes to save time.
    return_only_inj: Boolean
        switch that makes this function return only injuries data.
        Should always be off; it's just there for testing.
    apply_age_sex_restrictions: Boolean
        switch that, if True, will apply age and sex restrictions as
        determined by cause_set_id=9.  Defaults to True.  Turning it off is
        useful if you want to look at all the data that was present in the
        source.
    want_to_drop_data: Boolean
        if True, runs drop_data(verbose=False) from the hosp_prep module.
    """

    ###############################################
    # DROP DATA
    ###############################################

    if want_to_drop_data:
        df = hosp_prep.drop_data(df, verbose=False)

    ###############################################
    # MAP AGE_GROUP_ID ONTO DATA
    ###############################################
    # These are our weird hospital age_group_ids: we use 235 for 95-99, and we
    # don't use age_group_ids 2, 3, 4; we use 28 in their place
    # returns a df that has age_group_id, age_start, and age_end
    ages = hosp_prep.get_hospital_age_groups()
    df = df.merge(ages, how='left', on=['age_start', 'age_end'])

    # comment out to test the new sources in the process
    if not df.age_group_id.notnull().all():
        warnings.warn("""There shouldn't be any null age_group_id values;
        there could be unsplit ages in the data, or maybe an
        age_start/age_end changed""")

    # maps = pd.read_csv("FILEPATH")
    # maps = maps[['nonfatal_cause_name', 'cause_id', 'level']].copy()
    # maps = maps[maps.cause_id.notnull()]
    # maps = maps[maps.level == 1]
    # maps.drop("level", axis=1, inplace=True)# no need to merge level onto data
    # maps.drop_duplicates(inplace=True)
    #
    # pre = df.shape[0]
    # df = df.merge(maps, how='left', on='nonfatal_cause_name')
    # assert df.shape[0] == pre, "df shape changed during merge"

    #########################################################
    # Get denominator for sources where we don't want to use envelope
    #########################################################

    # Make list of sources that we don't want to use envelope
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    if pd.Series(full_coverage_sources).isin(df.source.unique()).any():
        # make dataframe with sources
        full_coverage_df = df[df.source.isin(full_coverage_sources)].copy()

        # get denominator (aka population)
        full_coverage_df = gbd_hosp_prep.get_sample_size(full_coverage_df)

        full_coverage_df.rename(columns={'population': 'sample_size'},
                                inplace=True)
        # NOTE now there's going to be a column "sample_size" that
        # is null for every source except the fully covered ones

        # make product
        full_coverage_df[
            'product'] = full_coverage_df.val / full_coverage_df.sample_size
    else:
        # make empty dataframe so that it exists and won't break anything
        full_coverage_df = pd.DataFrame(columns=list(df.columns) + ['product'])

    # drop these sources from dataframe for now
    df = df[~df.source.isin(full_coverage_sources)].copy()

    #########################################
    # CREATE CAUSE FRACTIONS
    #########################################
    df = hosp_prep.create_cause_fraction(df)

    ###############################################
    # APPLY CAUSE RESTRICTIONS
    ###############################################

    # NOTE it is okay to apply restrictions after cause fractions, because
    # restrictions were also applied before the cause fractions were made
    # (right before splitting)
    if apply_age_sex_restrictions:
        df = hosp_prep.apply_restrictions(df, 'cause_fraction')
        full_coverage_df = hosp_prep.apply_restrictions(
            full_coverage_df, 'product')
        # drop the restricted values
        df = df[df['cause_fraction'].notnull()]
        full_coverage_df = full_coverage_df[
            full_coverage_df['product'].notnull()]

    #########################################
    # APPLY ENVELOPE
    #########################################
    # read envelope
    env_df = pd.read_csv("FILEPATH")
    locs = df.location_id.unique()  # all locations in our data
    # find locations that are missing age end 99
    missing_99 = list(set(locs) -\
        set(env_df.loc[(env_df.location_id.isin(locs))&(env_df.age_end == 99),
                       'location_id'].unique()))
    # change their age_ends to be 99
    env_df.loc[(env_df.location_id.isin(missing_99)) &
               (env_df.age_start == 95), 'age_end'] = 99
    # since we do NOT want to merge by age_group_id
    env_df.drop('age_group_id', axis=1, inplace=True)

    demography = [
        'location_id', 'year_start', 'year_end', 'age_start', 'age_end',
        'sex_id'
    ]

    # merge envelope onto data
    pre_shape = df.shape[0]
    df = df.merge(env_df, how='left', on=demography)
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # compute nfc hospitalization rate
    # aka "apply the envelope"
    df['product'] = df['cause_fraction'] * df['mean']
    df['upper_product'] = df['cause_fraction'] * df['upper']
    df['lower_product'] = df['cause_fraction'] * df['lower']

    ############################################
    # RE-ATTACH data that has sample size instead of the envelope
    ############################################

    df = pd.concat([df, full_coverage_df]).reset_index(drop=True)
    # NOTE now there are going to be columns "sample_size" and "cases" that
    # are null for every source except the fully covered ones

    # if we just want injuries:
    if return_only_inj:
        inj_me = pd.read_csv("FILEPATH")
        inj_me = list(inj_me['level1_meid'])
        df = df[df['modelable_entity_id'].isin(inj_me)]

    # drop columns we don't need anymore
    # cause fraction: finished using it, was used to make product
    # mean: finished using it, was used to make product
    # upper: finished using it, was used to make product_upper
    # lower: finished using it, was used to make product_lower
    # val, numerator, denominator: no longer in count space
    # NON FATAL_CAUSE_NAME IS LEFT TO SIGNIFY THAT THIS DATA IS STILL AT THE
    # NONFATAL_CAUSE_NAME LEVEL
    df.drop([
        'cause_fraction', 'mean', 'upper', 'lower', 'val', 'numerator',
        'denominator'
    ],
            axis=1,
            inplace=True)

    if "bundle_id" in df.columns:
        level_of_analysis = "bundle_id"
    if "nonfatal_cause_name" in df.columns:
        level_of_analysis = "nonfatal_cause_name"
    # write some summary stats of the env being applied
    # this is to review against our aggregated data to find large differences
    today = datetime.datetime.today().strftime("%Y_%m_%d").replace(":", "_")
    # Zeros can really skew the summary stats so drop them
    env_stats = df[df['product'] != 0].\
        groupby(['location_id', level_of_analysis])['product'].\
        describe().reset_index()
    env_stats.rename(columns={'level_2': 'stats'}, inplace=True)
    env_stats.to_csv("FILEPATH", index=False)

    return df
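
The envelope fix-up near the top of prepare_envelope's APPLY ENVELOPE block (finding locations whose envelope has no row ending at age 99 and relabeling their 95+ rows so they merge onto the hospital data) is easier to see on toy data; the location ids and ages below are hypothetical:

import pandas as pd

env_df = pd.DataFrame({
    'location_id': [101, 101, 102, 102],
    'age_start':   [95, 0, 95, 0],
    'age_end':     [99, 1, 124, 1],   # location 102 lacks a 95-99 terminal row
})
locs = [101, 102]                     # locations present in the hospital data

# locations in the data with no envelope row ending at age 99
missing_99 = list(
    set(locs) -
    set(env_df.loc[(env_df.location_id.isin(locs)) & (env_df.age_end == 99),
                   'location_id'].unique()))          # [102]
# relabel their terminal group so it matches the data's 95-99 band
env_df.loc[(env_df.location_id.isin(missing_99)) &
           (env_df.age_start == 95), 'age_end'] = 99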