Code Example #1
File: prep_for_env.py  Project: zhouxm4/ihme-modeling
def apply_restricts(df, full_coverage_df):
    """
    Apply the age-sex restrictions after expanding data to the bundle level
    """

    df = hosp_prep.apply_bundle_restrictions(df, 'cause_fraction')
    full_coverage_df = hosp_prep.apply_bundle_restrictions(
        full_coverage_df, 'mean')
    # drop the restricted values
    df = df[df['cause_fraction'].notnull()]

    full_coverage_df =\
        full_coverage_df[full_coverage_df['mean'].notnull()]

    return df, full_coverage_df
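
Both calls above go through hosp_prep.apply_bundle_restrictions, which lives elsewhere in the repository. A minimal sketch of what such a restriction step might do, assuming the data carry age_start, age_end and sex_id columns and a hypothetical lookup table with one row per bundle_id holding yld_age_start, yld_age_end, male and female flags (1 = allowed):

import numpy as np


def apply_bundle_restrictions_sketch(df, value_col, restrictions):
    """Hypothetical stand-in for hosp_prep.apply_bundle_restrictions.

    Values of `value_col` that fall outside a bundle's allowed age range
    or sex are set to NaN, so the caller can drop them with .notnull()
    exactly as apply_restricts() does above.
    """
    df = df.merge(restrictions, how='left', on='bundle_id')
    # null out sexes the bundle does not apply to
    df.loc[(df['male'] == 0) & (df['sex_id'] == 1), value_col] = np.nan
    df.loc[(df['female'] == 0) & (df['sex_id'] == 2), value_col] = np.nan
    # null out age groups entirely outside the restriction window
    df.loc[df['age_end'] < df['yld_age_start'], value_col] = np.nan
    df.loc[df['age_start'] > df['yld_age_end'], value_col] = np.nan
    return df.drop(['male', 'female', 'yld_age_start', 'yld_age_end'], axis=1)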
Code Example #2
def make_square(df):
    """
    Function that inserts zeros for demographic and etiology groups that were not
    present in the source.  A zero is inserted for every age/sex/etiology
    combination for each location and for only the years available for that
    location. If a location has data from 2004-2005 we will create explicit
    zeroes for those years but not for any others.

    Age and sex restrictions are applied after the data is made square.

    Parameters:
        df: Pandas DataFrame
            Must be aggregated and collapsed to the bundle level.
    """
    start_square = time.time()
    assert "bundle_id" in df.columns, "'bundle_id' must exist"
    assert "nonfatal_cause_name" not in df.columns, (
        "Data must not be at the", " nonfatal_cause_name level")

    # create a series of sorted, non-zero mean values to make sure
    # the func doesn't alter anything
    check_mean = df.loc[df['mean_raw'] > 0, 'mean_raw'].sort_values().\
        reset_index(drop=True)

    # square the dataset
    df = hosp_prep.make_zeros(
        df,
        etiology='bundle_id',
        cols_to_square=['mean_raw'],  #, 'upper_raw', 'lower_raw'],
        icd_len=5)

    # assert the sorted means are identical
    assert (check_mean == df.loc[df['mean_raw'] > 0, 'mean_raw'].sort_values().\
        reset_index(drop=True)).all()

    for col in ['mean_raw']:  #, 'upper_raw', 'lower_raw']:
        df = hosp_prep.apply_bundle_restrictions(df, col)

    # assign upper and lower to 0 where mean is 0
    df.loc[df['mean_raw'] == 0, ['upper_raw', 'lower_raw']] = 0

    print("Data made square and age/sex restrictions applied in {} min".\
        format((time.time() - start_square)/60))

    return df
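
hosp_prep.make_zeros itself is not part of this listing. The docstring above describes the intent: build every age/sex/bundle combination for each location and only the years that location actually reported, then fill unobserved cells with explicit zeros. A rough sketch of that idea (the column names are assumptions, not the pipeline's real implementation):

import itertools
import pandas as pd


def make_zeros_sketch(df, cols_to_square):
    """Illustrative 'squaring': one row for every age/sex/bundle
    combination in each location-year present in the source, with
    zeros where the source reported nothing."""
    squared = []
    for (loc, year), chunk in df.groupby(['location_id', 'year_start']):
        full = pd.DataFrame(
            list(itertools.product(chunk['age_group_id'].unique(),
                                   chunk['sex_id'].unique(),
                                   df['bundle_id'].unique())),
            columns=['age_group_id', 'sex_id', 'bundle_id'])
        full['location_id'] = loc
        full['year_start'] = year
        merged = full.merge(chunk, how='left',
                            on=['location_id', 'year_start',
                                'age_group_id', 'sex_id', 'bundle_id'])
        merged[cols_to_square] = merged[cols_to_square].fillna(0)
        squared.append(merged)
    return pd.concat(squared, ignore_index=True)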
Code Example #3
def apply_env_main(df,
                   full_coverage_df,
                   env_path,
                   run_tmp_unc=False,
                   write=False,
                   read_cores=15):
    back = df.copy()
    starting_bundles = df.bundle_id.unique()
    # delete existing env*CF files and re-run them again
    if run_tmp_unc:
        delete_uncertainty_tmp()
        make_uncertainty_jobs(df)
        job_holder()
        fix_failed_jobs()

    # hold the script up until all the uncertainty jobs are done
    job_holder()

    # rough check to see if the length of new files is what we'd
    # expect
    if run_tmp_unc:
        check_len = df.age_group_id.unique().size *\
                df.sex_id.unique().size *\
                df.year_start.unique().size
        actual_len = len(glob.glob(r"FILEPATH/*.H5"))
        assert actual_len == check_len,\
            "Check the error logs, it looks like something went wrong while re-writing files"

    print("Merging on the draw quantiles from the envelope * corrections")
    df = env_merger(df, read_cores=read_cores)

    # apply the hospital envelope
    print("Applying the uncertainty to cause fractions")
    df = apply_env(df)

    df = reattach_covered_data(df, full_coverage_df)

    df = drop_cols(df)

    # reapply age/sex restrictions
    df = hosp_prep.apply_bundle_restrictions(df, 'mean')

    df['bundle_id'] = df['bundle_id'].astype(float)
    int_cols = ['sex_id', 'location_id', 'year_start', 'year_end']
    for col in int_cols:
        df[col] = df[col].astype(int)

    end_bundles = df.bundle_id.unique()

    diff = set(starting_bundles).symmetric_difference(set(end_bundles))

    if len(diff) > 0:
        print("The difference in bundles is {}. Reprocessing these without correction"\
            r" factor uncertainty.".format(diff))
        redo_df = back[back.bundle_id.isin(diff)]
        del back

        redo_df = aeo.apply_envelope_only(df=redo_df,
                                          env_path=env_path,
                                          return_only_inj=False,
                                          apply_age_sex_restrictions=False,
                                          want_to_drop_data=False,
                                          create_hosp_denom=False,
                                          apply_env_subroutine=True,
                                          fix_norway_subnat=False)
        pre_cols = df.columns
        df = pd.concat([df, redo_df], ignore_index=True)
        # make sure columns keep the right dtype
        for col in [
                'sex_id', 'age_group_id', 'bundle_id', 'location_id',
                'year_start', 'year_end', 'nid', 'representative_id'
        ]:
            df[col] = pd.to_numeric(df[col], errors='raise')

        post_cols = df.columns
        print("{} change in columns".format(set(pre_cols).\
            symmetric_difference(post_cols)))

    if write:
        print("Writing the data to FILEPATH...")
        file_path = "FILEPATH/apply_env.H5"
        hosp_prep.write_hosp_file(df,
                                  file_path,
                                  backup=True,
                                  include_version_info=True)

    return df
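
A hypothetical invocation of the driver above, combining it with apply_restricts from example #1 (the argument values are placeholders, not the pipeline's actual settings; FILEPATH is redacted in the source):

df, full_coverage_df = apply_restricts(df, full_coverage_df)
df = apply_env_main(df,
                    full_coverage_df,
                    env_path="FILEPATH",
                    run_tmp_unc=True,   # re-make the envelope * CF uncertainty files
                    write=False,        # skip writing FILEPATH/apply_env.H5
                    read_cores=15)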
Code Example #4
def aggregate_to_bundle(df, write_maternal_denom=False,
                        adjust_maternal_denom=False):
    """
    Takes a dataframe aggregated to the baby sequelae level and returns
    the df aggregated to 5 year bands at the bundle level
    """
    ##############################
    # PULL POPULATION
    #
    # This is preparation for aggregating to 5 year bands
    ##############################

    # Get rid of all corrected values and correction factors. We want to do
    # this because we are going to aggregate to 5 year bands and the bundle_id
    # level of analysis. We intend to make cells where the correction factor
    # is over 50 NULL. It doesn't make sense to do that before aggregating,
    # because any rows with nulls in them would be lost entirely. It also
    # doesn't make sense to apply corrections but NOT delete cells that break
    # the over-50 rule before aggregation, because we would lose the correction
    # factors during the groupby. And it would not be robust to aggregate
    # 16 extra columns. Therefore, it's best to apply corrections after aggregating.

    # Technically, the groupby would get rid of these for us. Honestly, this
    # just makes things easier to work with up to that point.
    df.drop(['mean_1', 'mean_2', 'mean_3', 'mean_inj',
             'upper_1', 'upper_2', 'upper_3', 'upper_inj',
             'lower_1', 'lower_2', 'lower_3', 'lower_inj',
             'correction_factor_1', 'correction_factor_2',
             'correction_factor_3', 'correction_factor_inj'],
            axis=1, inplace=True)

    if write_maternal_denom:
        df = df[df['bundle_id'] == 1010]
    # drop the non-maternal data right away
    if adjust_maternal_denom:
        # GET MATERNAL CAUSES
        causes = get_cause_metadata(cause_set_id=9)
        # cause_id 366 is always at the third level of the cause hierarchy
        condition = causes.path_to_top_parent.str.contains("366")
        # subset to just the causes that meet the condition
        maternal_causes = causes[condition]
        # make list of maternal causes
        maternal_list = list(maternal_causes['cause_id'].unique())
        # get bundle to cause map
        bundle_cause = query("QUERY")
        # merge cause_id onto data
        df = df.merge(bundle_cause, how='left', on='bundle_id')
        # keep only maternal causes
        df = df[df['cause_id'].isin(maternal_list)]
        # drop cause_id
        df.drop('cause_id', axis=1, inplace=True)
        # drop the denominator bundle
        df = df[df['bundle_id'] != 1010]
    # pull age_group to age_start/age_end map

    ##############################
    # GET POPULATION
    ##############################
    age_group = query("QUERY")
    # correct age groups
    age_group.loc[age_group['age_group_years_end'] > 1, 'age_group_years_end'] =\
        age_group.loc[age_group['age_group_years_end'] > 1,
                      'age_group_years_end'] - 1

    # df of unique ages from hospital data to merge onto age_group map
    df_ages = pd.DataFrame([df.age_start.unique(), df.age_end.unique()]).transpose()
    df_ages.columns = ['age_group_years_start', 'age_group_years_end']
    df_ages = df_ages.merge(age_group, how='left', on=['age_group_years_start',
                                                       'age_group_years_end'])

    # this is the correct terminal age group (even though we use max age = 99)
    df_ages.loc[df_ages['age_group_years_start'] == 95, 'age_group_id'] = 235

    # there are two age_group_ids for age_start=0 and age_start=1
    df_ages = df_ages[df_ages.age_group_id != 161]

    # create age/year/location lists to use for pulling population
    age_list = list(df_ages.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())

    # pull population and merge on age_start and age_end
    pop = get_population(age_group_id=age_list, location_id=loc_list,
                         sex_id=[1, 2], year_id=year_list)

    # attach age_start and age_end to population information
    pop = pop.merge(age_group, how='left', on='age_group_id')
    pop.drop(['process_version_map_id', 'age_group_id'], axis=1, inplace=True)

    # rename pop columns to match hospital data columns
    pop.rename(columns={'age_group_years_start': 'age_start',
                        'age_group_years_end': 'age_end',
                        'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']

    # correct terminal age group to match our data
    pop.loc[pop['age_end'] == 124, 'age_end'] = 99

    demography = ['location_id', 'year_start', 'year_end', 'age_start',
                  'age_end', 'sex_id']

    ##############################
    # MAKE DATA SQUARE
    ##############################

    # create a series of sorted, non-zero mean values to make sure
    # the func doesn't alter anything
    check_mean = df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().\
        reset_index(drop=True)

    print("Starting number of rows: {}".format(df.shape[0]))
    # square the dataset
    df = hosp_prep.make_zeroes(df, level_of_analysis='bundle_id',
                               cols_to_square=['mean_0', 'upper_0', 'lower_0'],
                               icd_len=5)
    # assert the sorted means are identical
    assert (check_mean == df.loc[df['mean_0'] > 0, 'mean_0'].sort_values().\
        reset_index(drop=True)).all()


    # delete rows where restrictions should be applied
    # create df where baby sequela are missing
    missing_nfc = df[df.nonfatal_cause_name.isnull()].copy()
    df = df[df.nonfatal_cause_name.notnull()]
    for col in ['mean_0', 'upper_0', 'lower_0']:
        df = hosp_prep.apply_restrictions(df, col)
        missing_nfc = hosp_prep.apply_bundle_restrictions(missing_nfc, col)
        df = df[df[col].notnull()]

    df = pd.concat([df, missing_nfc])
    print("Square number of rows: {}".format(df.shape[0]))
    # don't create the parent injury dupes until after the data is totally
    # square so that the denominators will match
    # df = get_parent_injuries(df)

    # check_parent_injuries(df, 'mean_0')

    pre_shape = df.shape[0]  # store for before comparison
    # then merge population onto the hospital data

    df = df.merge(pop, how='left', on=demography)  # attach pop info to hosp
    assert pre_shape == df.shape[0], "number of rows don't match after merge"

    ##############################
    # RATE SPACE TO COUNTS
    ##############################

    # go from rate space to additive counts
    # because of careful merging we just need to multiply across columns
    df['hosp_count'] = df['mean_0'] * df['population']
    df['upper_hosp_count'] = df['upper_0'] * df['population']
    df['lower_hosp_count'] = df['lower_0'] * df['population']
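    # Illustrative arithmetic (made-up numbers, not from the source): if
    # mean_0 = 0.002 admissions per person and population = 50,000, then
    # hosp_count = 0.002 * 50000 = 100 admissions. Counts are additive, so
    # they can be summed in the collapse below and divided by the summed
    # population to get back to rate space.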

    # merge on "denominator" from file that was made back in
    # create_cause_fractions. This adds the denominator column, which is the
    # number of admissions in a demographic group
    df = df.merge(pd.read_csv("FILEPATH"),
                  how='left', on=["age_start", "age_end", "sex_id",
                                  "year_start", "year_end", "location_id"])


    # 5 year bins
    df = hosp_prep.year_binner(df)

    # add 5 year NIDs onto data
    df = hosp_prep.five_year_nids(df)


    ##################################################################
    ##################################################################
    #                         THE COLLAPSE                           #
    ##################################################################
    ##################################################################
    # The final collapse to 5 year bands, 5 year nids and bundle ID
    # this is doing a few things at once.  One is that we need to aggregate
    # to 5 year bands.  Another is aggregating to the Bundle_id level of
    # analysis.  Up to this point we were at the nonfatal_cause_name AKA
    # baby sequelae level of analysis.

    # WE NEED TWO COLLAPSES: one for data that doesn't have a sample_size value,
    # and another for the data that does.  This is because:
    # https://goo.gl/e66OZ4 and https://goo.gl/Fb78xi

    # make df of data where there is full coverage (i.e., the UK)
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    # make condition mask that indicates rows that have full coverage
    has_full_coverage = df.source.isin(full_coverage_sources)

    covered_df = df[has_full_coverage]

    # drop "denominator" from covered_df
    covered_df = covered_df.drop("denominator", axis=1)

    # drop this data from the main dataframe
    df = df[~has_full_coverage]

    assert (df.loc[df.denominator.isnull(), 'mean_0'] == 0).all(), ("mean_0"
        " should be 0")
    assert (df.loc[df.denominator.isnull(), 'lower_0'] == 0).all(), ("lower_0"
        " should be 0")
    assert (df.loc[df.denominator.isnull(), 'upper_0'] == 0).all(), ("upper_0"
        " should be 0")

    df = df[df.denominator.notnull()]

    # df already has sample size
    df.drop("sample_size", axis=1, inplace=True)
    # check if cases are lost in the groupby

    # rename "denominator" to "sample_size" in df (not covered_df)
    df.rename(columns={"denominator": "sample_size"}, inplace=True)

    pre_cases = df['hosp_count'].sum()
    # can use the same group columns for both dataframes
    groups = ['location_id', 'year_start', 'year_end',
              'age_start', 'age_end', 'sex_id', 'nid',
              'representative_id',
              'bundle_id']

    # sample_size has some null values from being made square, but for these
    # sources it was just population, so we use population instead. So remember:
    # population == sample_size for covered_df
    covered_df = covered_df.groupby(groups)\
        .agg({'hosp_count':'sum',
              'population': 'sum'}).reset_index()


    # add "sample_size" to the aggregate function
    df = df.groupby(groups).agg({'hosp_count': 'sum',
                                 'upper_hosp_count': 'sum',
                                 'lower_hosp_count': 'sum',
                                 'population': 'sum',
                                 'sample_size': 'sum'}).reset_index()
    assert round(pre_cases, 0) == round(df['hosp_count'].sum(), 0),\
        ("some cases were lost. "
         "From {} to {}".format(pre_cases, df['hosp_count'].sum()))

    # set sample size to np.nan when mean/upper/lower are greater than 0
    df.loc[(df['hosp_count'] > 0) & (df['lower_hosp_count'] > 0) &
           (df['upper_hosp_count'] > 0), 'sample_size'] = np.nan
    ##############################
    # COUNTS TO RATE SPACE
    ##############################
    # REMAKE mean and uncertainty
    # for the main df:

    df['mean_0'] = df['hosp_count'] / df['population']
    df['lower_0'] = df['lower_hosp_count'] / df['population']
    df['upper_0'] = df['upper_hosp_count'] / df['population']
    df.drop(['hosp_count', 'lower_hosp_count', 'upper_hosp_count'], axis=1,
            inplace=True)

    # add parent injuries
    # NOTE get_parent_injuries is run before covered_df is concatenated with df.
    # It happens not to make a difference at the moment, because there are no
    # injuries in covered_df, but that could change in the future.
    df = get_parent_injuries(df)
    for col in ['mean_0', 'lower_0', 'upper_0']:
        hosp_prep.check_parent_injuries(df, col_to_sum=col)

    # this drops the population that was merged on for converting to counts
    df.drop('population', axis=1, inplace=True)  # don't need pop anymore

    # for the covered df:
    covered_df['mean_0'] = covered_df['hosp_count'] / covered_df['population']

    covered_df.rename(columns={"population": "sample_size"}, inplace=True)

    # drop columns
    covered_df.drop(['hosp_count'], axis=1, inplace=True)

    ###############################
    # RE-ATTACH
    ###############################
    # bring covered_df and df together.
    # where we have full coverage, lower and upper should be null
    # mean_0 will never be null

    df = pd.concat([df, covered_df], ignore_index=True)

    # assert what we just said will be true in the comments above:
    assert df.loc[has_full_coverage, 'lower_0'].isnull().all(), ("where we have"
        " full coverage, lower_0 should be null")
    assert df.loc[has_full_coverage, 'upper_0'].isnull().all(), ("where we have"
        " full coverage, upper_0 should be null")
    assert df.mean_0.notnull().all(), ("mean_0 should never be null")

    # NOTE: sample_size will still have null values, and that's okay.
    # We need to keep sample_size from here on out.

    if "population" in df.columns:
        print "population was still in columns"
        df.drop("population", axis=1, inplace=True)


    ########################################

    # map measure onto data, just need this for ELMO reqs.
    clean_maps = pd.read_csv("FILEPATH")
    clean_maps = clean_maps[['bundle_id', 'bid_measure']]
    clean_maps.drop_duplicates(inplace=True)
    clean_maps.rename(columns={'bid_measure': 'measure'}, inplace=True)
    # remove null bundles from map
    clean_maps = clean_maps[clean_maps.bundle_id.notnull()]

    pre_shape = df.shape[0]  # store for comparison after merge

    # merge measure onto hosp data using bundle_id
    df = df.merge(clean_maps, how='left', on='bundle_id')
    assert pre_shape == df.shape[0], "number of rows don't match after merge."

    # get injuries bids so we can check for missing measures
    pc_injuries = pd.read_csv("FILEPATH")
    inj_bids = pc_injuries['Level1-Bundle ID'].unique()

    # some injuries bids didn't get measures!
    assert set(df[df.measure.isnull()].bundle_id).issubset(set(inj_bids)),\
        ("We expect that all null measures belong to injuries, but that is "
         "not the case. Something went wrong!")

    # fix any injuries that are missing measure, all inj are inc:
    df.loc[(df.measure.isnull())&(df.bundle_id.isin(inj_bids)), 'measure'] = 'inc'

    assert df.measure.isnull().sum() == 0, ("There are null values and we "
        "expect none")

    # read in correction factors (again)
    correction_factors = pd.read_csv("FILEPATH")

    correction_factors.drop("outpatient", axis=1, inplace=True)

    # rename columns to match df
    correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)

    # merge corr factors onto data
    df = df.merge(correction_factors, how='left', on=['age_start', 'sex_id',
                  'bundle_id'])
    assert pre_shape == df.shape[0], ("You unexpectedly added rows while "
        "merging on the correction factors. Don't do that!")

    # if a Bundle ID doesn't have a corr factor from marketscan use 1
    # df.update(df[['a','b','c']].fillna(0))  # test code
    # http://stackoverflow.com/questions/36556256/how-do-i-fill-na-values-in-multiple-columns-in-pandas
    df.update(df[['indv_cf', 'incidence', 'prevalence', 'injury_cf']].fillna(1))

    # rename correction factors to match what we told people they would be
    df.rename(columns={'indv_cf': 'correction_factor_1',
                       'incidence': 'correction_factor_2',
                       'prevalence': 'correction_factor_3',
                       'injury_cf': 'correction_factor_inj'},
                       inplace=True)

    # NOTE we apply every correction factor to all data, even if it is not
    # relevant.  E.g., not all data is injuries, so not all data needs
    # correction_factor_inj.  It is simply easier to apply all of them and
    # then drop the irrelevant columns while writing to modelers' folders.

    # make mean_1, lower_1, upper_1
    df['mean_1'] = df.correction_factor_1 * df.mean_0
    df['lower_1'] = df.correction_factor_1 * df.lower_0
    df['upper_1'] = df.correction_factor_1 * df.upper_0

    # make mean_2, lower_2, upper_2
    df['mean_2'] = df.correction_factor_2 * df.mean_0
    df['lower_2'] = df.correction_factor_2 * df.lower_0
    df['upper_2'] = df.correction_factor_2 * df.upper_0

    # make mean_3, lower_3, upper_3
    df['mean_3'] = df.correction_factor_3 * df.mean_0
    df['lower_3'] = df.correction_factor_3 * df.lower_0
    df['upper_3'] = df.correction_factor_3 * df.upper_0

    # make injury mean, lower, upper
    df['mean_inj'] = df.correction_factor_inj * df.mean_0
    df['lower_inj'] = df.correction_factor_inj * df.lower_0
    df['upper_inj'] = df.correction_factor_inj * df.upper_0

    # assert what we just said will be true in the comments above:
    levels=["1", "2", "3", "inj"]
    for level in levels:
        assert df.loc[has_full_coverage, 'lower_{}'.format(level)].isnull().all(), ("broke on level {}".format(level))
        assert df.loc[has_full_coverage, 'upper_{}'.format(level)].isnull().all(), ("broke on level {}".format(level))
        assert df["mean_{}".format(level)].notnull().all(), ("broke on level {}".format(level))

    def factor_applier(df, levels=["1", "2", "3", "inj"]):
        for level in levels:
            # initialize cols
            df['test_mean_' + level] = np.nan
            df['test_lower_' + level] = np.nan
            df['test_upper_' + level] = np.nan
            # apply corr factors
            df['test_mean_' + level] = df["correction_factor_" + level] * df['mean_0']
            df['test_lower_' + level] = df["correction_factor_" + level] * df['lower_0']
            df['test_upper_' + level] = df["correction_factor_" + level] * df['upper_0']
        return(df)
    df = factor_applier(df)

    levels=["1", "2", "3", "inj"]
    for level in levels:
        assert (df.loc[df["mean_" + level].notnull(), "mean_" + level] == df.loc[df["test_mean_" + level].notnull(), "test_mean_" + level]).all(), ("different on level {}".format(level))
        assert (df.loc[df["upper_" + level].notnull(), "upper_" + level] == df.loc[df["test_upper_" + level].notnull(), "test_upper_" + level]).all(), ("different on level {}".format(level))
        assert (df.loc[df["lower_" + level].notnull(), "lower_" + level] == df.loc[df["test_lower_" + level].notnull(), "test_lower_" + level]).all(), ("different on level {}".format(level))
    # drop test cols for now until we run this for awhile without
    # tripping the assert
    test_cols = df.columns[df.columns.str.startswith("test_")]
    df.drop(test_cols, axis=1, inplace=True)

    # RULE = if correction factor is greater than 50, make the data null
    # EXCEPTIONS are made for these bundles, which are capped at 100:
        # Preterm: 80, 81, 82, 500
        # Encephalopathy: 338
        # Sepsis: 92
        # Hemolytic: 458
        # PAD/PUD: 345
        # Cirrhosis: 131

    # list of bundles which can have correction factors above 50
    cf_exceptions = [345, 80, 81, 82, 500, 338, 92, 458, 131]

    # NOTE when checking the number of nulls, consider the nulls that are caused
    # by the sample_size split

    def mean_capper(df, exceptions, levels=["1", "2", "3"]):
        exception_condition = df.bundle_id.isin(exceptions)
        for level in levels:
            df.loc[(~exception_condition) & (df['correction_factor_' + level] > 50),
                   ['mean_' + level, 'lower_' + level, 'upper_' + level]] = np.nan
            df.loc[(exception_condition) & (df['correction_factor_' + level] > 100),
                   ['mean_' + level, 'lower_' + level, 'upper_' + level]] = np.nan

        return(df)
    # run the capper on a copy so the comparison against the manual
    # capping below is a real test rather than comparing df to itself
    df_test_capper = mean_capper(df.copy(), cf_exceptions)


    exception_condition = df.bundle_id.isin(cf_exceptions)  # make boolean mask that says if a bundle is in the list

    df.loc[(~exception_condition)&(df.correction_factor_1 > 50), ['mean_1', 'lower_1', 'upper_1']] = [np.nan, np.nan, np.nan]  # DOESN'T DO ANYTHING don't really need to apply here
    df.loc[(~exception_condition)&(df.correction_factor_2 > 50), ['mean_2', 'lower_2', 'upper_2']] = [np.nan, np.nan, np.nan]  # DID DO SOMETHING, affects 6% of prev rows, 0.01% of inc rows
    df.loc[(~exception_condition)&(df.correction_factor_3 > 50), ['mean_3', 'lower_3', 'upper_3']] = [np.nan, np.nan, np.nan]  # DOES A LOT, affects 57% percent of prevalence rows

    df.loc[(exception_condition)&(df.correction_factor_1 > 100), ['mean_1', 'lower_1', 'upper_1']] = [np.nan, np.nan, np.nan]
    df.loc[(exception_condition)&(df.correction_factor_2 > 100), ['mean_2', 'lower_2', 'upper_2']] = [np.nan, np.nan, np.nan]
    df.loc[(exception_condition)&(df.correction_factor_3 > 100), ['mean_3', 'lower_3', 'upper_3']] = [np.nan, np.nan, np.nan]


    #####################################################
    # CHECK that lower < mean < upper
    #####################################################
    # loop over every level of correction
    # can't compare null values, null comparisons always eval to False
    for i in ["0", "1", "2", "3", 'inj']:
        # lower < mean
        assert (df.loc[df['lower_'+i].notnull(), 'lower_'+i] <=
                df.loc[df["lower_"+i].notnull(), 'mean_'+i]).all(),\
            "lower_{} should be less than mean_{}".format(i, i)
        # mean <= upper
        assert (df.loc[df["upper_"+i].notnull(), 'mean_'+i] <=
                df.loc[df["upper_"+i].notnull(), 'upper_'+i]).all(),\
            "mean_{} should be less than upper_{}".format(i, i)

    # compare the results between test df and proper df
    for uncertainty in ["mean", "upper", "lower"]:
        for level in ["1", "2", "3"]:
            # compare the sum of nulls rows between dfs
            assert df[uncertainty + "_" + level].isnull().sum() ==\
                df_test_capper[uncertainty + "_" + level].isnull().sum(),\
                "The new capping function is producing different results"

    # write the maternal denominator data, this is for the future when we work
    # in parallel
    if write_maternal_denom:
        def write_maternal_denom(df):
            mat_df = df[df.bundle_id==1010].copy()
            mat_df = mat_df.query("sex_id == 2 & age_start >=10 & age_end <=54")

            if mat_df.shape[0] == 0:
                return
            # NOTE sample size is dropped here, and we make a new one in the
            # following code
            mat_df = mat_df[['location_id', 'year_start', 'year_end',
                             'age_start', 'age_end', 'sex_id',
                             'mean_0', 'mean_1', 'mean_2', 'mean_3']].copy()

            bounds = ['upper_0', 'upper_1', 'upper_2', 'upper_3',
                        'lower_0', 'lower_1', 'lower_2', 'lower_3']
            for uncertainty in bounds:
                mat_df[uncertainty] = np.nan

            # PREP FOR POP ####################################################
            # we don't have years that we can merge on pop to yet, because
            # we aggregated to year bands; another problem with this method
            mat_df['year_id'] = mat_df.year_start + 2  # makes 2000,2005,2010

            # bunch of age things so we can use age_group_id to get pop
            age_group = query("QUERY")
            # correct age groups
            age_group.loc[age_group['age_group_years_end'] > 1,
                          'age_group_years_end'] =\
                age_group.loc[age_group['age_group_years_end'] > 1,
                              'age_group_years_end'] - 1

            # df of unique ages from hospital data to merge onto age_group map
            mat_df_ages = pd.DataFrame([mat_df.age_start.unique(),
                                       mat_df.age_end.unique()]).transpose()
            mat_df_ages.columns = ['age_group_years_start',
                                   'age_group_years_end']
            mat_df_ages = mat_df_ages.merge(age_group, how='left',
                                            on=['age_group_years_start',
                                                'age_group_years_end'])

            # this is the correct terminal age group (even though we use max
            # age = 99)
            mat_df_ages.loc[mat_df_ages['age_group_years_start'] == 95,
                            'age_group_id'] = 235

            # there are two age_group_ids for age_start=0 and age_start=1
            mat_df_ages = mat_df_ages[mat_df_ages.age_group_id != 161]

            # create age/year/location lists to use for pulling population
            age_list = list(mat_df_ages.age_group_id.unique())
            loc_list = list(mat_df.location_id.unique())
            year_list = list(mat_df.year_id.unique())

            # GET POP ########################################################
            # pull population and merge on age_start and age_end
            pop = get_population(age_group_id=age_list, location_id=loc_list,
                                 sex_id=[1, 2], year_id=year_list)

            # FORMAT POP ####################################################
            # attach age_start and age_end to population information
            pop = pop.merge(age_group, how='left', on='age_group_id')
            pop.drop(['process_version_map_id', 'age_group_id'], axis=1,
                     inplace=True)

            # rename pop columns to match hospital data columns
            pop.rename(columns={'age_group_years_start': 'age_start',
                                'age_group_years_end': 'age_end'}, inplace=True)

            # correct terminal age group to match our data
            pop.loc[pop['age_end'] == 124, 'age_end'] = 99

            # MERGE POP ######################################################
            demography = ['location_id', 'year_id', 'age_start',
                          'age_end', 'sex_id']

            pre_shape = mat_df.shape[0]  # store for before comparison
            # then merge population onto the hospital data

            # attach pop info to hosp
            mat_df = mat_df.merge(pop, how='left', on=demography)
            assert pre_shape == mat_df.shape[0], ("number of rows don't "
                "match after merge")


            # MAKE SAMPLE SIZE  ##############################################
            mat_df['sample_size'] = mat_df.population * mat_df.mean_0

            # DROP intermediate columns
            mat_df.drop(['population', 'year_id'], axis=1, inplace=True)


            mat_df.to_hdf("FILEPATH", key='df', mode="w")
            # backup copy to _archive
            mat_df.to_hdf("FILEPATH", key='df', mode='w')
        write_maternal_denom(df)


    if adjust_maternal_denom:
        def adjust_maternal_denom(df):

            # drop sample_size, UTLAs already had it, but we need it for
            # everything, so we have to drop it.
            df.drop('sample_size', axis=1, inplace=True)
            df = df.query("sex_id == 2 & age_start >=10 & age_end <=54")

            # read in maternal denoms, this is needed when our process is
            # parallelized
            denom = pd.read_hdf("FILEPATH", key="df")
            # denom.drop('bundle_id', axis=1, inplace=True)
            denom_cols = sorted(denom.filter(regex="[0-9]$").columns)
            for col in denom_cols:
                denom.rename(columns={col: col + "_denominator"}, inplace=True)
            pre = df.shape[0]
            df = df.merge(denom, how='left', on=['location_id', 'year_start',
                                                 'year_end', 'age_start',
                                                 'age_end', 'sex_id'])
            assert pre == df.shape[0], ("shape should not have changed "
                "during merge")
            #print(df[df.mean_0_denominator.isnull()].shape)
            #print(df[df.mean_0_denominator.isnull()])
            df = df[(df['mean_0'] > 0) | (df['mean_0_denominator'].notnull())]

            assert df.mean_0_denominator.isnull().sum() == 0, ("shouldn't be "
                "any null values in this column")
            # regex to find the columns that start with l, m or u and end with
            #  a digit
            num_cols = sorted(df.filter(regex="^[lmu].*[0-9]$").columns)
            denom_cols =\
                sorted(df.columns[df.columns.str.endswith("denominator")])

            # divide each bundle value by bundle 1010 to get the adjusted rate
            for i in np.arange(0, 12, 1):
                df[num_cols[i]] = df[num_cols[i]] / df[denom_cols[i]]
            # drop the denominator columns
            df.drop(denom_cols, axis=1, inplace=True)
            # can't divide by zero
            df = df[df['sample_size'] != 0]
            # RETURN ONLY THE MATERNAL DATA
            return(df)
        df = adjust_maternal_denom(df)

    return(df)
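
hosp_prep.year_binner and hosp_prep.five_year_nids, called in example #4, are also internal to the repository. Judging from the year_start + 2 midpoint trick inside write_maternal_denom (which yields 2000, 2005, 2010), the bands appear to run 1998-2002, 2003-2007, and so on. A rough, hypothetical sketch of that binning:

def year_binner_sketch(df):
    """Hypothetical stand-in for hosp_prep.year_binner: collapse single
    years into the 5-year bands implied by the midpoint comment above
    (1998-2002, 2003-2007, 2008-2012, ...). The real band edges live in
    hosp_prep and may differ."""
    df = df.copy()
    df['year_start'] = df['year_start'] - (df['year_start'] - 1998) % 5
    df['year_end'] = df['year_start'] + 4
    return df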