def _prep_for_pe_correction(df, value_cols, pop_col, pop_df, pop_id_cols,
                            correct_garbage):

    if isinstance(value_cols, str):
        value_cols = [value_cols]
    # make sure the 'flagged' column is set
    assert_flagged_col(df)

    # id cols will be original columns - val columns
    id_cols = list(
        set(df.columns) - set(value_cols) - set(['flagged']) - set([pop_col]))
    if correct_garbage:
        dup_cols = id_cols + ['flagged']
    else:
        dup_cols = id_cols
    # these should not be duplicated
    dups = df[df[dup_cols].duplicated()]
    if len(dups) > 0:
        raise AssertionError("Found duplicates in non-value columns "
                             "{c}: \{df}".format(c=id_cols, df=dups))

    # add population to the data
    if pop_col not in df.columns:
        if pop_df is None:
            raise AssertionError(
                "If pop_col ('{}') is not already in the dataframe, "
                "pop_df must not be None".format(pop_col))
        df = df.merge(pop_df, on=pop_id_cols, how='left')
    # every row should have a population
    report_if_merge_fail(df, pop_col, pop_id_cols)
    return df
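
# A minimal sketch of the 'flagged' column check assumed above; the real
# assert_flagged_col helper may enforce more, and the 0/1 domain is an assumption.
def _assert_flagged_col_sketch(df):
    assert 'flagged' in df.columns, "df must carry a 'flagged' column"
    assert df['flagged'].isin([0, 1]).all(), "'flagged' is expected to be 0/1"
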
def map_code_id(df, code_map, remove_decimal=True, value_col='value'):

    if remove_decimal:
        df[value_col] = df[value_col].str.replace(".", "", regex=False)
        code_map[value_col] = code_map[value_col].str.replace(".", "", regex=False)
        df[value_col] = df[value_col].str.strip()
        code_map[value_col] = code_map[value_col].str.strip()
    code_ids = code_map[['code_system_id', value_col,
                         "code_id"]].drop_duplicates()
    df.loc[df[value_col] == 'acause_digest_gastrititis',
           value_col] = 'acause_digest_gastritis'
    assert not code_ids[['code_system_id', value_col, "code_id"
                         ]].duplicated().values.any()
    df = pd.merge(df, code_ids, on=['code_system_id', value_col], how='left')

    for num_dig_retry in [4, 3]:
        if df.code_id.isnull().any():
            print("Trying mapping again at {} digits...".format(num_dig_retry))
            # retry the merge with codes trimmed to num_dig_retry digits
            filled_mappings = remap_code_id(df, code_ids, num_dig_retry)
            df = df.loc[df['code_id'].notnull()]
            df = pd.concat([df, filled_mappings], ignore_index=True)

    report_if_merge_fail(df, 'code_id', ['code_system_id', value_col])

    return df
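
# Hedged sketch of the retry helper used above, assuming remap_code_id simply
# truncates still-unmapped codes to `num_digits` characters and looks them up
# in a similarly truncated code map; the production helper may differ.
def _remap_code_id_sketch(df, code_ids, num_digits, value_col='value'):
    unmapped = df.loc[df['code_id'].isnull()].drop(columns=['code_id'])
    short_map = code_ids.copy()
    short_map[value_col] = short_map[value_col].str[:num_digits]
    short_map = short_map.drop_duplicates(subset=['code_system_id', value_col])
    unmapped['value_short'] = unmapped[value_col].str[:num_digits]
    filled = unmapped.merge(
        short_map.rename(columns={value_col: 'value_short'}),
        on=['code_system_id', 'value_short'],
        how='left')
    return filled.drop(columns=['value_short'])
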
Example #3
def check_age_groups(df):
    '''Raise a series of assertion errors if standard checks for age groups are violated.

        Input: pandas DataFrame
        Returns: None, raises assertion errors

        Checks for:
        - age_group_ids not in shared.ages
        - overlapping age groups
        - lack of terminal age group (e.g. 95+) in VR
    '''
    for nid_etid, nid_etid_df in df.groupby(['nid', 'extract_type_id']):
        nid_etid_df = add_age_metadata(
            nid_etid_df, ['age_group_years_start', 'age_group_years_end'])
        report_if_merge_fail(nid_etid_df, 'age_group_years_start', 'age_group_id')
        report_if_merge_fail(nid_etid_df, 'age_group_years_end', 'age_group_id')
        nid_etid_df = nid_etid_df[[
            'age_group_id', 'age_group_years_start', 'age_group_years_end',
            'data_type_id'
        ]].drop_duplicates().sort_values(
            by=['age_group_years_start', 'age_group_years_end'])

        # NOTE: report_gaps_and_overlaps MUST come before report_too_much_age_detail
        # report_too_much_age_detail cannot handle age groups with gaps or overlaps
        report_gaps_and_overlaps(nid_etid, nid_etid_df)
        report_no_terminal_ages(nid_etid, nid_etid_df)
        report_too_much_age_detail(nid_etid, nid_etid_df)
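
# Hedged sketch of the gap/overlap check relied on above: with the distinct
# age groups sorted by age_group_years_start, each group is expected to begin
# exactly where the previous one ended (names and behavior are assumptions,
# not the production report_gaps_and_overlaps).
def _report_gaps_and_overlaps_sketch(nid_etid, age_df):
    age_df = age_df.sort_values('age_group_years_start')
    prev_end = None
    for _, row in age_df.iterrows():
        start, end = row['age_group_years_start'], row['age_group_years_end']
        if prev_end is not None and start != prev_end:
            raise AssertionError(
                "Age group starting at {} does not line up with previous "
                "end {} for {}".format(start, prev_end, nid_etid))
        prev_end = end
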
def collapse_sites(df, **cache_options):
    """
    Collapse sites together and reassign site_id.

    This function exists to collapse sites in historical VA data. When new VA is added to our
    database, sites should be collapsed in formatting, not here.
    """
    # Get some properties so that we don't change things
    start_cols = df.columns
    start_deaths = df.deaths.sum()

    # Get the existing site names
    df = add_site_metadata(df, 'site_name', **cache_options)
    report_if_merge_fail(df, check_col='site_name', merge_cols='site_id')
    old_sites = set(df.site_name.unique())

    # Assign new aggregated sites
    # We assume that within the same study, location, and year, each site was surveyed
    # for the same ages, sexes, and causes, and that any missingness across age, sex,
    # or cause comes from a lack of deaths rather than a lack of observation
    site_dem_cols = ['nid', 'extract_type_id', 'location_id', 'year_id']
    df['agg_site'] = df.groupby(site_dem_cols)['site_name'].transform(
        lambda x: ', '.join(x.drop_duplicates().sort_values()))

    # Collapse the old sites and replace with the aggregate sites
    df = df.groupby(df.columns.drop(['site_id', 'site_name',
                                     'deaths']).tolist(),
                    as_index=False)['deaths'].sum()
    df.rename({'agg_site': 'site_name'}, axis='columns', inplace=True)

    # This is pretty bad, should think of something better
    # Handle aggregated sites that are too long for the db (>250 characters)
    too_long = (df.site_name.str.len() > 250)
    Bangladesh_study = ((df.nid == 243436) & (df.extract_type_id == 1))
    Mozambique_study = ((df.nid == 93710) & (df.extract_type_id == 1))
    df.loc[Bangladesh_study & too_long,
           'site_name'] = '51 upazilas in Bangladesh'
    df.loc[Mozambique_study & too_long,
           'site_name'] = '8 areas in Sofala province'

    # Add new site_ids
    df = add_site_metadata(df,
                           add_cols='site_id',
                           merge_col='site_name',
                           **cache_options)
    report_if_merge_fail(df, check_col='site_id', merge_cols='site_name')

    # Report what changed (if anything)
    new_sites = set(df.site_name.unique())
    if new_sites != old_sites:
        print("Collapsed sites: \n{} \ninto sites: \n{}".format(
            old_sites, new_sites))
    else:
        print("Did not collapse any sites")

    df.drop("site_name", axis='columns', inplace=True)
    assert set(df.columns) == set(start_cols)
    assert np.isclose(df.deaths.sum(), start_deaths)

    return df
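
# Toy illustration (made-up data) of the agg_site construction inside
# collapse_sites: within one nid/extract_type_id/location/year group every
# row receives the sorted, comma-joined set of site names in that group.
import pandas as pd

_example = pd.DataFrame({
    'nid': [1, 1, 1],
    'extract_type_id': [1, 1, 1],
    'location_id': [10, 10, 10],
    'year_id': [2000, 2000, 2000],
    'site_name': ['Matlab', 'Abhoynagar', 'Matlab'],
})
_example['agg_site'] = _example.groupby(
    ['nid', 'extract_type_id', 'location_id', 'year_id'])['site_name'].transform(
        lambda x: ', '.join(x.drop_duplicates().sort_values()))
# every row now carries agg_site == 'Abhoynagar, Matlab'
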
def add_covariate(df, covariate_id, covariate_column_name, by_sex=True):

    assert 'location_id' not in df.columns, "Unexpected df structure: has location_id"
    assert 'country' in df.columns, "Unexpected df structure: lacks country"

    merge_cols = ['country', 'year_id', 'sex_id']
    cov_df = None

    if not by_sex:
        merge_cols.remove('sex_id')

    # Covariates delivered via flat files (.csv) are manually assigned
    # negative covariate ids; handle them as a separate case
    if (covariate_id < 0):
        cov_df, merge_cols = get_flat_covariate_estimates(covariate_id)
    else:
        cov_df = db_queries.get_covariate_estimates(covariate_id, decomp_step="step1")

    cov_df = cov_df.rename(columns={
        'location_id': 'country',
        'mean_value': covariate_column_name
    })

    cov_df = cov_df[merge_cols + [covariate_column_name]]
    report_duplicates(cov_df, merge_cols)
    df = df.merge(cov_df, on=merge_cols, how='left')

    # As of 2/7/19, flat files only cover 1990-2017, so missing years in the
    # 1980s would fail this check; that gap is handled in regression setup
    if (covariate_id > 0):
        report_if_merge_fail(df, covariate_column_name, merge_cols)
    return df
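
# Hedged usage sketch (ids and column names are hypothetical): a positive
# covariate_id is pulled from db_queries, while a negative id follows the
# flat-file convention described above.
#
#   df = add_covariate(df, covariate_id=1234, covariate_column_name='my_cov')
#   df = add_covariate(df, covariate_id=-1, covariate_column_name='my_flat_cov',
#                      by_sex=False)
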
Example #6
    def adjust_ukr(self, env_df, split_type):
        orig_id = self.split_ids[split_type][0]
        no_cs = self.split_ids[split_type][1]
        crimea = self.split_ids[split_type][2]
        sev = self.split_ids[split_type][3]

        env_df['prop_no_cs'] = env_df[no_cs] / env_df[orig_id]
        env_df['prop_crimea'] = env_df[crimea] / env_df[orig_id]
        env_df['prop_sev'] = env_df[sev] / env_df[orig_id]
        df = self.df.merge(env_df,
                           on=['age_group_id', 'year_id', 'sex_id'],
                           how='left')
        report_if_merge_fail(df, 'prop_no_cs',
                             ['age_group_id', 'year_id', 'sex_id'])

        no_cs_df = df.copy()
        no_cs_df[
            'sample_size'] = no_cs_df['prop_no_cs'] * no_cs_df['sample_size']
        no_cs_df['location_id'] = no_cs

        crimea_df = df.copy()
        crimea_df['sample_size'] = crimea_df['prop_crimea'] * crimea_df[
            'sample_size']
        crimea_df['location_id'] = crimea

        sev_df = df.copy()
        sev_df['sample_size'] = sev_df['prop_sev'] * sev_df['sample_size']
        sev_df['location_id'] = sev

        df = pd.concat([no_cs_df, crimea_df, sev_df], ignore_index=True)

        return df
Example #7
    def calculate_env_coverage(self, df):
        """Calculate the percentage of envelope covered."""
        # demographic variables
        dem_cols = self.geo_cols + ['age_group_id', 'sex_id']

        # prep envelope
        env_df = self.env_meta_df.loc[
            (self.env_meta_df['age_group_id'].isin(self.cod_ages))
            & (self.env_meta_df['sex_id'].isin([1, 2]))]
        env_df = env_df[dem_cols + ['mean_env']]
        env_df['total_env'] = env_df.groupby(
            self.geo_cols)['mean_env'].transform('sum')

        # only keep demographics represented in the data
        df = df[self.source_cols + dem_cols]
        df = df.drop_duplicates()

        # merge on envelope df
        df = df.merge(env_df, on=dem_cols, how='left')
        report_if_merge_fail(df, 'mean_env', dem_cols)

        df['env_covered'] = df.groupby(
            self.source_cols + self.geo_cols)['mean_env'].transform('sum')
        assert not ((df['env_covered'] - df['total_env']) > .0001).any()

        df['pct_env_coverage'] = df['env_covered'] / df['total_env']
        # all VR should be 1
        df.loc[df['data_type_id'].isin([9, 10]), 'pct_env_coverage'] = 1

        df = df.drop(['env_covered', 'total_env'], axis=1)
        df = df[self.source_cols + self.geo_cols +
                ['pct_env_coverage']].drop_duplicates()
        assert not df.duplicated(self.source_cols + self.geo_cols).values.any()

        return df
Example #8
def aggregate_to_country_level(orig_df, location_set_version_id):
    """Aggregate sub nationals to country level."""
    df = orig_df.copy()

    # merge on country level location_ids
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id)
    country_location_ids = \
        get_country_level_location_id(df.location_id.unique(),
                                      location_meta_df)
    df = df.merge(country_location_ids, how='left', on='location_id')
    report_if_merge_fail(df, 'country_location_id', ['location_id'])

    # aggregate sub national locations to national level
    df = df[df['location_id'] != df['country_location_id']]
    df['location_id'] = df['country_location_id']
    df = df.drop(['country_location_id'], axis=1)
    group_cols = [col for col in df.columns if col not in VAL_COLS]
    df = df.groupby(group_cols, as_index=False)[VAL_COLS].sum()
    df['loc_agg'] = 1

    # append aggregates to original dataframe
    orig_df['loc_agg'] = 0
    df = pd.concat([df, orig_df])
    return df
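
# Sketch of how the country-level id lookup used above could work, assuming
# the hierarchy carries a comma-separated path_to_top_parent with the country
# at index 3 (the same convention format_for_nr relies on below); the real
# get_country_level_location_id helper may differ.
def _get_country_level_location_id_sketch(location_ids, location_meta_df):
    locs = location_meta_df.loc[
        location_meta_df['location_id'].isin(location_ids),
        ['location_id', 'path_to_top_parent']].copy()
    locs['country_location_id'] = locs['path_to_top_parent'].str.split(',').apply(
        lambda path: int(path[3]) if len(path) > 3 else int(path[-1]))
    return locs[['location_id', 'country_location_id']]
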
Example #9
def format_for_nr(df, location_hierarchy):
    """Merge on needed location metadata."""
    locs = df[['location_id']].drop_duplicates()
    locs = add_location_metadata(
        locs,
        add_cols=["ihme_loc_id", "path_to_top_parent"],
        merge_col="location_id",
        location_meta_df=location_hierarchy
    )
    report_if_merge_fail(locs, 'path_to_top_parent', 'location_id')
    locs['country_id'] = locs['path_to_top_parent'].str.split(",").apply(
        lambda x: int(x[3]))
    locs['subnat_id'] = locs['ihme_loc_id'].apply(
        lambda x: int(x.split("_")[1]) if "_" in x else 0)
    locs['iso3'] = locs['ihme_loc_id'].str.slice(0, 3)
    different_locations = locs['country_id'] != locs['location_id']

    locs.loc[different_locations, 'iso3'] = \
        locs['iso3'].apply(lambda x: x + "_subnat")
    locs = locs[['location_id', 'country_id', 'subnat_id', 'iso3']]
    df = df.merge(locs, on='location_id', how='left')
    report_if_merge_fail(df, 'country_id', 'location_id')

    # create indicator column for running a separate nr model
    # for national and subnational locations
    subnational_modeled_iso3s = CONF.get_id('subnational_modeled_iso3s')
    df['is_loc_agg'] = 0
    df.loc[df['iso3'].isin(subnational_modeled_iso3s), 'is_loc_agg'] = 1

    # remove 0 sample size rows
    df = df.loc[df['sample_size'] > 0]

    return df
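
# Illustrative walk-through of the parsing above (values are hypothetical):
# a path_to_top_parent of "1,64,100,102,527" yields country_id 102 (index 3),
# an ihme_loc_id of "XXX_527" yields subnat_id 527, and because country_id
# differs from location_id the iso3 becomes "XXX_subnat".
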
Example #10
    def adjust_ukr(self, env_df, split_type):
        """Adjust sample size for Ukraine without Crimea and Sevastopol."""
        orig_id = self.split_ids[split_type][0]
        no_cs = self.split_ids[split_type][1]
        crimea = self.split_ids[split_type][2]
        sev = self.split_ids[split_type][3]

        env_df['prop_no_cs'] = env_df[no_cs] / env_df[orig_id]
        env_df['prop_crimea'] = env_df[crimea] / env_df[orig_id]
        env_df['prop_sev'] = env_df[sev] / env_df[orig_id]
        df = self.df.merge(env_df,
                           on=['age_group_id', 'year_id', 'sex_id'],
                           how='left')
        report_if_merge_fail(df, 'prop_no_cs',
                             ['age_group_id', 'year_id', 'sex_id'])

        # resulting dataset will have Ukraine without Crimea and Sevastopol,
        # Crimea, and Sevastopol location_ids
        no_cs_df = df.copy()
        no_cs_df[
            'sample_size'] = no_cs_df['prop_no_cs'] * no_cs_df['sample_size']
        no_cs_df['location_id'] = no_cs

        crimea_df = df.copy()
        crimea_df['sample_size'] = crimea_df['prop_crimea'] * crimea_df[
            'sample_size']
        crimea_df['location_id'] = crimea

        sev_df = df.copy()
        sev_df['sample_size'] = sev_df['prop_sev'] * sev_df['sample_size']
        sev_df['location_id'] = sev

        df = pd.concat([no_cs_df, crimea_df, sev_df], ignore_index=True)

        return df
Example #11
def merge_acause_and_collapse(df, cause_map):
    """Add acause column and collapse before appending split groups."""

    cause_map = cause_map[['cause_id', 'value']].copy()
    cause_map = cause_map.rename(columns={'value': 'cause'})

    df = df.merge(cause_map, how='left', on='cause')
    report_if_merge_fail(df, 'cause_id', 'cause')

    # there is some confusion during this cause set version between our
    # db server and the engine room as to which is the right cause set
    # version. Resolve that here.
    if CONF.get_id('cause_set_version') == 229:
        # replace any urinary/gyne with genitourinary
        print_log_message("Fixing urinary/gyne causes for csvid 229.")
        urinary_id = 594
        gyne_id = 603
        genitourinary_id = 982
        df.loc[df['cause_id'].isin([gyne_id, urinary_id]),
               'cause_id'] = genitourinary_id

    df = df.drop(['cause', 'split_group'], axis=1)
    df = df.groupby([col for col in df.columns if col != 'freq'],
                    as_index=False).sum()
    return df
def merge_with_detail_map(df, detail_map):
    """Merge incoming data with the detail map."""
    assert 'detail_level' not in df.columns
    detail_map = detail_map[["code_id", "detail_level_id"]]
    df = df.merge(detail_map, on=['code_id'], how='left')
    report_if_merge_fail(df, 'detail_level_id', 'code_id')
    return df
def calculate_cc_code(df, env_meta_df, code_map):

    df_cc = df.copy()

    # groupby everything except cause + code_id
    group_cols = [
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'nid',
        'extract_type_id', 'site_id'
    ]
    df_cc = df_cc.groupby(group_cols, as_index=False).deaths.sum()

    # merge on envelope
    df_cc = add_envelope(df_cc, env_df=env_meta_df)
    df_cc['value'] = 'cc_code'
    df_cc = add_code_metadata(df_cc, ['code_id'],
                              merge_col='value',
                              code_map=code_map)
    report_if_merge_fail(df_cc, ['code_id'], ['value'])
    df_cc['cause_id'] = 919
    df_cc['deaths'] = df_cc['mean_env'] - df_cc['deaths']
    assert df_cc.notnull().values.all()

    # append together
    df = pd.concat([df, df_cc], ignore_index=True)

    assert np.isclose(df['deaths'].sum(), df.mean_env.sum())
    df = df.drop(['mean_env', 'value'], axis=1)

    return df
def calculate_cc_code(df, env_meta_df, code_map):
    """Calculate total deaths denominator.

    Note: This step is usually done in formatting. Moving this calculation
    after age/sex splitting should return more accurate results for data that
    has a mix of known, detailed age groups and unknown ages.
    """
    df_cc = df.copy()

    # groupby everything except cause + code_id
    group_cols = [
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'nid',
        'extract_type_id', 'site_id'
    ]
    df_cc = df_cc.groupby(group_cols, as_index=False).deaths.sum()

    # merge on envelope
    df_cc = add_envelope(df_cc, env_df=env_meta_df)
    df_cc['value'] = 'cc_code'
    df_cc = add_code_metadata(df_cc, ['code_id'],
                              merge_col='value',
                              code_map=code_map)
    report_if_merge_fail(df_cc, ['code_id'], ['value'])
    df_cc['cause_id'] = 919
    df_cc['deaths'] = df_cc['mean_env'] - df_cc['deaths']
    assert df_cc.notnull().values.all()

    # append together
    df = pd.concat([df, df_cc], ignore_index=True)

    assert np.isclose(df['deaths'].sum(), df.mean_env.sum())
    df = df.drop(['mean_env', 'value'], axis=1)

    return df
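
# Worked toy numbers for the cc_code arithmetic above: if one demographic
# group has mean_env == 100 and 60 mapped deaths, the appended cc_code row
# carries 100 - 60 = 40 deaths, so total deaths in the group equal mean_env
# and the np.isclose check on the summed deaths holds.
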
Example #15
    def get_computed_dataframe(self, df):

        orig_cols = list(df.columns)
        print_log_message("Starting")

        print_log_message("Reading data")
        agg_cause_ids = list(df['cause_id'].unique())

        print_log_message("Adding garbage envelope")
        garbage_deaths = self.get_redistribution_envelope(df, agg_cause_ids)
        if len(garbage_deaths) > 0:
            gbg_merge_cols = [
                'location_id', 'year_id', 'age_group_id', 'sex_id', 'site_id',
                'cause_id'
            ]
            df = df.merge(garbage_deaths, how='left', on=gbg_merge_cols)
            report_if_merge_fail(df, 'garbage_targeting_cause', gbg_merge_cols)
            report_duplicates(df, gbg_merge_cols)
        else:
            df['garbage_targeting_cause'] = 0

        print_log_message("Adding residual variance")
        residual_variance = self.get_residual_variance()
        resid_merge_cols = ['cause_id', 'age_group_id', 'sex_id']
        df = df.merge(residual_variance, how='left', on=resid_merge_cols)

        df[RD_VAR_COL] = df[RD_VAR_COL]**2

        assert df[RD_VAR_COL].notnull().any()

        if self.has_misdc:

            print_log_message(
                "Getting dismod variance for midsiagnosis corrected causes")
            misdc_variance = self.get_misdiagnosiscorrection_variance()
            df_misdc = df[df['cause_id'].isin(MISDC_CAUSES)]
            df_misdc = df_misdc.merge(misdc_variance,
                                      on=MISDC_MERGE_COLS,
                                      how='left')

            df_misdc[RD_VAR_COL] = df_misdc[MISDC_VAR_COL]
            df_misdc = df_misdc.drop(MISDC_VAR_COL, axis=1)

            df = df.loc[~df['cause_id'].isin(MISDC_CAUSES)]
            df = pd.concat([df, df_misdc], ignore_index=True)

        df[RD_VAR_COL] = df[RD_VAR_COL].fillna(0)

        print_log_message("Measuring redistribution variance")
        df = df.apply(self.calculate_redistribution_variance_wrapper, axis=1)
        print_log_message("Done")

        self.diag_df = df.copy()

        draw_cols = ['draw_{}'.format(i) for i in range(0, N_DRAWS)]
        keep_cols = list(orig_cols) + list(draw_cols)
        df = df[keep_cols]
        return df
Example #16
def map_site_id(df, site_col='site', conn_def='ADDRESS', upload=True):
    """Map site_id to the data given site_col.

    Will upload sites to cod.site if necessary.
    """
    # set cache options
    force_cache_options = {
        'force_rerun': True,
        'block_rerun': False,
        'cache_dir': "standard",
        'cache_results': True,
        'verbose': True
    }

    # named site_name in db
    df = df.rename(columns={site_col: 'site_name'})
    df['site_name'] = df['site_name'].fillna("")

    # get site names in db
    unique_sites = df[['site_name']].drop_duplicates()
    db_sites = get_sites(**force_cache_options)
    # lowercase site_name on both sides for the merge, since the MySQL site
    # name is case-insensitive and that is how outliers are stored
    unique_sites['site_name_orig'] = unique_sites['site_name']
    unique_sites['site_name'] = unique_sites['site_name'].str.strip(
    ).str.lower()
    db_sites['site_name'] = db_sites['site_name'].str.strip().str.lower()

    # merge onto ones in df
    unique_sites = unique_sites.merge(db_sites, how='left')
    unique_sites['site_name'] = unique_sites['site_name_orig']
    unique_sites = unique_sites.drop('site_name_orig', axis=1)

    # find missings
    upload_sites = unique_sites[unique_sites['site_id'].isnull()]
    upload_sites = upload_sites[['site_name']].drop_duplicates()
    if len(upload_sites) > 0:
        print("No site_id for sites {}".format(
            upload_sites.site_name.unique()))
        if upload:
            print("Uploading new sites...")
            # if any, upload them
            insert_names('site', upload_sites, conn_def=conn_def)

            # refresh db_sites
            db_sites = get_sites(**force_cache_options)
            unique_sites = unique_sites.drop('site_id', axis=1)
            unique_sites = unique_sites.merge(db_sites, how='left')
            report_if_merge_fail(unique_sites, 'site_id', 'site_name')
        else:
            print("Not uploading new sites, allowing merge to fail...")

    df = df.merge(unique_sites, on='site_name', how='left')
    if upload:
        report_if_merge_fail(unique_sites, 'site_id', 'site_name')

    return df
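
# Hedged usage sketch: map sites without writing to the database, then
# inspect the rows whose site_id could not be resolved (the merge is allowed
# to fail when upload=False). The conn_def value is a placeholder.
#
#   df = map_site_id(df, site_col='site', conn_def='ADDRESS', upload=False)
#   unresolved = df.loc[df['site_id'].isnull(), 'site_name'].unique()
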
Example #17
    def assert_valid_mappings(self, df, code_system_id):
        """Test that the mapping worked.

        Runs a suite of assertions to make sure that mapping was successful.
        Args:
            df (DataFrame): with at least code_id and cause_id
        Returns:
            None
        Raises:
            AssertionError: Any condition fails
        """
        # add code value from cached code map
        print("Adding value")
        df = add_code_metadata(df, ['value'],
                               code_system_id,
                               force_rerun=False,
                               block_rerun=True,
                               cache_dir=self.cache_dir)
        report_if_merge_fail(df, 'value', 'code_id')
        # get acause from cached cause hierarchy
        print("Adding acause")
        df = add_cause_metadata(df, ['acause'],
                                cause_set_version_id=self.cause_set_version_id,
                                force_rerun=False,
                                block_rerun=True,
                                cache_dir=self.cache_dir)
        report_if_merge_fail(df, 'acause', 'cause_id')

        # Test that all causes starting with 'acause_' are mapped correctly.
        # acause_cvd, for example, should be mapped to 'cvd' (not 'cvd_ihd').
        # 'acause__gc_X59' should be mapped to '_gc', etc.
        print("Checking implied acauses")
        check_df = df.loc[df['value'].str.startswith('acause_')].copy()
        check_df['implied_acause'] = \
            check_df['value'].str.replace('acause_', '', 1)

        check_df.loc[check_df['value'].str.contains("acause__gc"),
                     'implied_acause'] = "_gc"
        bad_df = check_df.loc[check_df['acause'] != check_df['implied_acause']]
        if len(bad_df) > 0:
            bad_stuff = bad_df[['value', 'acause']].drop_duplicates()
            raise AssertionError(
                "These code values do not match their acause: "
                "\n{}".format(bad_stuff))

        print("Checking for bad values")
        # assert incorrect acauses are gone
        bad_acauses = [
            'acause_digest_gastrititis', 'acause_hiv_tb', 'acause_tb_drug'
        ]

        bad_df = df.loc[df['value'].isin(bad_acauses)].value.unique()
        if len(bad_df) > 0:
            raise AssertionError(
                "Found these bad code values in the data: {}".format(
                    bad_df))
Example #18
    def get_computed_dataframe(self, df):
        """Replace acauses with those in the bridge map."""
        # VA sources are the only ones where this may not work
        # might need to split dataframe by data_type_id for bridge map
        df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
        has_verbal_autopsy = self.VA in df['data_type_id'].unique()
        df.drop(columns='data_type_id', inplace=True)

        if self.needs_bridging(has_verbal_autopsy):
            file_name = self.get_file_name(has_verbal_autopsy)
            map_df = pd.read_csv(self.bridge_map_path / file_name)
            map_df = map_df[['acause', 'bridge_code']]

            # add acause column to deaths data
            bridge_mapped = add_cause_metadata(
                df,
                ['acause'],
                merge_col='cause_id',
                cause_meta_df=self.cause_meta_df
            )
            # hack, this cause_id snuck in somehow...
            bridge_mapped.loc[
                bridge_mapped['cause_id'] == 606, 'acause'
            ] = 'gyne_femaleinfert'
            report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
            bridge_mapped.drop(['cause_id'], axis=1, inplace=True)

            # perform zz bridge code redistribution before other bridge mapping
            bridge_mapped = self.redistribute_zz_bridge_codes(bridge_mapped, map_df)

            bridge_mapped = bridge_mapped.merge(
                map_df, how='left', on='acause'
            )
            bridge_mapped = self.acause_to_bridge_code(bridge_mapped)
            # bring cause_id back
            bridge_mapped = add_cause_metadata(
                bridge_mapped,
                ['cause_id'],
                merge_col='acause',
                cause_meta_df=self.cause_meta_df
            )

            # hack, this cause_id snuck in
            bridge_mapped.loc[
                bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
            ] = 606
            report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')
            # output diagnostic dataframe
            self.diag_df = bridge_mapped
            # drop unnecessary columns
            bridge_mapped = self.clean_up(bridge_mapped)
            return bridge_mapped
        else:
            self.diag_df = df
            df = self.clean_up(df)
            return df
def extract_positive_excess(df, id_cols, cause_to_targets_map):
    """Extract positive excess and assign target causes."""
    df = df[id_cols + ['positive_excess']].copy()
    df['target_cause_id'] = df['cause_id'].map(cause_to_targets_map)
    df.loc[df['cause_id'] == 606, 'target_cause_id'] = 294
    # every cause should have a target, even if it is not corrected
    report_if_merge_fail(df, 'target_cause_id', 'cause_id')
    df['cause_id'] = df['target_cause_id']
    df = df.groupby(id_cols, as_index=False)['positive_excess'].sum()
    return df
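
# Illustrative (hypothetical) inputs for extract_positive_excess: the map
# sends each corrected cause to its target cause_id, and cause 606 is
# special-cased above to target 294 regardless of the map.
#
#   cause_to_targets_map = {493: 294, 509: 543}
#   df = extract_positive_excess(df, id_cols, cause_to_targets_map)
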
Example #20
    def add_in_out_hospital_proportions(self, df, props_df):

        df = df.merge(props_df, how='left')

        df.loc[df['hospdead'] == self.missing_hospdead_id,
               self.in_out_hosp_prop_name] = 1

        report_if_merge_fail(
            df, self.in_out_hosp_prop_name,
            ['location_id', 'age_group_id', 'sex_id', 'strata', 'hospdead'])
        return df
Example #21
    def add_geo_group_col(self, df):

        self.geo_groups = df[self.geo_cols].drop_duplicates().reset_index()
        self.geo_groups = self.geo_groups.rename(
            columns={'index': 'geo_group'})
        # ensure that index is unique
        assert len(set(self.geo_groups.index)) == len(self.geo_groups)
        df = df.merge(self.geo_groups, on=self.geo_cols, how='left')
        report_if_merge_fail(df, 'geo_group', self.geo_cols)
        df.drop(self.geo_cols, axis=1, inplace=True)
        return df
def add_rd_locations(df, lsvid):
    """Merge on location hierarchy specific to redistribution."""
    lhh = get_current_location_hierarchy(location_set_version_id=lsvid,
                                         force_rerun=False,
                                         block_rerun=True,
                                         cache_dir=CACHE_DIR)
    rd_lhh = get_redistribution_locations(lhh)
    df = pd.merge(df, rd_lhh, on='location_id', how='left')
    report_if_merge_fail(df, 'global', 'location_id')
    report_if_merge_fail(df, 'dev_status', 'location_id')

    return df
def add_reg_location_metadata(df, location_hierarchy):

    df = add_location_metadata(df, ['region_id', 'super_region_id'], location_meta_df=location_hierarchy)
    report_if_merge_fail(df, 'region_id', 'location_id')
    df['region_id'] = df['region_id'].astype(int)
    report_if_merge_fail(df, 'super_region_id', 'location_id')
    df['super_region_id'] = df['super_region_id'].astype(int)
    df = df.rename(columns={
        'super_region_id': 'super_region',
        'region_id': 'region',
        'location_id': 'country'
    })
    df['global'] = 1
    return df
    def make_codem_codviz_metrics(df, pop_df):
        """Use draws to calculate inputs for CODEm and CoDViz."""
        add_cols = [LOWER_RD_COL, UPPER_RD_COL, LOGIT_CF_VAR_COL,
                    LOG_DEATHRATE_VAR_COL]
        for col in add_cols:
            df[col] = np.nan

        if N_DRAWS > 0:
            cf_draw_cols = RedistributionVarianceEstimator.cf_draw_cols

            df = add_population(df, pop_df=pop_df)
            report_if_merge_fail(
                df.query('age_group_id != 27'), 'population',
                ['age_group_id', 'location_id', 'year_id', 'sex_id']
            )

            # get variance for CODEm
            df = df.apply(
                RedistributionVarianceEstimator.calculate_codem_variances,
                cf_draw_cols=cf_draw_cols, axis=1
            )

            # get the upper and lower bounds for CoDViz
            df = df.apply(
                RedistributionVarianceEstimator.calculate_codviz_bounds, axis=1
            )

            # drop draw/diagnostic/pop columns
            df = df.drop(cf_draw_cols + ['population'], axis=1)

        else:
            df[LOWER_RD_COL], df[UPPER_RD_COL] = df['cf_final'], df['cf_final']
            df[LOGIT_CF_VAR_COL], df[LOG_DEATHRATE_VAR_COL] = 0, 0

        # make sure there aren't any null values in the added columns
        check_no_nulls = [
            LOWER_RD_COL, UPPER_RD_COL, MEAN_RD_COL, LOGIT_CF_VAR_COL,
            LOG_DEATHRATE_VAR_COL
        ]
        null_vals = df.loc[
            df[check_no_nulls].isnull().any(axis=1),
            MISDC_MERGE_COLS + check_no_nulls
        ]
        if len(null_vals) > 0:
            raise AssertionError(
                'there are null values in redistribution uncertainty '
                'columns: \n{}'.format(null_vals)
            )

        return df
Example #25
    def merge_nonzero_mad_info(self, cmdf):
        """Read in the floor input and merge onto main dataframe."""
        # load nonzero floor values
        nonzero_mad = self.compile_nonzero_floor(cmdf)
        nonzero_mad = self.format_nzf(nonzero_mad, cmdf)
        # checks that all age_group/cancer/year/sex combinations exist
        self._check_all_floors_exist(nonzero_mad)
        nonzero_mad_cols = self.merge_cols + ['floor']
        nonzero_mad = nonzero_mad[nonzero_mad_cols]
        self.min_possible_val = nonzero_mad['floor'].min()
        self.df = self.df.merge(nonzero_mad, how='left', on=self.merge_cols)
        # ensure no floor values are missing
        report_if_merge_fail(self.df, 'floor', self.merge_cols)
        assert not self.df.floor.isnull().any(), "null floor values exist"
def prune_cancer_registry_data(df, location_meta_df):

    ukraine_nid_extract = (df['nid'] == 284465) & (df['extract_type_id'] == 53)
    assert (df[ukraine_nid_extract]['location_id'] == 63).all(), \
        "Now ukraine data has more than just ukraine national, and code " \
        "should be changed"
    df.loc[ukraine_nid_extract, 'location_id'] = 50559

    df = add_location_metadata(df, ['most_detailed'],
                               location_meta_df=location_meta_df)
    report_if_merge_fail(df, 'most_detailed', 'location_id')

    df = df.query('most_detailed == 1')
    df = df.drop('most_detailed', axis=1)
    return df
Example #27
    def add_in_out_hospital_proportions(self, df, props_df):

        df = df.merge(props_df, how='left')
        # in years where data is not disaggregated by in/out of hospital, no
        # hospital weighting will be done
        df.loc[
            df['hospdead'] == self.missing_hospdead_id,
            self.in_out_hosp_prop_name] = 1

        report_if_merge_fail(
            df,
            self.in_out_hosp_prop_name,
            ['location_id', 'age_group_id', 'sex_id', 'strata', 'hospdead']
        )
        return df
Example #28
    def get_computed_dataframe(self, df):
        """Replace acauses with those in the bridge map."""
        df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
        has_verbal_autopsy = self.VA in df['data_type_id'].unique()

        if self.needs_bridging(has_verbal_autopsy):
            sheet_name = self.get_sheet_name(has_verbal_autopsy)
            map_df = pd.read_excel(self.bridge_map_path, sheet_name=sheet_name)
            map_df = map_df[['acause', 'bridge_code']]

            # add acause column to deaths data
            bridge_mapped = add_cause_metadata(
                df,
                ['acause'],
                merge_col='cause_id',
                cause_meta_df=self.cause_meta_df
            )
            # hack, this cause_id snuck in somehow...
            bridge_mapped.loc[
                bridge_mapped['cause_id'] == 606, 'acause'
            ] = 'gyne_femaleinfert'
            report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
            bridge_mapped.drop(['cause_id'], axis=1, inplace=True)
            bridge_mapped = bridge_mapped.merge(
                map_df, how='left', on='acause'
            )
            bridge_mapped = self.acause_to_bridge_code(bridge_mapped)
            # bring cause_id back
            bridge_mapped = add_cause_metadata(
                bridge_mapped,
                ['cause_id'],
                merge_col='acause',
                cause_meta_df=self.cause_meta_df
            )

            bridge_mapped.loc[
                bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
            ] = 606
            report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')
            # output diagnostic dataframe
            self.diag_df = bridge_mapped
            # drop unnecessary columns
            bridge_mapped = self.clean_up(bridge_mapped)
            return bridge_mapped
        else:
            self.diag_df = df
            df = self.clean_up(df)
            return df
Example #29
def get_age_group_ids(df):
    # clean the age column
    df['who_age'] = df['who_age'].str.replace('Deaths', '')
    df['who_age'] = df['who_age'].astype(int)

    # can drop Deaths1 and Deaths2, they're just subtotals
    df = df.loc[~(df.who_age.isin([1, 2]))]
    start_len = len(df)

    # load codebooks that are used to map age group id to who ages
    adult_cb = get_adult_age_codebook()
    infant_cb = get_infant_age_codebook()
    adult_cb.rename(columns={'frmat': 'Frmat', 'cod_age': 'who_age'},
                    inplace=True)
    infant_cb.rename(columns={'im_frmat': 'IM_Frmat', 'cod_age': 'who_age'},
                     inplace=True)
    adult_cb = adult_cb[['Frmat', 'who_age', 'age_group_id']]
    infant_cb = infant_cb[['IM_Frmat', 'who_age', 'age_group_id']]

    # subset df to infants and adults
    infant_df = df.loc[df.who_age.isin([91, 92, 93, 94])]
    adult_df = df.loc[df.who_age.isin(range(3, 27))]

    # merge age group_ids on with respective codebooks
    infant_df = infant_df.merge(infant_cb,
                                on=['IM_Frmat', 'who_age'],
                                how='left')
    adult_df = adult_df.merge(adult_cb, on=['Frmat', 'who_age'], how='left')

    # handle the unknown ages
    adult_df.loc[(adult_df.Frmat == 9) & (adult_df.who_age == 26),
                 'age_group_id'] = 283

    df = pd.concat([infant_df, adult_df], ignore_index=True)
    # make sure we didn't add/drop any rows in this process
    assert len(df) == start_len, "You added/dropped rows in age mapping"
    # first we need to make adjustments to rows with zero deaths
    df = fix_rows_with_zero_deaths(df)

    report_if_merge_fail(df, 'age_group_id', ['Frmat', 'IM_Frmat', 'who_age'])
    assert df.age_group_id.notnull().all()
    return df
Example #30
    def get_computed_dataframe(self, df):
        """Return mapped dataframe."""
        # list of all cause columns
        raw_cause_cols = MCoDMapper.get_code_columns(df)
        df = MCoDMapper.fix_icd_codes(df, raw_cause_cols, self.code_system_id)

        print_log_message("Mapping underlying cause/primary diagnosis")
        cause_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                                  **self.cache_options)
        code_map = MCoDMapper.prep_cause_map(cause_map)
        df['cause_mapped'] = df['cause'].map(code_map)

        print_log_message(
            "Trimming ICD codes and remapping underlying cause/primary diagnosis"
        )
        df = MCoDMapper.trim_and_remap(df, {'cause': 'cause_mapped'}, code_map,
                                       self.code_system_id)
        report_if_merge_fail(df, 'cause_mapped', 'cause')

        # merge on the cause_id for the underlying cause
        df = df.rename(columns={'cause_mapped': 'code_id'})
        df['code_id'] = df['code_id'].astype(int)
        df = add_code_metadata(df,
                               'cause_id',
                               code_map_version_id=self.code_map_version_id,
                               **self.cache_options)
        report_if_merge_fail(df, 'cause_id', 'code_id')

        print_log_message("Mapping chain causes")
        # get the special intermediate cause map
        int_cause_map = self.prep_int_cause_map()
        df = MCoDMapper.map_cause_codes(df, int_cause_map, self.int_cause)

        print_log_message("Trimming ICD codes and remapping chain causes")
        int_cause_cols = [x for x in df.columns if self.int_cause in x]
        int_cause_col_dict = MCoDMapper.prep_raw_mapped_cause_dictionary(
            raw_cause_cols, int_cause_cols)
        df = MCoDMapper.trim_and_remap(df, int_cause_col_dict, int_cause_map,
                                       self.code_system_id)

        print_log_message(
            "Identifying rows with intermediate cause of interest")
        df = self.capture_int_cause(df, int_cause_cols)
        if not self.drop_p2:
            df = self.set_part2_flag(df)

        return df