Example #1
    def aggregate_locations(self):
        """Aggregate sub national location_ids up to the country level."""
        df = add_location_metadata(
            self.df, ['parent_id', 'level'],
            merge_col='location_id',
            location_meta_df=self.location_meta_df)
        # don't need larger than country level aggregation
        loc_levels = [x for x in df['level'].unique() if x > 3]
        loc_levels.sort()
        loc_levels.reverse()
        for level in loc_levels:

            level_df = df[df['level'] == level].copy()
            if len(level_df) > 0:
                # replace the location with the parent_id
                level_df['location_id'] = level_df['parent_id']
                level_df['level'] = level - 1
                level_df.drop('parent_id', axis=1, inplace=True)

                level_df = add_location_metadata(
                    level_df, ['parent_id'],
                    merge_col=['location_id'],
                    location_meta_df=self.location_meta_df)
                # add in deaths by each level
                df = df.append(level_df, ignore_index=True)
        return df
Example #2
    def get_completeness(self):
        """Retrieve VR completeness.

        VR completeness is not calculated here so that the script that
        calculates it can be shared with the demographics team.
        """
        df = pd.read_csv(self.completeness)
        df = add_location_metadata(df,
                                   'ihme_loc_id',
                                   location_meta_df=self.location_meta_df)

        # the completeness csv has duplicates for some location/years within
        # mainland China for "sample" and "envelope" denominators
        # we want to use "sample" aka DDM for all of mainland China
        is_mainland_china = (df['ihme_loc_id'].str.startswith(
            "CHN")) & ~(df['ihme_loc_id'].isin(["CHN_354", "CHN_361"]))
        is_sample = df['denominator'] == "sample"
        df = df.loc[(is_mainland_china & is_sample) |
                    (~is_mainland_china & ~is_sample)]

        assert not df[['ihme_loc_id', 'year_id']].duplicated().any()

        # set to 1 if it's over 1
        df['comp'] = df['comp'].apply(lambda x: 1 if x > 1 else x)

        df = df[['location_id', 'year_id', 'comp']]

        return df
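A minimal sketch of the mainland-China denominator filter above, using toy values and a hypothetical province code; only the filter logic mirrors get_completeness.

import pandas as pd

toy = pd.DataFrame({
    'ihme_loc_id': ['CHN_491', 'CHN_491', 'CHN_354', 'USA'],
    'year_id': [2010, 2010, 2010, 2010],
    'denominator': ['sample', 'envelope', 'envelope', 'envelope'],
    'comp': [0.95, 0.99, 1.02, 0.98],
})
is_mainland_china = (toy['ihme_loc_id'].str.startswith("CHN")) & \
    ~(toy['ihme_loc_id'].isin(["CHN_354", "CHN_361"]))
is_sample = toy['denominator'] == "sample"
# keep "sample" rows for mainland China, "envelope" rows everywhere else
toy = toy.loc[(is_mainland_china & is_sample) |
              (~is_mainland_china & ~is_sample)]
assert not toy[['ihme_loc_id', 'year_id']].duplicated().any()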
Example #3
def check_subnational_locations(df):

    subnational_iso3s = CONF.get_id('subnational_modeled_iso3s')

    if 'UKR' in subnational_iso3s:
        subnational_iso3s.remove('UKR')
    df = add_location_metadata(
        df,
        'ihme_loc_id',
        location_set_version_id=CONF.get_id('location_set_version'))
    nid_etid_pairs = list(df[['nid', 'extract_type_id'
                              ]].drop_duplicates().to_records(index=False))
    nid_etid_pairs = [tuple(pair) for pair in nid_etid_pairs]

    for x in nid_etid_pairs:
        nid = x[0]
        etid = x[1]
        check_loc = list(
            df.loc[(df['nid'] == nid) & (df['extract_type_id'] == etid),
                   'ihme_loc_id'].unique())[0]
        try:
            assert check_loc not in subnational_iso3s
        except AssertionError:
            print("Found {} where we model subnationally "
                  "for NID {} extract_type_id {}. Setting is_active"
                  " to False".format(check_loc, nid, etid))
            df.loc[(df['nid'] == nid) & (df['extract_type_id'] == etid),
                   'is_active'] = 0
    return df
Example #4
    def prep_stars(self, df):
        """Calculate stars related indicators."""
        print_log_message("Calculating stars")
        stars_df = self.calculate_stars(df)

        # formatting pieces, conforming to previous versions
        stars_df = add_location_metadata(
            stars_df, [
                "ihme_loc_id", "level", "location_ascii_name", "sort_order",
                "parent_id"
            ],
            location_meta_df=self.location_meta_df)
        stars_df = stars_df.rename(
            columns={
                "sort_order": "location_sort_order",
                "level": "location_level",
                "location_ascii_name": "location_name"
            })

        print_log_message("Calculating global stars, because why not")
        world_stars = self.measure_the_world(stars_df)
        stars_df = pd.concat([stars_df, world_stars], ignore_index=True)

        # add parent ihme_loc_id instead of parent_id
        ihme_loc_id_dict = self.location_meta_df.set_index(
            "location_id")["ihme_loc_id"].to_dict()
        stars_df["parent_ihme_loc_id"] = \
            stars_df["parent_id"].map(ihme_loc_id_dict)
        stars_df.drop("parent_id", axis=1, inplace=True)

        assert stars_df.notnull().values.any()

        return stars_df
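A small illustration of the parent_id to parent ihme_loc_id mapping used above; the location ids and codes are toy values, not pulled from the real hierarchy.

import pandas as pd

# toy location hierarchy: a country and one of its subnationals
toy_meta = pd.DataFrame({
    'location_id': [102, 523],
    'ihme_loc_id': ['USA', 'USA_523'],
})
toy_stars = pd.DataFrame({'location_id': [523], 'parent_id': [102]})
ihme_loc_id_dict = toy_meta.set_index('location_id')['ihme_loc_id'].to_dict()
toy_stars['parent_ihme_loc_id'] = toy_stars['parent_id'].map(ihme_loc_id_dict)
# parent_ihme_loc_id is now 'USA' for the subnational row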
Example #5
def adjust_representative_id(df):
    """
    Change representative_id to 1 for non-VR.
    """
    if 'ihme_loc_id' not in df.columns:
        df = add_location_metadata(
            df,
            'ihme_loc_id',
            location_set_version_id=CONF.get_id('location_set_version'))
    df['iso3'] = df['ihme_loc_id'].str[0:3]
    override_locs = ['PAK', 'NGA', 'PHL']
    vr_data_types = [9, 10]
    # TO DO
    # this function should be flexible enough to:
    # 1) check if there are VR data for this specific location/year
    # 2) change representative id to 0 for non-VR if there is VR
    # 3) change representative id to 1 ONLY if there is no VR present
    # 4) adding a print statement/documentation of loc/years changed would be nice
    # today = datetime.date.today()
    # decomp_step_2_end = datetime.date(2019, 5, 20)
    # assert decomp_step_2_end > today, "Fix this method before we add new data!"
    df.loc[(df['iso3'].isin(override_locs)) &
           (df['ihme_loc_id'].str.len() > 4) &
           (~df['data_type_id'].isin(vr_data_types)), 'representative_id'] = 1

    return df
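A hedged sketch of the TODO above, not the current behavior: flip representative_id for non-VR rows only where no VR exists for the same location/year, and report the change. The helper name and exact rule are assumptions.

def _adjust_rep_id_if_no_vr(df, vr_data_types=(9, 10)):
    # Hypothetical sketch of the TODO above, not current behavior.
    is_vr = df['data_type_id'].isin(vr_data_types)
    vr_loc_years = df.loc[is_vr, ['location_id', 'year_id']].drop_duplicates()
    merged = df.merge(vr_loc_years, on=['location_id', 'year_id'],
                      how='left', indicator=True)
    no_vr_for_loc_year = (merged['_merge'] == 'left_only').values
    to_change = ~is_vr & no_vr_for_loc_year
    changed = df.loc[to_change, ['location_id', 'year_id']].drop_duplicates()
    if len(changed) > 0:
        print("Setting representative_id to 1 for:\n{}".format(changed))
    df.loc[to_change, 'representative_id'] = 1
    return df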
Example #6
def format_for_nr(df, location_hierarchy):
    """Merge on needed location metadata."""
    locs = df[['location_id']].drop_duplicates()
    locs = add_location_metadata(
        locs,
        add_cols=["ihme_loc_id", "path_to_top_parent"],
        merge_col="location_id",
        location_meta_df=location_hierarchy
    )
    report_if_merge_fail(locs, 'path_to_top_parent', 'location_id')
    locs['country_id'] = locs['path_to_top_parent'].str.split(",").apply(
        lambda x: int(x[3]))
    locs['subnat_id'] = locs['ihme_loc_id'].apply(
        lambda x: int(x.split("_")[1]) if "_" in x else 0)
    locs['iso3'] = locs['ihme_loc_id'].str.slice(0, 3)
    different_locations = locs['country_id'] != locs['location_id']

    locs.loc[different_locations, 'iso3'] = \
        locs['iso3'].apply(lambda x: x + "_subnat")
    locs = locs[['location_id', 'country_id', 'subnat_id', 'iso3']]
    df = df.merge(locs, on='location_id', how='left')
    report_if_merge_fail(df, 'country_id', 'location_id')

    # create indicator column for running a separate nr model
    # for national and subnational locations
    subnational_modeled_iso3s = CONF.get_id('subnational_modeled_iso3s')
    df['is_loc_agg'] = 0
    df.loc[df['iso3'].isin(subnational_modeled_iso3s), 'is_loc_agg'] = 1

    # remove 0 sample size rows
    df = df.loc[df['sample_size'] > 0]

    return df
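For reference, a worked illustration of the path_to_top_parent and ihme_loc_id parsing in format_for_nr above; the path and ids are illustrative.

path_to_top_parent = "1,64,100,102,523"   # global, super region, region, country, subnational
location_id = 523
country_id = int(path_to_top_parent.split(",")[3])                        # -> 102
ihme_loc_id = "USA_523"
subnat_id = int(ihme_loc_id.split("_")[1]) if "_" in ihme_loc_id else 0   # -> 523
iso3 = ihme_loc_id[0:3]                                                   # -> 'USA'
if country_id != location_id:
    iso3 = iso3 + "_subnat"                                               # -> 'USA_subnat'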
Example #7
def calculate_positive_excess(df, value_cols, pop_col, loc_meta_df):
    """Calculate the positive excess for flagged observations."""
    # calculate the values after positive excess is removed
    df = add_location_metadata(df,
                               'ihme_loc_id',
                               location_meta_df=loc_meta_df,
                               cache_dir=CONF.get_directory('db_cache'))
    df['iso3'] = df['ihme_loc_id'].apply(lambda x: x[:3])
    df = df.drop('ihme_loc_id', axis=1)
    df.loc[
        df['flagged'] == 1,
        'deaths_post'] = df['baserate_deaths'] * df[pop_col] * df['rrateWLD']
    # calculate the positive excess
    df.loc[df['flagged'] == 1,
           'positive_excess'] = df['deaths'] - df['deaths_post']
    # where positive excess is negative, or flagged is 0, val_post = val
    no_pe = (df['positive_excess'] < 0) | (df['positive_excess'].isnull())
    df.loc[no_pe, 'deaths_post'] = df['deaths']
    df.loc[no_pe, 'positive_excess'] = 0
    df = apply_ussr_tb_proportions(df)
    # Leave tuberculosis over age 60
    # Currently 297 is `tb` and 934 is `tb_other`
    df.loc[(df['cause_id'].isin([297, 934])) & (df['age_group_id'] > 17),
           'positive_excess'] = 0
    df.loc[(df['cause_id'].isin([297, 934])) & (df['age_group_id'] > 17),
           'deaths_post'] = df['deaths']
    return df
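A worked example of the positive-excess calculation above, with illustrative numbers.

deaths = 100.0            # observed deaths for a flagged observation
baserate_deaths = 0.002   # base death rate per person
population = 20000.0      # the pop_col value
rrateWLD = 1.5            # world relative rate
deaths_post = baserate_deaths * population * rrateWLD   # 60.0 expected deaths
positive_excess = deaths - deaths_post                   # 40.0 deaths treated as excess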
Example #8
    def get_env_by_time_window(self):
        """Get an envelope by time window."""
        # keep 1980 onwards
        env = self.env_meta_df.query('year_id >= 1980')

        # add time window
        env['time_window'] = env['year_id'].apply(
            lambda x: self.assign_time_group(x))

        env = env.groupby(['location_id', 'time_window'],
                          as_index=False)['mean_env'].sum()

        # restrict to level 3
        env = add_location_metadata(env,
                                    "level",
                                    location_meta_df=self.location_meta_df)
        env = env.query('level == 3')

        # keep the right columns
        env = env[['location_id', 'time_window', 'mean_env']]

        # append 1980 onwards envelope
        env_all = env.groupby(['location_id'],
                              as_index=False)['mean_env'].sum()
        env_all['time_window'] = self.full_time_series
        env = pd.concat([env_all, env], ignore_index=True)

        return env
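A hypothetical sketch of what the assign_time_group helper used above might look like; the actual window cutoffs behind self.assign_time_group are not shown in this snippet.

def assign_time_group(year_id, windows=((1980, 1989), (1990, 1999),
                                        (2000, 2009), (2010, 2019))):
    # hypothetical cutoffs; the real self.assign_time_group may differ
    for start, end in windows:
        if start <= year_id <= end:
            return "{}_{}".format(start, end)
    raise ValueError("year_id {} is not in any time window".format(year_id))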
Example #9
 def aggregate_locations(self):
     """Aggregate sub national location_ids up to the country level, and create
     any intermediate aggregates in between
     """
     df = add_location_metadata(self.df, ['parent_id', 'level'],
                                merge_col='location_id',
                                location_meta_df=self.location_meta_df)
     # get the max location level in the data
     max_level = df['level'].max()
     # want to aggregate all levels between national and max level
     # if the entire dataframe is already national, no need to aggregate
     if max_level > 3:
          loc_levels = list(range(4, max_level + 1))
         loc_levels.reverse()
         for level in loc_levels:
             # basic strategy: start with largest level and aggregate to parent location
             # repeat process with next highest level until reaching national
              level_df = df.loc[df['level'] == level].copy()
             if len(level_df) > 0:
                 # replace the location with the parent_id
                 level_df['location_id'] = level_df['parent_id']
                  level_df['level'] = level - 1
                 # need to drop and re-add parent_id for next level of aggregation
                 level_df.drop('parent_id', axis=1, inplace=True)
                 level_df = add_location_metadata(
                     level_df, ['parent_id'],
                     merge_col=['location_id'],
                     location_meta_df=self.location_meta_df)
                 # collapse the level_df down, for aggregates, want to set site_id to 2
                 group_cols = [
                     col for col in level_df.columns
                     if col not in self.val_cols
                 ]
                 group_cols.remove('site_id')
                 level_df = level_df.groupby(
                     group_cols, as_index=False)[self.val_cols].sum()
                 level_df['site_id'] = 2
                 # append aggregated deaths to original df
                 df = df.append(level_df, ignore_index=True)
         df.drop(['parent_id', 'level'], axis=1, inplace=True)
     else:
         print("No locations more detailed than national to aggregate")
     return df
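A toy demonstration of a single aggregation pass in aggregate_locations above: level-4 rows are reassigned to their parent and collapsed. The location ids and deaths are illustrative.

import pandas as pd

toy = pd.DataFrame({
    'location_id': [1001, 1002],   # two illustrative subnationals
    'parent_id': [100, 100],       # sharing the same (country) parent
    'level': [4, 4],
    'deaths': [10.0, 15.0],
})
level_df = toy.loc[toy['level'] == 4].copy()
level_df['location_id'] = level_df['parent_id']
level_df['level'] = 4 - 1
level_df = level_df.drop('parent_id', axis=1)
level_df = level_df.groupby(['location_id', 'level'],
                            as_index=False)['deaths'].sum()
# one country-level row: location_id 100, level 3, 25.0 deaths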
Example #10
def add_reg_location_metadata(df, location_hierarchy):

    df = add_location_metadata(df, ['region_id', 'super_region_id'],
                               location_meta_df=location_hierarchy)
    report_if_merge_fail(df, 'region_id', 'location_id')
    df['region_id'] = df['region_id'].astype(int)
    report_if_merge_fail(df, 'super_region_id', 'location_id')
    df['super_region_id'] = df['super_region_id'].astype(int)
    df = df.rename(columns={
        'super_region_id': 'super_region',
        'region_id': 'region',
        'location_id': 'country'
    })
    df['global'] = 1
    return df
Example #11
    def prep_vr_indicators(self, df):
        """Prep outputs for VR by location/year.

        This "view" is used for data drops. ALso passed to SDGs.
        """
        is_vr = df['data_type_id'].isin([9, 10])
        keep_cols = [
            'location_id', 'year_id', 'data_type_id', 'nid', 'source',
            'nationally_representative', 'pctgarbage', 'pctagr', 'comp'
        ]
        vr_df = df.loc[is_vr, keep_cols]
        assert not vr_df[['location_id', 'year_id', 'data_type_id'
                          ]].duplicated().any()
        vr_df = add_location_metadata(vr_df,
                                      'ihme_loc_id',
                                      location_meta_df=self.location_meta_df)

        # added this column for the SDGs team
        vr_df['pct_well_cert'] = vr_df['comp'] * (1 - vr_df['pctgarbage'] -
                                                  vr_df['pctagr'])

        # set variable logic for clarity
        sub_rep = vr_df['nationally_representative'] == 0
        nigeria = vr_df['source'] == "Nigeria_VR"
        mccd = vr_df['source'].str.startswith("India_MCCD")
        garbage_over_50 = vr_df['pctgarbage'] > .5
        comp_under_50 = vr_df['comp'] < .5
        comp_over_20 = vr_df['comp'] > .2
        comp_under_70 = (vr_df['comp'] < .7)

        # set drop_status variable based on completeness and garbage
        vr_df['drop_status'] = 0
        vr_df.loc[garbage_over_50 | comp_under_50, 'drop_status'] = 1
        # USERNAME made the decision to stop making an exception for this source (12/19/2018)
        # vr_df.loc[nigeria, 'drop_status'] = 0
        vr_df.loc[mccd & comp_over_20 & ~garbage_over_50, 'drop_status'] = 0
        vr_df.loc[sub_rep & ~garbage_over_50, 'drop_status'] = 0
        # This source has been manually dropped since GBD 2019 decomp step 1 - now
        # adding this drop to the code (7/23/2019)
        vr_df.loc[nigeria, 'drop_status'] = 1

        vr_df['comp_under_70'] = 0
        vr_df.loc[comp_under_70, 'comp_under_70'] = 1

        return vr_df
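A worked example of the pct_well_cert calculation and the drop rule above, with illustrative values.

comp = 0.9        # VR completeness
pctgarbage = 0.2  # proportion garbage coded
pctagr = 0.1      # proportion in the pctagr category
pct_well_cert = comp * (1 - pctgarbage - pctagr)   # 0.9 * 0.7 = 0.63
# drop_status stays 0 here: pctgarbage is not > .5 and comp is not < .5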
Example #12
def make_india_state_aggregates(pafs, locations):
    """
    Make state-level aggregates for India. These will be appended
    in addition to the national aggregate.
    """
    # restrict pafs data to just india urban/rural subnats
    detail_india = locations.loc[
        (locations.ihme_loc_id.str.startswith('IND'))
        & (locations.level == 5)]['location_id'].unique().tolist()
    pafs = pafs.loc[pafs.location_id.isin(detail_india)]

    # add the parent_id and make it the location id
    pafs = add_location_metadata(pafs, 'parent_id', location_meta_df=locations)
    pafs['location_id'] = pafs['parent_id']
    pafs.drop('parent_id', axis=1, inplace=True)

    # collapse the data
    draw_cols = [col for col in pafs.columns if 'draw' in col]
    state_aggs = pafs.groupby(['age_group_id', 'location_id',
                               'year'])[draw_cols].mean().reset_index()
    return state_aggs
Example #13
    def prep_star_level_location_hierarchy(self, df):
        """
        Output a file used by central comp to populate the star-based location
        hierarchy. This file contains only the most detailed locations. Each
        location has a parent_id (either 1 or 0) indicating data rich or data sparse.
        """
        # add information for most detailed
        df = add_location_metadata(df, 'most_detailed')
        full_time = (df['time_window'] == self.full_time_series)
        most_detailed = (df['most_detailed'] == 1)

        # limit to most detailed locs and the full time series
        df = df.loc[full_time & most_detailed]
        # add the parent_ids
        df.loc[df.stars.isin([4, 5]), 'parent_id'] = 1
        df.loc[df.stars.isin([0, 1, 2, 3]), 'parent_id'] = 0
        assert df.parent_id.notnull().all()

        df = df[[
            'ihme_loc_id', 'location_id', 'parent_id', 'time_window', 'stars'
        ]]
        return df
Example #14
 def decomp_one_remap_location_id(self, loc_ids):
     """For decomp step one, there is no dismod data for new locations
     or new subnationals. For new subnationals, we will use dismod data for
     the subnational's parent. For new locations we will use dismod data
     from a neighboring region country. This function/step should be removed
     in step two (12/17/2018)"""
     df = pd.DataFrame({'location_id': loc_ids})
     df = add_location_metadata(
         df,
         'parent_id',
         location_set_version_id=self.conf.get_id('location_set_version'),
         **self.block_rerun)
     report_if_merge_fail(df, 'parent_id', 'location_id')
     # make subnational location the parent
     df.loc[df.parent_id.isin([16, 51, 86, 165, 214]),
            'location_id'] = df['parent_id']
      # remap Saint Kitts to Dominican Republic
     df.loc[df.location_id == 393, 'location_id'] = 111
     # remap Monaco and San Marino to Italy
     df.loc[df.location_id.isin([367, 396]), 'location_id'] = 86
     # remap Palau and Cook Islands to Papua New Guinea
     df.loc[df.location_id.isin([320, 380]), 'location_id'] = 26
     return df.location_id.unique().tolist()
Example #15
def check_subnational_locations(df):
    """Check that there are no national observations where we model subnationally.

    Use the list of subnationally modeled countries to ensure that there are no
    national observations. Offending data are flagged and set to inactive; the user
    should either re-extract subnational information from the original data
    or accept that it will remain inactive.
    """
    subnational_iso3s = CONF.get_id('subnational_modeled_iso3s')
    # exceptions for Ukraine which is location split in aggregation
    # remove Ukraine-- this is a special case where we're splitting
    # the national Ukraine data to Ukraine without crimea/sevastopol
    # later in the prep process (see count_adjustments.py)
    if "UKR" in subnational_iso3s:
        subnational_iso3s.remove('UKR')
    df = add_location_metadata(
        df,
        'ihme_loc_id',
        location_set_version_id=CONF.get_id('location_set_version'))
    nid_etid_pairs = list(df[['nid', 'extract_type_id'
                              ]].drop_duplicates().to_records(index=False))
    nid_etid_pairs = [tuple(pair) for pair in nid_etid_pairs]

    for x in nid_etid_pairs:
        nid = x[0]
        etid = x[1]
        check_loc = list(
            df.loc[(df['nid'] == nid) & (df['extract_type_id'] == etid),
                   'ihme_loc_id'].unique())[0]
        if check_loc in subnational_iso3s:
            print("Found {} where we model subnationally "
                  "for NID {} extract_type_id {}. Setting is_active"
                  " to False".format(check_loc, nid, etid))
            df.loc[(df['nid'] == nid) & (df['extract_type_id'] == etid),
                   'is_active'] = 0
    return df
Example #16
def calculate_positive_excess(df, value_cols, pop_col, loc_meta_df):

    df = add_location_metadata(df,
                               'ihme_loc_id',
                               location_meta_df=loc_meta_df,
                               cache_dir=CONF.get_directory('db_cache'))
    df['iso3'] = df['ihme_loc_id'].apply(lambda x: x[:3])
    df = df.drop('ihme_loc_id', axis=1)
    df.loc[
        df['flagged'] == 1,
        'deaths_post'] = df['baserate_deaths'] * df[pop_col] * df['rrateWLD']
    # calculate the positive excess
    df.loc[df['flagged'] == 1,
           'positive_excess'] = df['deaths'] - df['deaths_post']
    no_pe = (df['positive_excess'] < 0) | (df['positive_excess'].isnull())
    df.loc[no_pe, 'deaths_post'] = df['deaths']
    df.loc[no_pe, 'positive_excess'] = 0
    df = apply_ussr_tb_proportions(df)

    df.loc[(df['cause_id'].isin([297, 934])) & (df['age_group_id'] > 17),
           'positive_excess'] = 0
    df.loc[(df['cause_id'].isin([297, 934])) & (df['age_group_id'] > 17),
           'deaths_post'] = df['deaths']
    return df
Example #17
def map_extract_type_id(df, source, extract_type, conn_def='ADDRESS'):
    nid_extract_types = {}
    start_extract_type = extract_type
    for nid in df.nid.unique():
        if source == "China_DSP_prov_ICD10":
            extract_type = "admin1: DSP sites only"
        elif start_extract_type is None:
            extract_type = None
            extract_type = induce_extraction_type(df.loc[df['nid'] == nid],
                                                  conn_def=conn_def)
        else:
            extract_type = extract_type

        extract_type = str(extract_type).strip()
        # Determine (or create) the extract_type_id
        extract_type_id = pull_or_create_extract_type(extract_type,
                                                      conn_def=conn_def)
        assert extract_type_id is not None, "Weird fail"
        nid_extract_types[nid] = extract_type_id

    df['extract_type_id'] = df['nid'].map(nid_extract_types)
    report_if_merge_fail(df, 'extract_type_id', 'nid')

    if (df['nid'].isin(SPLIT_ISO_NIDS).any()):

        nid_loc_df = df.loc[df['nid'].isin(SPLIT_ISO_NIDS),
                            ['nid', 'location_id']].drop_duplicates()
        nid_loc_df = add_location_metadata(
            nid_loc_df,
            'ihme_loc_id',
            location_set_version_id=CONF.get_id('location_set_version'))
        nid_loc_df['iso3'] = nid_loc_df['ihme_loc_id'].str.slice(0, 3)

        nid_loc_df['extract_type'] = nid_loc_df.apply(
            lambda x: "{nid}: {iso3} data".format(nid=x['nid'], iso3=x['iso3']
                                                  ),
            axis=1)
        extract_types = nid_loc_df[['extract_type']].drop_duplicates()
        extract_types['extract_type_id_new'] = \
            extract_types['extract_type'].apply(
                lambda x: pull_or_create_extract_type(x, conn_def=conn_def))
        nid_loc_df = nid_loc_df.merge(extract_types,
                                      on='extract_type',
                                      how='left')
        report_if_merge_fail(nid_loc_df, 'extract_type_id_new', 'extract_type')
        df = df.merge(nid_loc_df, on=['nid', 'location_id'], how='left')
        report_if_merge_fail(df.loc[df['nid'].isin(SPLIT_ISO_NIDS)],
                             'extract_type_id_new', ['nid', 'location_id'])
        df.loc[df['nid'].isin(SPLIT_ISO_NIDS),
               'extract_type_id'] = df['extract_type_id_new']
        df = df.drop(
            ['extract_type_id_new', 'iso3', 'ihme_loc_id', 'extract_type'],
            axis=1)

    if source == "VA_lit_GBD_2010_2013":

        code_system_to_extract_type_id = {
            244: 350,
            273: 347,
            294: 353,
            296: 356,
        }
        df.loc[df['nid'].isin([93570, 93664]),
               'extract_type_id'] = df['code_system_id'].map(
                   code_system_to_extract_type_id)

    # police data fixes
    elif source == "Various_RTI":
        code_system_to_extract_type_id = {
            237: 719,
            241: 722,
        }
        df.loc[df['nid'].isin([93599]),
               'extract_type_id'] = df['code_system_id'].map(
                   code_system_to_extract_type_id)

    elif source == 'Matlab_1963_1981':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 380
    elif source == 'Matlab_1982_1986':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 383
    elif source == 'Matlab_1987_2002':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 371
    elif source == 'Matlab_2003_2006':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 374
    elif source == 'Matlab_2007_2012':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 377

    elif source == "Combined_Census":
        df.loc[df['nid'] == 7942, 'extract_type_id'] = 479
    elif source == "Maternal_Census":
        df.loc[df['nid'] == 7942, 'extract_type_id'] = 482
        df.loc[df['nid'] == 10319, 'extract_type_id'] = 485
    elif source == "Other_Maternal":
        df.loc[df['nid'] == 10319, 'extract_type_id'] = 488
    elif source == "Pakistan_maternal_DHS_2006":
        df["extract_type_id"] = 539
    elif source == "Pakistan_child_DHS_2006":
        df["extract_type_id"] = 536

    if (df['data_type_id'].isin([5, 6, 7]).any()):
        csetid = {
            49: 440,
            50: 443,
            52: 386,
            162: 446,
            163: 449,
            155: 455,
            148: 458,
            149: 461,
            150: 464,
            156: 467,
            577: 470
        }
        df.loc[df['nid'].isin([24134, 108611, 125702, 132803, 154012]),
               'extract_type_id'] = df['code_system_id'].map(csetid)
        data_type_dict = {5: 473, 7: 476}
        df.loc[df['code_system_id'] == 154,
               'extract_type_id'] = df['data_type_id'].map(data_type_dict)
    return df
Example #18
def finalize_formatting(df,
                        source,
                        write=False,
                        code_system_id=None,
                        extract_type=None,
                        conn_def='ADDRESS',
                        is_active=True):

    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS

    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    if len(missing_cols) > 0:
        raise AssertionError(
            """These columns are needed for formatting but not found in df:

            {}
            """.format(missing_cols))

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            print(cs_df.loc[cs_df['code_id'].isnull()].value.unique())
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # ADD TEST TO CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE
        # CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM ADDRESS
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='engine')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        if len(bad_codes) > 0:
            print("Found these code ids in data that can't exist in code "
                  "systems {}: {}".format(code_system_ids, bad_codes))

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)

    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    df['is_active'] = 1 * is_active

    # CHECK SUBNATIONAL LOCATIONS
    # alters is_active if needed
    df = check_subnational_locations(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_updated_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_updated_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df,
                                  'claude_nid_metadata',
                                  replace=True,
                                  conn_def=conn_def)

        # write nid location-year map
        write_to_claude_nid_table(nid_locyears,
                                  'claude_nid_location_year',
                                  replace=True,
                                  conn_def=conn_def)

        insert_source_id(source)

        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id, launch_set_id)

        # now refresh cache files for nid
        print("\nRefreshing claude nid metadata cache files")
        force_cache_options = {
            'force_rerun': True,
            'block_rerun': False,
            'cache_dir': "standard",
            'cache_results': True,
            'verbose': True
        }
        get_nid_metadata(**force_cache_options)
        get_nidlocyear_map(**force_cache_options)

    return locals()
Example #19
def prepare_square_df(df, location_hierarchy, reg_spec):
    locations = location_hierarchy.query('level == 3')[['location_id']].drop_duplicates()
    ages = df[['age']].drop_duplicates()
    sexes = df[['sex_id']].drop_duplicates()
    targets = df[['cause_id']].drop_duplicates()
    years = df[['year_id']].drop_duplicates()

    join_dfs = [locations, years, ages, sexes, targets]
    square_df = join_dfs[0]
    square_df['join_key'] = 1
    for join_df in join_dfs[1:]:
        join_df['join_key'] = 1
        square_df = square_df.merge(join_df, on='join_key', how='left')
    square_df = square_df.drop('join_key', axis=1)

    square_df = square_df.rename(columns={'location_id': 'country'})

    # will predict based on prop unspecified, but within 15-year age groups
    unspec_props_id_cols = ['country', 'year_id', 'age', 'sex_id']
    unspec_props = df[
        unspec_props_id_cols + ['prop_garbage_bigage']
    ].drop_duplicates()
    report_duplicates(unspec_props, unspec_props_id_cols)

    # fill with age-sex median garbage unspecified where we don't have the
    # location-year
    median_unspec_age_sex = unspec_props.groupby(
        ['age', 'sex_id'], as_index=False
    )['prop_garbage_bigage'].median()
    median_unspec_age_sex = median_unspec_age_sex.rename(
        columns={'prop_garbage_bigage': 'agesex_median_prop_garbage'}
    )

    # add proportion unspecified by location-year where possible
    square_df = square_df.merge(
        unspec_props, on=unspec_props_id_cols, how='left'
    )
    # use age-sex where necessary
    square_df = square_df.merge(
        median_unspec_age_sex, on=['age', 'sex_id'], how='left'
    )

    # this should never have missings
    report_if_merge_fail(square_df, 'agesex_median_prop_garbage', ['age', 'sex_id'])

    square_df['prop_garbage'] = square_df['prop_garbage_bigage'].fillna(
        square_df['agesex_median_prop_garbage']
    )

    # create all-year proportion garbage column
    square_df['prop_garbage_all_years'] = square_df.groupby(
        ['country', 'age', 'sex_id', 'cause_id']
    )['prop_garbage'].transform(np.mean)

    # create year window proportion garbage col
    square_df = add_year_window(square_df)
    square_df['prop_garbage_year_window'] = square_df.groupby(
        ['country', 'age', 'sex_id', 'cause_id', 'year_window']
    )['prop_garbage'].transform(np.mean)

    square_df = square_df.rename(columns={'country': 'location_id'})
    square_df = add_location_metadata(
        square_df, ['region_id', 'super_region_id'],
        location_meta_df=location_hierarchy
    )

    square_df = square_df.rename(columns={
        'super_region_id': 'super_region',
        'region_id': 'region',
        'location_id': 'country',
    })
    square_df['super_region'] = square_df['super_region'].astype(int)
    square_df['region'] = square_df['region'].astype(int)
    square_df['global'] = 1
    square_df = square_df[
        ['global', 'super_region', 'region', 'country', 'year_id',
         'year_window', 'age', 'sex_id', 'cause_id',
         'prop_garbage', 'prop_garbage_all_years', 'prop_garbage_year_window']
    ]
    square_df = add_model_group(square_df)
    square_df = square_df.sort_values(
        by=['global', 'super_region', 'region', 'country', 'year_id', 'age', 'sex_id', 'cause_id']
    )

    if reg_spec['name_short'] == "unspec_diab":
        # haqi
        print_log_message("Adding haqi")
        square_df = add_covariate(square_df, 1099, "haqi", by_sex=False)
        # age-standardized diabetes prevalence
        print_log_message("Adding age-standardized diabetes prevalence")
        square_df = add_covariate(square_df, 29, "diabetes_prev")
        # under-15 age-standardized death rate due to diabetes
        print_log_message("Adding asdr due to diabetes under 15")
        square_df = add_covariate(square_df, 1249, "asdr_dm_015")
        # age-specific obesity
        print_log_message("Adding age-standardized prevalence of obesity")
        square_df = add_covariate(square_df, 455, "prev_obesity_agestd")
        # age-specific type-1 diabetes prevalence
        print_log_message("Adding type 1 diabetes prevalence")
        square_df = add_covariate(square_df, -4, "prev_type1")

    if reg_spec['name_short'] == "unspec_stroke":
        # cholesterol
        print_log_message("Adding cholesterol")
        square_df = add_covariate(square_df, 69, "cholesterol")
        # blood pressure
        print_log_message("Adding blood pressure")
        square_df = add_covariate(square_df, 70, "blood_pressure")
        # age-specific LDL
        print_log_message("Adding age-specific LDL")
        square_df = add_covariate(square_df, -1, "ldl_age_specific")
        # age-specific blood pressure
        print_log_message("Adding age-specific blood pressure")
        square_df = add_covariate(square_df, -2, "blood_pressure_age_specific")
        # stroke 28 day survivorship
        print_log_message("Adding percent survivor")
        square_df = add_covariate(square_df, -3, "percent_survivor")
        # stroke percent incidence by target group
        print_log_message("Adding percent incidence")
        square_df = add_covariate(square_df, -5, "percent_incidence")
    if "cancer" in reg_spec["name_short"]:
        # haqi
        print_log_message('Adding HAQI')
        square_df = add_covariate(square_df, 1099, "haqi", by_sex=False)
        # tobacco (cigarettes per capita)
        print_log_message('Adding cigarettes per capita')
        square_df = add_covariate(square_df, 19, "cigarettes_pc", by_sex=False)
        print_log_message('Adding SDI')
        square_df = add_covariate(square_df, 881, "sdi", by_sex=False)

    return square_df
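A minimal sketch of the join_key cross join used in prepare_square_df above: merging on a constant key produces every combination of the id frames.

import pandas as pd

locations = pd.DataFrame({'location_id': [101, 102]})
years = pd.DataFrame({'year_id': [2015, 2016]})
join_dfs = [locations, years]
square_df = join_dfs[0].copy()
square_df['join_key'] = 1
for join_df in join_dfs[1:]:
    join_df['join_key'] = 1
    square_df = square_df.merge(join_df, on='join_key', how='left')
square_df = square_df.drop('join_key', axis=1)
# square_df now has 2 x 2 = 4 rows, one per location/year combination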
Example #20
def map_extract_type_id(df, source, extract_type, conn_def='ADDRESS'):
    nid_extract_types = {}
    start_extract_type = extract_type
    for nid in df.nid.unique():
        if source == "China_DSP_prov_ICD10":
            extract_type = "admin1: DSP sites only"
        elif start_extract_type is None:
            extract_type = None
            extract_type = induce_extraction_type(df.loc[df['nid'] == nid],
                                                  conn_def=conn_def)
        else:
            extract_type = extract_type

        extract_type = str(extract_type).strip()
        # Determine (or create) the extract_type_id
        extract_type_id = pull_or_create_extract_type(extract_type,
                                                      conn_def=conn_def)
        assert extract_type_id is not None, "Weird fail"
        nid_extract_types[nid] = extract_type_id

    df['extract_type_id'] = df['nid'].map(nid_extract_types)
    report_if_merge_fail(df, 'extract_type_id', 'nid')

    split_iso_nids = list(
        pd.read_csv(CONF.get_resource('split_iso_nids'))['nid'].unique())
    if (df['nid'].isin(split_iso_nids).any()):
        nid_loc_df = df.loc[df['nid'].isin(split_iso_nids),
                            ['nid', 'location_id']].drop_duplicates()
        nid_loc_df = add_location_metadata(
            nid_loc_df,
            'ihme_loc_id',
            location_set_version_id=CONF.get_id('location_set_version'))
        nid_loc_df['iso3'] = nid_loc_df['ihme_loc_id'].str.slice(0, 3)

        nid_loc_df['extract_type'] = nid_loc_df.apply(
            lambda x: "{nid}: {iso3} data".format(nid=x['nid'], iso3=x['iso3']
                                                  ),
            axis=1)
        extract_types = nid_loc_df[['extract_type']].drop_duplicates()
        extract_types['extract_type_id_new'] = \
            extract_types['extract_type'].apply(
                lambda x: pull_or_create_extract_type(x, conn_def=conn_def))
        nid_loc_df = nid_loc_df.merge(extract_types,
                                      on='extract_type',
                                      how='left')
        report_if_merge_fail(nid_loc_df, 'extract_type_id_new', 'extract_type')
        df = df.merge(nid_loc_df, on=['nid', 'location_id'], how='left')
        report_if_merge_fail(df.loc[df['nid'].isin(split_iso_nids)],
                             'extract_type_id_new', ['nid', 'location_id'])
        df.loc[df['nid'].isin(split_iso_nids),
               'extract_type_id'] = df['extract_type_id_new']
        df = df.drop(
            ['extract_type_id_new', 'iso3', 'ihme_loc_id', 'extract_type'],
            axis=1)

    if source == "VA_lit_GBD_2010_2013":
        # nids need extract types by code system
        # these have already been inserted to extract type table
        code_system_to_extract_type_id = {
            # DRC nationwide 2004
            244: 350,
            # DRC EastWest 2002
            273: 347,
            # Halder (30)
            294: 353,
            # Moran (26)
            296: 356,
        }
        df.loc[df['nid'].isin([93570, 93664]),
               'extract_type_id'] = df['code_system_id'].map(
                   code_system_to_extract_type_id)

    # police data fixes
    elif source == "Various_RTI":
        code_system_to_extract_type_id = {
            # Economic Commission for Europe
            237: 719,
            # Turkey Traffic Accidents
            241: 722,
        }
        df.loc[df['nid'].isin([93599]),
               'extract_type_id'] = df['code_system_id'].map(
                   code_system_to_extract_type_id)

    # distinguish between Matlab sources since all are NID 935
    # these etids have already been inserted into the claude_extract_type
    elif source == 'Matlab_1963_1981':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 380
    elif source == 'Matlab_1982_1986':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 383
    elif source == 'Matlab_1987_2002':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 371
    elif source == 'Matlab_2003_2006':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 374
    elif source == 'Matlab_2007_2012':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 377

    # fixes for "maternal" data
    elif source == "Combined_Census":
        df.loc[df['nid'] == 7942, 'extract_type_id'] = 479
    elif source == "Maternal_Census":
        df.loc[df['nid'] == 7942, 'extract_type_id'] = 482
        df.loc[df['nid'] == 10319, 'extract_type_id'] = 485
    elif source == "Other_Maternal":
        df.loc[df['nid'] == 10319, 'extract_type_id'] = 488
    # these two Pakistan_DHS sources have the same NID
    elif source == "Pakistan_maternal_DHS_2006":
        df["extract_type_id"] = 539
    elif source == "Pakistan_child_DHS_2006":
        df["extract_type_id"] = 536

    # age-specific maps in nid 413394 - need different extracts
    # for different code systems
    elif source == 'VA_lit_GBD_2019':
        code_system_to_extract_type_id = {711: 1745, 712: 1748}
        df.loc[df['nid'] == 413394, 'extract_type_id'] =\
            df['code_system_id'].map(code_system_to_extract_type_id)
        assert df.extract_type_id.notnull().all()

    if (df['data_type_id'].isin([5, 6, 7]).any()):
        # these have already been inserted to extract type table
        csetid = {
            49: 440,
            50: 443,
            52: 386,
            162: 446,
            163: 449,
            155: 455,
            148: 458,
            149: 461,
            150: 464,
            156: 467,
            577: 470
        }
        df.loc[df['nid'].isin([24134, 108611, 125702, 132803, 154012]),
               'extract_type_id'] = df['code_system_id'].map(csetid)
        data_type_dict = {5: 473, 7: 476}
        df.loc[df['code_system_id'] == 154,
               'extract_type_id'] = df['data_type_id'].map(data_type_dict)
    return df
Example #21
def finalize_formatting(df,
                        source,
                        write=False,
                        code_system_id=None,
                        extract_type=None,
                        conn_def='ADDRESS',
                        is_active=False,
                        refresh_cache=True,
                        check_ages=True):
    """Finalize the formatting of the source and optionally write it out.

    Decides whether to map code_id based on whether code_id is already a
        column in the dataset.

    Needs the following information from either the df values or from the
        nid_meta_vals dict:

            data_type_id
            representative_id

        All of the above must have only one value per nid in df.

    Maps site_id to the data based on incoming 'site' column. Will upload
        any sites that are not in the cod.site table already.

    Arguments:
        df, pandas.DataFrame: The dataframe with near-formatted data
        source, str: The source this df is (should be the whole source and
            nothing but the source). Will break if there is no source in
            FILEPATH with this name, and you should pass the
            source without a leading underscore even if it is that way
            in J
        write, bool: whether to write the outputs
        extract_type, str: The manner in which the nid was extracted. If
            left as None, will be induced by the location_type_id of
            the location_id with the maximum level in the dataset. This should
            be over-ridden in cases like China DSP, where the same locations
            are used in two extraction types - "DSP + VR" and "DSP"; China DSP
            then gets two extraction types: "admin1" and
            "admin1: DSP sites only" (in the particular instance of DSP,
            extract type is built into this code. Feel free to add other
            source-extract type mappings here to force consistency.)
        check_ages, bool: Whether or not to enforce age group checks such as
            ensuring no overlaps or gaps. This can be turned off because sometimes
            raw data reports overlapping age groups (e.g. Palestine data has Gaza Strip and West
            Bank data with different age groupings).

    Returns:
        Every local value to the function
        Why? There are multiple df outputs, and formatting is an involved
        process, so it's helpful to just see everything sometimes.
    """
    # set column groups, and verify that we have everything we need
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active', 'is_mort_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS

    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    assert len(missing_cols) == 0, \
        "Required formatting columns not found in df: \n{}".format(missing_cols)

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM engine_room.maps_code
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='ADDRESS')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        assert len(bad_codes) == 0, "Found code ids in data that can't exist in code "\
                                    "systems {}: {}".format(code_system_ids, bad_codes)
    check_vr_raw_causes(df)

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)
    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    # sorry for putting this here
    # drop these loc/years b/c env < deaths creating negative cc_code
    # maybe re run w/ another envelope?
    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    ################################################
    # keep all 0s now, messing up for NR in non-VR
    # df['val_sum_tmp'] = df[VALUE_COLS].sum(axis=1)
    # all-cause extractions want to keep zeroes
    # keep_zeroes = df['extract_type_id'] == ALL_CAUSE_EXTRACT_ID
    # otherwise, drop them
    # greater_than_zero = df['val_sum_tmp'] > 0
    # df = df[greater_than_zero | keep_zeroes]
    # df = df.drop('val_sum_tmp', axis=1)
    ################################################

    # CHECKS FOR FORMATTED PHASE OUTPUT
    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # TESTS FOR CHECKING AGE GROUP IDS
    if check_ages:
        check_age_groups(df)

    # MORE TESTS FOR DEATHS - MAYBE THAT THEY AREN'T MORE THAN 1.25 THE
    # VALUE IN THE ENVELOPE BY LOCATION AGE YEAR SEX?

    # AND THEN WRITE A TABLE OF COMPARISONS OF DEATHS / ENVELOPE BY LOCATION
    # AGE YEAR SEX FOR REVIEW

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    if is_active is True:
        warnings.warn(
            """is_active is deprecated: use the update_nid_metadata_status
                         function to change the status of finalized datasets"""
        )

    # Use existing is_active and is_mort_active values, otherwise default to 0
    nid_map = pull_nid_metadata()
    df = df.merge(nid_map,
                  on=[
                      'nid', 'parent_nid', 'extract_type_id', 'source',
                      'data_type_id', 'code_system_id'
                  ],
                  how='left')

    df_na = df[pd.isnull(df['is_active'])]
    df_na = df_na[['nid', 'extract_type_id']].drop_duplicates()

    if df_na.shape[0] > 0:
        print("""New rows for the following NID/extract_type_id will be added
                 with is_active and is_mort_active = 0:\n {}""".format(df_na))

    df['is_active'] = df['is_active'].fillna(0)
    df['is_mort_active'] = df['is_mort_active'].fillna(0)

    # CHECK SUBNATIONAL LOCATIONS
    df = check_subnational_locations(df)

    # OVERRIDE REPRESENTATIVE ID FOR NON-VR
    df = adjust_representative_id(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_formatted_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_formatted_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df,
                                  'claude_nid_metadata',
                                  replace=True,
                                  conn_def=conn_def)

        # write nid location-year map
        write_to_claude_nid_table(nid_locyears,
                                  'claude_nid_location_year',
                                  replace=True,
                                  conn_def=conn_def)

        # write to cod.source for new sources
        insert_source_id(source)

        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id, launch_set_id)

        # now refresh cache files for nid
        if refresh_cache:
            refresh_claude_nid_cache_files()

    return locals()
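A hedged usage sketch of finalize_formatting, following its docstring above. The dataframe values, source name, and connection are placeholders, and the call itself is left commented out because it maps codes and sites against the database.

import pandas as pd

formatted_df = pd.DataFrame({
    'nid': [999999], 'location_id': [102], 'year_id': [2015],
    'age_group_id': [22], 'sex_id': [3], 'cause': ['A00'], 'site': [''],
    'data_type_id': [9], 'representative_id': [1], 'code_system_id': [1],
    'deaths': [10.0],
})
# returned = finalize_formatting(formatted_df, 'Example_Source', write=False)
# nid_meta_df = returned['nid_meta_df']
# nid_locyears = returned['nid_locyears']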