Example #1
def get_package_code_ids(regression_specification, code_system_id):
    """Returns code_ids for garbage codes in package for given code system"""
    package_description = regression_specification[
        'package_descriptions'
    ][code_system_id]

    packages = get_package_list(code_system_id)
    package_id = packages.loc[
        packages['package_description'] == package_description,
        'package_id'
    ]
    assert len(package_id) == 1
    package_id = package_id.iloc[0]

    with open("FILEPATH".format(code_system_id, package_id)) as f:
        pkg = json.load(f)
    garbage_codes = list(pkg['garbage_codes'])

    code_map = get_cause_map(code_system_id=code_system_id, force_rerun=False)
    # strip literal periods so code values match the undotted garbage codes
    code_map['value'] = code_map['value'].str.replace(".", "", regex=False)
    is_package_garbage = code_map['value'].isin(garbage_codes)

    garbage_code_ids = list(code_map.loc[
        is_package_garbage,
        'code_id'
    ].unique())

    return garbage_code_ids
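
A minimal usage sketch (hypothetical values; the key layout is inferred from
the function body, and the helpers are assumed to come from the surrounding
module):

# regression_specification maps code_system_id -> package description
regression_specification = {
    'package_descriptions': {
        1: 'Some package description',
    },
}
garbage_code_ids = get_package_code_ids(regression_specification, 1)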
Example #2
def get_code_ids_from_map_ids(self, map_id):
    cs_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                           **self.block_rerun)
    pkg_map = get_clean_package_map_for_misdc(
        self.code_system_id, remove_decimal=self.remove_decimal)
    assert isinstance(map_id, str)
    if map_id.startswith('_p_'):
        # package map_ids resolve to every code in the package; garbage
        # maps to cause_id 743 (the GBD garbage cause, _gc)
        values = pkg_map.loc[pkg_map['map_id'] == map_id, 'value'].values
        codes = cs_map.loc[cs_map.value.isin(values), 'code_id'].values
        cause_id = 743
        assert len(codes) > 0, \
            "No code_ids matching {} in the cause map".format(map_id)
    else:
        # numeric map_ids are cause_ids; fall back to cc_code if the code
        # system has no codes for that cause
        codes = cs_map.loc[cs_map.cause_id == int(map_id),
                           'code_id'].values
        cause_id = int(map_id)
        if len(codes) == 0:
            codes = cs_map.loc[cs_map.cause_id == self.cc_code,
                               'code_id'].values
            cause_id = self.cc_code
    code_id = codes[0]
    code_dict = {map_id: code_id}
    cause_dict = {map_id: cause_id}
    return code_dict, cause_dict
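
Here map_ids prefixed with '_p_' are redistribution package ids, whose codes
all fall under cause_id 743 (the GBD garbage cause), while purely numeric
map_ids are cause_ids. A sketch of the return shape, assuming a prepared
instance m and a hypothetical package id:

code_dict, cause_dict = m.get_code_ids_from_map_ids('_p_12')
# code_dict  -> {'_p_12': <first matching code_id>}
# cause_dict -> {'_p_12': 743}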
Example #3
def merge_on_scaled(df, move_df, adjust_id, code_system_id):
    df = df.merge(move_df[[
        'location_id', 'year_id', 'site_id', 'age_group_id', 'sex_id',
        'map_id', 'misdiagnosed_scaled'
    ]],
                  how='outer')
    if len(df.loc[df.cause_id.isnull()]) > 0:
        assert all(
            df.loc[df.cause_id.isnull(),
                   'map_id'].values == str(adjust_id)), 'Other missing map_ids'
        cs_map = get_cause_map(code_system_id=code_system_id,
                               force_rerun=False,
                               block_rerun=True)
        possible_codes = cs_map.loc[cs_map.cause_id == adjust_id,
                                    'code_id'].values
        # fall back to cc_code (cause_id 919) when the code system has no
        # codes mapped to adjust_id
        use_target = True
        if len(possible_codes) == 0:
            possible_codes = cs_map.loc[cs_map.cause_id == 919,
                                        'code_id'].values
            use_target = False
        target_code = possible_codes[0]
        df.loc[df.code_id.isnull(), 'code_id'] = target_code
        if use_target:
            df.loc[df.cause_id.isnull(), 'cause_id'] = adjust_id
        else:
            df.loc[df.cause_id.isnull(), 'cause_id'] = 919
        df['deaths'] = df['deaths'].fillna(0)
        # forward-fill identifiers introduced by the outer merge, then restore
        # the integer dtypes that the merge upcast to float
        for extravar in ['nid', 'extract_type_id']:
            df[extravar] = df[extravar].ffill()
        for idvar in [i for i in list(df) if i.endswith('_id')] + ['nid']:
            if df[idvar].dtype == 'float64':
                df[idvar] = df[idvar].astype(int)

    return df
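
The merge/fill pattern above is easy to get wrong because an outer merge
upcasts integer id columns to float wherever NaNs appear. A self-contained
toy version of the same idiom:

import pandas as pd

left = pd.DataFrame({'nid': [10, 10], 'map_id': ['100', '200'],
                     'deaths': [5.0, 3.0]})
right = pd.DataFrame({'map_id': ['200', '300'],
                      'misdiagnosed_scaled': [1.5, 0.5]})
df = left.merge(right, how='outer')    # the '300' row has NaN nid and deaths
df['deaths'] = df['deaths'].fillna(0)
df['nid'] = df['nid'].ffill()          # fill ids introduced by the merge
df['nid'] = df['nid'].astype(int)      # restore the integer dtype
print(df)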
Example #4
def format_source(release_date):
    # read the raw data and the WHO provided country/year map
    df = read_data(release_date)
    country_map = get_country_map(release_date)

    # subset to just the new loc/years
    # also apply location/year restrictions
    df = subset_location_years(df, country_map)

    # map location information
    loc_meta = get_current_location_hierarchy(
        location_set_id=CONF.get_id('location_set'),
        location_set_version_id=CONF.get_id('location_set_version'),
        force_rerun=False,
        block_rerun=True)
    df = get_gbd_locations(df, country_map, loc_meta)

    # replicate age adjustments for WHO data
    df = adjust_WHO_ages(df)

    # Limit the dataframe to the columns needed and melt ages wide to long
    df = melt_df(df)

    # assign age group ids
    df = get_age_group_ids(df)

    # map code ids and apply special remaps
    cause_map = get_cause_map(1, force_rerun=False, block_rerun=True)  # code_system_id 1 (ICD10)
    df = map_code_id(df, cause_map)

    # add manual cols and cleanup
    df = cleanup(df)

    # apply nids
    df = map_nids(df, release_date)

    # apply any final special adjustments
    df = apply_special_adjustments(df)

    # final grouping and finalize formatting
    df = df[FINAL_FORMATTED_COLS]
    assert df.notnull().values.all()
    df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum()

    # run finalize formatting
    locals_present = finalize_formatting(df, SYSTEM_SOURCE, write=WRITE)
    nid_meta_df = locals_present['nid_meta_df']

    # update nid metadata status
    if WRITE:
        nid_extracts = nid_meta_df[[
            'nid', 'extract_type_id'
        ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            update_nid_metadata_status(nid,
                                       extract_type_id,
                                       is_active=IS_ACTIVE,
                                       is_mort_active=IS_MORT_ACTIVE)
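
The drop_duplicates().to_records(index=False) idiom at the end yields plain
(nid, extract_type_id) pairs to iterate over; a toy version:

import pandas as pd

meta = pd.DataFrame({'nid': [1, 1, 2], 'extract_type_id': [9, 9, 9]})
pairs = meta[['nid', 'extract_type_id']].drop_duplicates().to_records(index=False)
for nid, extract_type_id in pairs:
    print(int(nid), int(extract_type_id))  # -> 1 9, then 2 9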
Example #5
def format_greenland():
    grl_14 = pd.read_excel(GRL_PATH_2014)
    grl_15 = pd.read_excel(GRL_PATH_2015)
    grl_14 = grl_14[['Year', 'Sex', 'ICD-10', 'Age', 'Deaths']]
    assert (grl_14.columns.values == grl_15.columns.values).all()
    df = pd.concat([grl_14, grl_15])

    df = clean_df(df)

    df = get_sex_id(df)

    df = get_nid(df)

    df = get_age_group_id(df)

    df = fix_codes(df)

    cause_map = get_cause_map(code_system_id=1)
    df = map_code_id(df, cause_map)

    df = df[FINAL_FORMATTED_COLS]
    df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum()

    system_source = 'Greenland_BoH_ICD10'

    finalize_formatting(df, system_source, write=True)
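
The column assert guards the concat: pd.concat aligns columns by name, so a
mismatched column set would silently introduce NaNs rather than fail. A toy
illustration:

import pandas as pd

a = pd.DataFrame({'Year': [2014], 'Deaths': [1]})
b = pd.DataFrame({'Year': [2015], 'Deaths': [2]})
assert (a.columns.values == b.columns.values).all()
print(pd.concat([a, b], ignore_index=True))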
Example #6
def prep_code_metadata(self):
    df = get_cause_map(
        self.code_system_id,
        **self.standard_cache_options
    )
    df = df[['code_id', 'value', 'cause_id']]
    df = df.rename(columns={'value': 'raw_cause'})
    return df
Example #7
def format_sri_lanka():
    # import the raw data from the Excel file
    df = pd.read_excel(path)

    # initial cleaning to fix rows/columns imported from Excel
    df = clean_df(df)

    # incoming data has sexes in wide format; split_sexes() splits the df by
    # sex and manually sets age groups, returning initial_total (a float used
    # later to ensure no deaths were lost in the process)
    df, initial_total = split_sexes(df)

    # reshape age groups wide to long and check that no deaths were lost
    df = pd.melt(df,
                 id_vars=['cause_name', 'sex_id', 'value'],
                 var_name='age',
                 value_name='deaths')
    assert np.allclose(initial_total, df.deaths.sum())

    df = get_age_ids(df)

    # import and format the rdp_frac dataframe used to disaggregate tabulated ICD10
    rdp = pd.read_stata(rdp_path)
    rdp = format_rdp_frac(rdp)

    # disaggregate tabulated ICD10 codes
    df = disaggregate(df, rdp)

    # map code_ids using the cause map from the engine room
    cause_map = get_cause_map(code_system_id=9)
    df = map_code_id(df, cause_map)

    # manually added columns
    df['location_id'] = 17  # Sri Lanka
    df['nid'] = 327524
    df['data_type_id'] = 9  # VR
    df['code_system_id'] = 9  # ICD10_tabulated
    df['year_id'] = 2013
    df['site'] = ""
    df['representative_id'] = 1

    # group by ID_COLS and assign the system source
    df = df[FINAL_FORMATTED_COLS]
    df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum()

    system_source = "ICD10_tabulated"

    # run finalize formatting
    finalize_formatting(df, system_source, write=True)
Example #8
def format_sri_lanka():
    df = read_and_clean_data()

    # incoming data has sex data in wide format, following function splits df by
    # sex and manually sets age groups
    # function returns initial_total (a float to compare against deaths later to
    # ensure no deaths were lost in process)
    df, initial_total = split_sexes(df)

    # reshape age groups wide to long, ensuring no deaths were lost
    df = pd.melt(df,
                 id_vars=['cause_name', 'sex_id', 'value', 'year_id'],
                 var_name='age',
                 value_name='deaths')
    assert np.allclose(initial_total, df.deaths.sum())

    df = get_age_ids(df)

    # importing and formatting rdp_frac dataframe to disaggregate tabulated icd10
    rdp = pd.read_stata(rdp_path)
    rdp = format_rdp_frac(rdp)

    # disaggregating tabulated icd10 codes
    df = disaggregate(df, rdp)

    # mapping code_ids using cause map from engine room
    cause_map = get_cause_map(code_system_id=9)
    df = map_code_id(df, cause_map)

    # addition of manually added columns
    # Sri Lanka location id 17
    df['location_id'] = 17
    # nid 327524
    df['nid'] = df.year_id.map({2007: 272959, 2013: 327524})
    # data_type_id 9 (VR)
    df['data_type_id'] = 9
    # code_system_id 9 (ICD10_tabulated)
    df['code_system_id'] = 9
    # site: blank, representative_id: 1
    df['site'] = ""
    df['representative_id'] = 1

    # grouping by ID_COLS and assigning system source
    df = df[FINAL_FORMATTED_COLS]
    assert df.notnull().values.all()
    df[INT_COLS] = df[INT_COLS].astype(int)
    df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum()

    system_source = "ICD10_tabulated"

    # run finalize formatting
    finalize_formatting(df, system_source, write=WRITE)
    return df
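
The year-to-nid mapping is the main difference from the single-year version
above; any year missing from the dict becomes NaN and is caught by the
notnull assert. A toy check:

import pandas as pd

df = pd.DataFrame({'year_id': [2007, 2013, 2014]})
df['nid'] = df.year_id.map({2007: 272959, 2013: 327524})
print(df)  # the 2014 row gets NaN, which df.notnull().values.all() rejects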
Example #9
    def get_computed_dataframe(self, df):
        """Return mapped dataframe."""
        # list of all cause columns
        raw_cause_cols = MCoDMapper.get_code_columns(df)
        df = MCoDMapper.fix_icd_codes(df, raw_cause_cols, self.code_system_id)

        print_log_message("Mapping underlying cause/primary diagnosis")
        cause_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                                  **self.cache_options)
        code_map = MCoDMapper.prep_cause_map(cause_map)
        df['cause_mapped'] = df['cause'].map(code_map)

        print_log_message(
            "Trimming ICD codes and remapping underlying cause/primary diagnosis"
        )
        df = MCoDMapper.trim_and_remap(df, {'cause': 'cause_mapped'}, code_map,
                                       self.code_system_id)
        report_if_merge_fail(df, 'cause_mapped', 'cause')

        # merge on the cause_id for the underlying cause
        df = df.rename(columns={'cause_mapped': 'code_id'})
        df['code_id'] = df['code_id'].astype(int)
        df = add_code_metadata(df,
                               'cause_id',
                               code_map_version_id=self.code_map_version_id,
                               **self.cache_options)
        report_if_merge_fail(df, 'cause_id', 'code_id')

        print_log_message("Mapping chain causes")
        # get the special intermediate cause map
        int_cause_map = self.prep_int_cause_map()
        df = MCoDMapper.map_cause_codes(df, int_cause_map, self.int_cause)

        print_log_message("Trimming ICD codes and remapping chain causes")
        int_cause_cols = [x for x in df.columns if self.int_cause in x]
        int_cause_col_dict = MCoDMapper.prep_raw_mapped_cause_dictionary(
            raw_cause_cols, int_cause_cols)
        df = MCoDMapper.trim_and_remap(df, int_cause_col_dict, int_cause_map,
                                       self.code_system_id)

        print_log_message(
            "Identifying rows with intermediate cause of interest")
        df = self.capture_int_cause(df, int_cause_cols)
        if not self.drop_p2:
            df = self.set_part2_flag(df)

        return df
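
A minimal sketch of the trimming idea behind trim_and_remap (the helper
itself is not shown here, so this is an illustration, not its actual
implementation): drop trailing characters from an unmapped ICD code until
the map contains it.

def trim_match(code, code_map):
    # hypothetical illustration of ICD code trimming
    while code and code not in code_map:
        code = code[:-1]
    return code_map.get(code)

print(trim_match('I259', {'I25': 1234}))  # -> 1234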
Example #10
def run_pipeline(nid,
                 extract_type_id,
                 launch_set_id,
                 df,
                 code_system_id,
                 cause_set_version_id,
                 location_set_version_id,
                 pop_run_id,
                 env_run_id,
                 distribution_set_version_id,
                 diagnostic=False):
    """Run the full pipeline"""

    cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_results': False,
        'cache_dir': CONF.get_directory('FILEPATH'),
        'verbose': False
    }

    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)

    code_map = get_cause_map(code_system_id=code_system_id, **cache_options)

    source = get_value_from_nid(nid, "source", extract_type_id)

    print("Overriding causes when necessary")
    df = overrides(df, location_meta_df)

    print("Dropping data out of scope")
    df = drop_data_out_of_scope(df, location_meta_df, source)
    if len(df) > 0:
        # make sure six minor territories are grouped correctly
        assert_no_six_minor_territories(df)

        # run mapping
        print("\nDeaths before MAPPING: {}".format(df.deaths.sum()))
        Mapper = GBDCauseMapper(cause_set_version_id, code_map)
        df = Mapper.get_computed_dataframe(df, code_system_id)
        if diagnostic:
            write_phase_output(df,
                               'mapping',
                               nid,
                               extract_type_id,
                               launch_set_id,
                               sub_dirs='diagnostic')

        print("\nDeaths before AGESEXSPLIT: {}".format(df.deaths.sum()))
        # run age sex splitting
        MySplitter = AgeSexSplitter(cause_set_version_id,
                                    pop_run_id,
                                    distribution_set_version_id,
                                    verbose=True,
                                    collect_diagnostics=False)

        df = MySplitter.get_computed_dataframe(df, location_meta_df)
        if diagnostic:
            diag_df = MySplitter.get_diagnostic_dataframe()
            write_phase_output(diag_df,
                               'agesexsplit',
                               nid,
                               extract_type_id,
                               launch_set_id,
                               sub_dirs='diagnostic')

        print("\nDeaths before CORRECTIONS: {}".format(df.deaths.sum()))
        # run restrictions corrections
        Corrector = RestrictionsCorrector(code_system_id,
                                          cause_set_version_id,
                                          collect_diagnostics=False,
                                          verbose=True)
        df = Corrector.get_computed_dataframe(df)

        # calculate cc_code for some sources
        if source in ['Iran_maternal_surveillance', 'Iran_forensic']:
            env_meta_df = get_env(env_run_id=env_run_id, **cache_options)
            df = calculate_cc_code(df, env_meta_df, code_map)
            print("\nDeaths after adding cc_code: {}".format(df.deaths.sum()))

        # adjust deaths for New Zealand by maori/non-maori ethnicities
        if source in ["NZL_MOH_ICD9", "NZL_MOH_ICD10"]:
            df = correct_maori_non_maori_deaths(df)
            print("\nDeaths after Maori/non-Maori adjustment: {}".format(
                df.deaths.sum()))

        print("\nDeaths at END: {}".format(df.deaths.sum()))

    return df
Example #11
def run_phase(df,
              csvid,
              nid,
              extract_type_id,
              lsvid,
              pop_run_id,
              cmvid,
              launch_set_id,
              remove_decimal,
              write_diagnostics=True):

    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }

    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)

    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(cause_set_version_id=csvid,
                                                    **read_file_cache_options)

        age_meta_df = get_ages(**read_file_cache_options)

        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)

        pop_meta_df = get_pop(pop_run_id=pop_run_id, **read_file_cache_options)

        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum,
                   to=orig_deaths_sum,
                   gca=after_gc_sum,
                   ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)

    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")

    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)

    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        wait('claude_redistributionworker_{}'.format(nid), 30)
        print_log_message("Done waiting. Appending them together")
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)

    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)

    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)

    return df
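
The closing tolerance check allows the larger of 5 deaths or 2% of the
original total; a quick worked instance:

orig, after = 1000, 985
diff_threshold = max(.02 * orig, 5)        # 20.0
assert abs(orig - after) < diff_threshold  # 15 < 20, so this passes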
Example #12
def run_phase(df,
              csvid,
              nid,
              extract_type_id,
              lsvid,
              pop_run_id,
              cmvid,
              launch_set_id,
              remove_decimal,
              write_diagnostics=True):
    """String together processes for redistribution."""

    # what to do about caching throughout the phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }

    # the iso3 of this data
    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)

    # the code system id
    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    # the data type
    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(cause_set_version_id=csvid,
                                                    **read_file_cache_options)

        # get age group ids
        age_meta_df = get_ages(**read_file_cache_options)

        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)

        pop_meta_df = get_pop(pop_run_id=pop_run_id, **read_file_cache_options)
        # Move garbage to hiv first
        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum,
                   to=orig_deaths_sum,
                   gca=after_gc_sum,
                   ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)
    # recognizing that it is weird for code_system_id to come from two places,
    # make sure they are consistent
    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")
    # do we have all the packages we need?
    # verify_packages(df)
    # format age groups to match package parameters
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    # create split groups
    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        # submit jobs or just run them here
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        # wait until all jobs for a given nid have completed
        # eventually need logic for files not being present
        wait('claude_redistributionworker_{}'.format(nid), 30)
        # This seems to be necessary to wait for files
        print_log_message("Done waiting. Appending them together")
    # append split groups together
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)

    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # bad if change 2% or 5 deaths, whichever is greater
    # (somewhat arbitrary, just trying to avoid annoying/non-issue failures)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)

    return df