Example #1
    def get_computed_dataframe(self, df):
        """Replace acauses with those in the bridge map."""
        # VA sources are the only ones where this may not work;
        # we might need to split the dataframe by data_type_id for bridge mapping
        df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
        has_verbal_autopsy = self.VA in df['data_type_id'].unique()
        df.drop(columns='data_type_id', inplace=True)

        if self.needs_bridging(has_verbal_autopsy):
            file_name = self.get_file_name(has_verbal_autopsy)
            map_df = pd.read_csv(self.bridge_map_path / file_name)
            map_df = map_df[['acause', 'bridge_code']]

            # add acause column to deaths data
            bridge_mapped = add_cause_metadata(
                df,
                ['acause'],
                merge_col='cause_id',
                cause_meta_df=self.cause_meta_df
            )
            # hack: cause_id 606 is missing from the cause metadata, so
            # fill its acause manually
            bridge_mapped.loc[
                bridge_mapped['cause_id'] == 606, 'acause'
            ] = 'gyne_femaleinfert'
            report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
            bridge_mapped.drop(['cause_id'], axis=1, inplace=True)

            # perform zz bridge code redistribution before other bridge mapping
            bridge_mapped = self.redistribute_zz_bridge_codes(bridge_mapped, map_df)

            bridge_mapped = bridge_mapped.merge(
                map_df, how='left', on='acause'
            )
            bridge_mapped = self.acause_to_bridge_code(bridge_mapped)
            # bring cause_id back
            bridge_mapped = add_cause_metadata(
                bridge_mapped,
                ['cause_id'],
                merge_col='acause',
                cause_meta_df=self.cause_meta_df
            )

            # hack: restore cause_id 606, which is missing from the cause metadata
            bridge_mapped.loc[
                bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
            ] = 606
            report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')
            # output diagnostic dataframe
            self.diag_df = bridge_mapped
            # drop unnecessary columns
            bridge_mapped = self.clean_up(bridge_mapped)
            return bridge_mapped
        else:
            self.diag_df = df
            df = self.clean_up(df)
            return df
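
The heart of the bridge mapping above is a left merge on acause followed by a merge-failure check. A minimal, self-contained sketch of that pattern with illustrative values:

import pandas as pd

# toy deaths data that already carries an acause column
df = pd.DataFrame({
    'acause': ['cvd_ihd', 'inj_trans_road', 'cvd_ihd'],
    'deaths': [10.0, 4.0, 7.0],
})

# two-column bridge map, as read from the bridge map file
map_df = pd.DataFrame({
    'acause': ['cvd_ihd', 'inj_trans_road'],
    'bridge_code': ['cvd', 'inj_trans'],
})

# left merge keeps every death row; unmapped acauses get a null bridge_code
bridged = df.merge(map_df, how='left', on='acause')

# the equivalent of report_if_merge_fail: no row may be left unmapped
assert bridged['bridge_code'].notnull().all(), \
    "some acauses are missing from the bridge map"
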
Example #2
    def get_computed_dataframe(self, df):
        """Replace acauses with those in the bridge map."""
        df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
        has_verbal_autopsy = self.VA in df['data_type_id'].unique()

        if self.needs_bridging(has_verbal_autopsy):
            sheet_name = self.get_sheet_name(has_verbal_autopsy)
            map_df = pd.read_excel(self.bridge_map_path, sheet_name=sheet_name)
            map_df = map_df[['acause', 'bridge_code']]

            # add acause column to deaths data
            bridge_mapped = add_cause_metadata(
                df,
                ['acause'],
                merge_col='cause_id',
                cause_meta_df=self.cause_meta_df
            )
            # hack: cause_id 606 is missing from the cause metadata, so
            # fill its acause manually
            bridge_mapped.loc[
                bridge_mapped['cause_id'] == 606, 'acause'
            ] = 'gyne_femaleinfert'
            report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
            bridge_mapped.drop(['cause_id'], axis=1, inplace=True)
            bridge_mapped = bridge_mapped.merge(
                map_df, how='left', on='acause'
            )
            bridge_mapped = self.acause_to_bridge_code(bridge_mapped)
            # bring cause_id back
            bridge_mapped = add_cause_metadata(
                bridge_mapped,
                ['cause_id'],
                merge_col='acause',
                cause_meta_df=self.cause_meta_df
            )

            bridge_mapped.loc[
                bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
            ] = 606
            report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')
            # output diagnostic dataframe
            self.diag_df = bridge_mapped
            # drop unnecessary columns
            bridge_mapped = self.clean_up(bridge_mapped)
            return bridge_mapped
        else:
            self.diag_df = df
            df = self.clean_up(df)
            return df
Example #3
def add_metadata(df):
    """Add key metadata."""
    print_log_message("Adding key metadata")
    df = add_nid_metadata(df, ['source', 'code_system_id', 'parent_nid'],
                          force_rerun=False,
                          cache_dir='standard')
    # this column is not yet comprehensive in the NID metadata file
    df['representative_id'] = 1
    report_if_merge_fail(df, 'source', 'nid')

    # map extract type
    df = map_extract_type_id(df)

    # map site
    df = map_site_id(df)

    return df
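
Every metadata merge above is followed by report_if_merge_fail. Its implementation is not shown here; the sketch below is an assumption about the contract it appears to enforce, not the real code:

import pandas as pd

def report_if_merge_fail(df, check_col, merge_cols):
    """Plausible sketch: fail loudly if check_col is null anywhere after a
    merge, reporting which merge keys were left unmatched."""
    missing = df[df[check_col].isnull()]
    if len(missing) > 0:
        raise AssertionError(
            "Merge failed to add '{}' for keys:\n{}".format(
                check_col, missing[merge_cols].drop_duplicates()
            )
        )

df = pd.DataFrame({'nid': [1, 2], 'source': ['VR', None]})
# report_if_merge_fail(df, 'source', 'nid')  # would raise: nid 2 unmatched
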
Example #4
    def generate_splits(self, df):

        df = add_nid_metadata(
            df,
            add_cols='data_type_id',
            block_rerun=True,
            cache_dir=self.cache_dir,
            force_rerun=False,
        )
        df.loc[df['data_type_id'].isin([7, 5]), 'split_maternal'] = 1
        df.loc[df['split_maternal'].isnull(), 'split_maternal'] = 0
        df.loc[df['split_maternal'] == 0, 'pct_maternal'] = 1
        df.loc[df['split_maternal'] == 0,
               'pct_maternal_hiv'] = df['pct_maternal_hiv_vr']
        df.loc[df['split_maternal'] == 0, 'pct_hiv'] = 0
        df.drop('pct_maternal_hiv_vr', axis=1, inplace=True)
        return df
Example #5
    def generate_splits(self, df):
        """Create a column to indicate how the data should be split.

        (depends on source type)
        """
        df = add_nid_metadata(
            df,
            add_cols='data_type_id',
            block_rerun=True,
            cache_dir=self.cache_dir,
            force_rerun=False,
        )
        df.loc[df['data_type_id'].isin([7, 5]), 'split_maternal'] = 1
        df.loc[df['split_maternal'].isnull(), 'split_maternal'] = 0
        df.loc[df['split_maternal'] == 0, 'pct_maternal'] = 1
        df.loc[df['split_maternal'] == 0,
               'pct_maternal_hiv'] = df['pct_maternal_hiv_vr']
        df.loc[df['split_maternal'] == 0, 'pct_hiv'] = 0
        df.drop('pct_maternal_hiv_vr', axis=1, inplace=True)
        return df
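
The split flags rely on a common pandas idiom: set a column on a boolean mask, then backfill the remaining rows. A toy run of the same pattern with made-up values:

import pandas as pd

df = pd.DataFrame({
    'data_type_id': [7, 9, 5, 10],
    'pct_maternal_hiv_vr': [0.1, 0.2, 0.3, 0.4],
})

# flag the data types that need splitting, then default the rest to 0
df.loc[df['data_type_id'].isin([7, 5]), 'split_maternal'] = 1
df.loc[df['split_maternal'].isnull(), 'split_maternal'] = 0

# rows that are not split take their proportions straight from VR
df.loc[df['split_maternal'] == 0, 'pct_maternal'] = 1
df.loc[df['split_maternal'] == 0, 'pct_maternal_hiv'] = df['pct_maternal_hiv_vr']
print(df)
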
Example #6
    def special_cause_reassignment(self, df, code_system_id):
        """Replace the actual data cause under certain conditions.

        There are instances where a PI has good reason to
        believe that a certain group of deaths were assigned
        to the wrong cause, and it is known what cause to re-assign
        those deaths to. Implement here.

        This essentially allows mapping based on not just the cause
        and code system but based on other information like
        the location, NID, year, etc.

        It can also be used (sparingly) for hotfixes like
        changing all codes with values 'acause_digest_gastrititis'
        to be named 'acause_digest_gastritis'.

        Args:
            df (DataFrame): data with cause

        Returns:
            DataFrame: with any modifications
        """

        cache_args = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_dir': 'standard',
            'cache_results': False
        }
        # Some SRS codes get redistributed differently than
        # other ICD10 datasets
        df = add_nid_metadata(
            df, 'source', **cache_args
        )

        if (df['source'] == "India_SRS_states_report").any():
            print_log_message("Changing SRS codes to custom garbage groups")
            assert (df['source'] == "India_SRS_states_report").all()

            df = add_code_metadata(
                df, 'value', code_system_id=code_system_id,
                **cache_args
            )

            custom_grbg = pd.read_csv(
                self.cg.get_resource("srs_custom_garbage_groups")
            )
            custom_grbg = custom_grbg.query('active == 1')
            custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
            custom_grbg = add_code_metadata(
                custom_grbg, 'code_id', code_system_id=code_system_id,
                merge_col='value', **cache_args
            )
            custom_grbg = custom_grbg.rename(
                columns={'code_id': 'new_code_id'})
            custom_grbg = custom_grbg[['package_id', 'new_code_id']]

            gp_dfs = []
            for package_id in custom_grbg.package_id.unique():
                # this queries the database, but there should never be many
                # SRS jobs running at once, so it should be OK
                gp_df = get_garbage_from_package(
                    code_system_id, package_id, package_arg_type="package_id"
                )
                assert len(gp_df) != 0, \
                    "Found 0 codes for package {}".format(package_id)
                gp_dfs.append(gp_df)
            gp_df = pd.concat(gp_dfs, ignore_index=True)

            gp_df = gp_df.merge(custom_grbg, how='left')
            report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
            gp_df = gp_df[['value', 'new_code_id']]
            gp_df['value'] = gp_df['value'].str.strip()

            df = df.merge(gp_df, how='left', on='value')
            df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
            df['code_id'] = df['code_id'].astype(int)
            df = df.drop(['new_code_id', 'value'], axis=1)

        df = df.drop('source', axis=1)

        china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
        # J96.00: move the five-digit code to the four-digit J96.0 (this
        # should be a rule in formatting: only keep four-digit detail)
        five_dig_code = df['code_id'] == 13243
        df.loc[
            china_cdc_2008 & five_dig_code,
            'code_id'
        ] = 13242

        return df
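
The China CDC fix at the end shows the general shape of these special-case reassignments: build boolean masks from identifying columns, then overwrite code_id where the masks intersect. The same lines run on a toy frame:

import pandas as pd

df = pd.DataFrame({
    'nid': [270005, 270005, 111111],
    'extract_type_id': [2, 1, 2],
    'code_id': [13243, 13243, 13243],
})

# masks built from identifying metadata, exactly as in the method above
china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
five_dig_code = df['code_id'] == 13243

# only the row matching both conditions is remapped
df.loc[china_cdc_2008 & five_dig_code, 'code_id'] = 13242
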
Example #7
def main(years, qsub_out_dir, run_id, try_again=False):
    """
    This function runs all the other functions.
    """
    print("Submitting jobs...")
    for year in years:
        run_workers(year=year, run_id=run_id)
    print("Done submitting.")

    # wait for 20 minutes while checking for files
    print("Checking for files...")
    not_found = watch_jobs(years=years, data_dir=qsub_out_dir)

    # wait for all files to appear, and maybe relaunch if they do not:
    # if not all the files were found and we want to try again,
    # delete the remaining jobs and resubmit them.
    # deleting and THEN resubmitting is important, because
    # it prevents having two jobs altering the same file.
    if len(not_found) > 0 and try_again:
        print("Didn't find all files on the first try; Trying again...")

        # delete the remaining jobs
        print("Deleting remaining obsolete jobs...")
        qdel_obsolete_jobs()
        time.sleep(30)

        print("Re-submitting jobs that haven't yet completed...")
        # grab years from the list of unfinished jobs
        for year in [x[:4] for x in not_found]:
            run_workers(year=year, run_id=run_id)
        print("Done re-submitting.")

        print("Checking for files...")
        not_found = watch_jobs(years=years, data_dir=qsub_out_dir)
    # This assertion stops the script here. Within the context of the
    # empirical deaths run_all system, which checks for the output of this
    # master script, that means the run-all script won't find the output
    # file and will raise its own assertion error.
    assert len(not_found) == 0, \
        "Not all files present, still missing {}".format(not_found)

    # still want to sleep some more in case some files are still writing
    time.sleep(30)

    # delete the remaining jobs
    print("Deleting remaining obsolete jobs...")
    qdel_obsolete_jobs()

    print("Collecting job outputs...")
    data = collect_qsub_results(data_dir=qsub_out_dir)

    # filter down to just the location_years we want
    location_years = get_location_years()
    data = filter_by_location_and_year(data, location_years)

    # add nid metadata
    data = add_nid_metadata(df=data,
                            add_cols=['source', 'is_active'],
                            force_rerun=False,
                            cache_dir='standard')

    # filter out duplicates
    data = filter_duplicates(data.copy())

    # aggregate under one for certain loc-years
    data = aggregate_under_one(data)

    # check that there isn't any All Cause VR in the data
    assert_msg = ("There is all cause VR in the data; "
                  "This will lead to duplicates later in the process")
    assert (data.extract_type_id != 167).all(), assert_msg
    print("Done!")

    for col in ['sex_id', 'age_group_id', 'year_id', 'location_id']:
        data[col] = data[col].astype(int)

    data = data.drop(['extract_type_id', 'site_id', 'is_active'], axis=1)

    return data
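
watch_jobs is the one piece of the wait-and-retry loop not shown here. Below is a hypothetical sketch of the polling contract it appears to satisfy, returning the expected files that never appeared; the '{year}.csv' naming is an assumption, chosen only to be consistent with main() taking x[:4] as the year:

import os
import time

def watch_jobs(years, data_dir, timeout=20 * 60, poll_interval=30):
    """Hypothetical sketch, not the real function: poll for one output
    file per year and return the names that never appeared."""
    expected = {'{}.csv'.format(year) for year in years}
    missing = set(expected)
    deadline = time.time() + timeout
    while time.time() < deadline:
        missing = {f for f in expected
                   if not os.path.exists(os.path.join(data_dir, f))}
        if not missing:
            return []
        time.sleep(poll_interval)
    return sorted(missing)
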
Example #8
def get_model_data(model_group, location_hierarchy,
                   location_set_version_id, cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()

    # need to be strings for the later test that what comes after "VA-" is a
    # super region (otherwise we would have to compare ints, and what's after
    # "VA-" might not be convertible to an int)
    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')

    # location id here is the region id, and parent id is the super region id
    # becomes a dictionary from super region id to list of region ids
    super_region_to_region_ids = (
        super_region_to_region_ids[['location_id', 'parent_id']].groupby(
            'parent_id'
        ).apply(lambda df: list(set(df['location_id']))).to_dict()
    )

    regions_to_ids = location_hierarchy.query(
        'level == 2').set_index('ihme_loc_id')['region_id']

    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()

    model_group_filters = {}

    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = [8, 12]
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-Nepal-Burden":
            model_group_filters['source'] = "Nepal_Burden_VA"
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            # potential bug from GBD2016 - super region 158 keeps only
            # Pakistan, Nepal, and Bangladesh; it doesn't get India data.
            # Also keep Bhutan in case we ever have VA there
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD', 'BTN']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True

    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"

    # keep data by source/iso3/survey type
    # model groups follow MATERNAL-{source}-{iso3} format
    # except for the household surveys within Other_Maternal
    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = ["DHS", "RHS", "AHS",
                                                  "DLHS", "NFHS"]
        model_group_filters['iso3'] = model_group[-3:]

    # special malaria model groups for VA data
    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = [8, 12]
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    elif model_group == "CHAMPS":
        model_group_filters['data_type_id'] = [12]
    else:
        bad_model_group = True
    if bad_model_group:
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group)
        )

    model_df = get_claude_data(
        phase="aggregation",
        is_active=True,
        is_dropped=False,
        location_set_id=35,
        year_id=range(1980, 2050),
        assert_all_available=True,
        location_set_version_id=location_set_version_id,
        **model_group_filters
    )

    add_cols = ['code_system_id']

    if model_group.startswith(("VA", "MATERNAL", "malaria", "CHAMPS")) or \
            model_group in ["VR-RUS", "VR-R9"]:
        add_cols.append('source')

    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)

    # add on code_system_id
    model_df = add_nid_metadata(
        model_df, add_cols, force_rerun=False, block_rerun=True,
        cache_dir='standard', cache_results=False
    )
    if model_group == "VR-RUS" or model_group == "VR-R9":
        # treat this like Russia_FMD_1989_1998 for purpose of cause list,
        # as it has now been bridge mapped that way
        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message(
            "Setting code system to {cs} for {s} "
            "source: {n} rows changed".format(
                cs=replace_csid, s=replace_source, n=num_replace)
        )
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid

    report_if_merge_fail(
        model_df, 'code_system_id', ['nid', 'extract_type_id']
    )

    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)

    return model_df
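
The super-region lookup is built with a groupby/apply that turns level-2 hierarchy rows into a dict from super region id to its region ids. The same construction on a toy hierarchy:

import pandas as pd

# toy level-2 rows: location_id is a region, parent_id its super region
hierarchy = pd.DataFrame({
    'location_id': [5, 9, 21, 32],
    'parent_id': [4, 4, 31, 31],
})

super_region_to_region_ids = (
    hierarchy[['location_id', 'parent_id']]
    .groupby('parent_id')
    .apply(lambda df: list(set(df['location_id'])))
    .to_dict()
)
print(super_region_to_region_ids)  # {4: [5, 9], 31: [21, 32]}, list order may vary
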
Example #9
    def special_cause_reassignment(self, df, code_system_id):
        """Replace the actual data cause under certain conditions.

        This essentially allows mapping based on not just the cause
        and code system but based on other information like
        the location, NID, year, etc.

        Args:
            df (DataFrame): data with cause

        Returns:
            DataFrame: with any modifications
        """

        cache_args = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_dir': 'standard',
            'cache_results': False
        }
        # Some SRS codes get redistributed differently than
        # other ICD10 datasets
        df = add_nid_metadata(df, 'source', **cache_args)

        if (df['source'] == "India_SRS_states_report").any():
            print_log_message("Changing SRS codes to custom garbage groups")
            assert (df['source'] == "India_SRS_states_report").all()

            df = add_code_metadata(df,
                                   'value',
                                   code_system_id=code_system_id,
                                   **cache_args)

            custom_grbg = pd.read_csv(
                self.cg.get_resource("srs_custom_garbage_groups"))
            custom_grbg = custom_grbg.query('active == 1')
            custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
            custom_grbg = add_code_metadata(custom_grbg,
                                            'code_id',
                                            code_system_id=code_system_id,
                                            merge_col='value',
                                            **cache_args)
            custom_grbg = custom_grbg.rename(
                columns={'code_id': 'new_code_id'})
            custom_grbg = custom_grbg[['package_id', 'new_code_id']]

            gp_dfs = []
            for package_id in custom_grbg.package_id.unique():
                gp_df = get_garbage_from_package(code_system_id,
                                                 package_id,
                                                 package_arg_type="package_id")
                assert len(gp_df) != 0, \
                    "Found 0 codes for package {}".format(package_id)
                gp_dfs.append(gp_df)
            gp_df = pd.concat(gp_dfs, ignore_index=True)

            gp_df = gp_df.merge(custom_grbg, how='left')
            report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
            gp_df = gp_df[['value', 'new_code_id']]
            gp_df['value'] = gp_df['value'].str.strip()

            df = df.merge(gp_df, how='left', on='value')
            df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
            df['code_id'] = df['code_id'].astype(int)
            df = df.drop(['new_code_id', 'value'], axis=1)

        df = df.drop('source', axis=1)

        china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)

        five_dig_code = df['code_id'] == 13243
        df.loc[china_cdc_2008 & five_dig_code, 'code_id'] = 13242

        return df
Example #10
def get_model_data(model_group, location_hierarchy, location_set_version_id,
                   cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()

    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')

    super_region_to_region_ids = (super_region_to_region_ids[[
        'location_id', 'parent_id'
    ]].groupby('parent_id').apply(
        lambda df: list(set(df['location_id']))).to_dict())

    regions_to_ids = location_hierarchy.query('level == 2').set_index(
        'ihme_loc_id')['region_id']

    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()

    model_group_filters = {}

    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = 8
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True

    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"

    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = [
                "DHS", "RHS", "AHS", "DLHS", "NFHS"
            ]
        model_group_filters['iso3'] = model_group[-3:]

    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = 8
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    else:
        bad_model_group = True
    if bad_model_group:
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group))

    model_df = get_claude_data(phase="aggregation",
                               is_active=True,
                               is_dropped=False,
                               location_set_id=35,
                               year_id=range(1980, 2050),
                               assert_all_available=True,
                               location_set_version_id=location_set_version_id,
                               **model_group_filters)

    add_cols = ['code_system_id']

    if model_group.startswith(("VA", "MATERNAL", "malaria")) or \
            model_group in ["VR-RUS", "VR-R9"]:
        add_cols.append('source')

    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)

    # add on code_system_id
    model_df = add_nid_metadata(model_df,
                                add_cols,
                                force_rerun=False,
                                block_rerun=True,
                                cache_dir='standard',
                                cache_results=False)
    if model_group == "VR-RUS" or model_group == "VR-R9":

        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message("Setting code system to {cs} for {s} "
                          "source: {n} rows changed".format(cs=replace_csid,
                                                            s=replace_source,
                                                            n=num_replace))
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid

    report_if_merge_fail(model_df, 'code_system_id',
                         ['nid', 'extract_type_id'])

    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)

    return model_df
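
Both variants hand restrict_to_location_ids to get_claude_data as an exec_function along with the level-three location ids, which suggests regional VR groups are filtered down to national-level rows. A minimal sketch of what such a filter might look like (an assumption; the real function is not shown):

def restrict_to_location_ids(df, location_ids):
    """Hypothetical sketch: keep only rows whose location_id is in the
    allowed set."""
    return df[df['location_id'].isin(location_ids)]
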