Example #1
def get_all_cause_vr(location_set_version_id):
    """Pull age/sex split VR with all cause mortality."""
    all_cause_filters = {'extract_type_id': [167], 'location_set_id': 35}
    print_log_message("Pulling all cause VR")
    df = get_claude_data("formatted",
                         location_set_version_id=location_set_version_id,
                         **all_cause_filters)
    df = _collapse_to_group_cols(df)
    return df
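_collapse_to_group_cols is defined elsewhere in this module and not shown. A minimal sketch of what such a helper might do, assuming the usual demographic group columns and a deaths value column (the column names here are assumptions, not the project's actual implementation):

def _collapse_to_group_cols(df):
    # Hypothetical sketch: collapse to one row per demographic group,
    # summing deaths across any remaining detail columns.
    group_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']  # assumed
    return df.groupby(group_cols, as_index=False)['deaths'].sum()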
Example #3
def get_cod_vr(location_set_version_id, vr_filter=None):
    """Pull age/sex split VR with cause of death data."""
    dataset_filters = {
        'data_type_id': [9, 10],
        'location_set_id': 35,
        'is_active': True,
    }
    # optionally, only get VR data for specific sources/nids/iso3s/etc.
    if vr_filter is not None:
        dataset_filters.update(vr_filter)
    print_log_message("Pulling CoD VR")
    df = get_claude_data("disaggregation",
                         location_set_version_id=location_set_version_id,
                         **dataset_filters)
    df = _collapse_to_group_cols(df)
    return df
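A hypothetical call showing how the optional vr_filter narrows the pull; the filter keys mirror ones used elsewhere on this page, but the specific values are illustrative:

# Restrict the CoD VR pull to one country and a year range (illustrative values).
vr_filter = {'iso3': 'BRA', 'year_id': list(range(2000, 2011))}
df = get_cod_vr(location_set_version_id=420, vr_filter=vr_filter)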
Example #4
    def get_va_vr_sourcemetadata(self):
        """Pull VA + VR (and sample VR) + CHAMPS source metadata."""
        df = get_claude_data(phase="sourcemetadata", **self.dataset_filters)

        # get rid of Northern Ireland and Wales in 1980, nothing matches
        # get rid of GBR 1980 (it should be dropped from our DB...)
        df = df.query(
            '~((location_id == 433 | location_id == 434) & year_id == 1980)')

        # get rid of other maternal VR
        df = df.query('~(source == "Other_Maternal" & location_id == 38)')

        # collapse to remove extract type id
        group_cols = self.geo_cols + self.source_cols + \
            ['age_group_id', 'sex_id'] + self.meta_cols
        df = df.groupby(group_cols, as_index=False)[self.value_cols].sum()

        return df
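The drops above use DataFrame.query, where ~ negates a boolean expression and the pandas parser gives & and | lower precedence than comparisons. A self-contained illustration of the same pattern on toy data:

import pandas as pd

df = pd.DataFrame({
    'location_id': [433, 434, 6, 433],
    'year_id': [1980, 1980, 1980, 1990],
})
# Keep everything except the location 433/434 rows in 1980.
df = df.query('~((location_id == 433 | location_id == 434) & year_id == 1980)')
print(df)  # the (6, 1980) and (433, 1990) rows remain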
Example #5
def pull_vr_old(cause_ids, start_year, end_year):

    assert type(start_year) is int, "Starting year must be an integer"
    assert type(end_year) is int, "Ending year must be an integer"
    assert end_year >= start_year, "End year must be greater than or equal to starting year"
    year_ids = list(range(start_year, end_year + 1))
    if type(cause_ids) is int:
        cause_ids = [cause_ids]
    assert type(cause_ids) is list, \
        "Cause IDs must be passed as a list of ints"

    vr_old = get_claude_data("disaggregation",
                             year_id=year_ids,
                             data_type_id=9,
                             verbose=True,
                             source="ICD7A",
                             location_set_id=35,
                             exec_function=subset_causes,
                             exec_function_args=[cause_ids])
    return vr_old
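subset_causes is passed as an exec_function and is not shown here. A plausible sketch, assuming the convention that the function receives the pulled dataframe followed by exec_function_args (the cause ID list) and returns the filtered frame:

def subset_causes(df, cause_ids):
    # Hypothetical sketch: keep only rows for the requested causes.
    return df[df['cause_id'].isin(cause_ids)].reset_index(drop=True)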
Example #6
def main(nid, extract_type_id, code_system_id, launch_set_id, remove_decimal):
    """Main method"""

    start_time = time.time()
    df = get_claude_data(
        "disaggregation", nid=nid, extract_type_id=extract_type_id
    )

    data_type_id = get_value_from_nid(nid, 'data_type_id', extract_type_id=extract_type_id)
    iso3 = get_value_from_nid(nid, 'iso3', extract_type_id=extract_type_id)

    df = run_pipeline(df, nid, extract_type_id, code_system_id, remove_decimal, data_type_id, iso3)

    run_time = time.time() - start_time
    print_log_message("Finished in {} seconds".format(run_time))

    write_phase_output(
        df, "misdiagnosiscorrection", nid, extract_type_id, launch_set_id
    )
    return df
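main() reads like a phase entry point launched with command-line arguments. A hypothetical launcher; the argument order and string-to-bool parsing are assumptions mirroring main()'s signature:

if __name__ == "__main__":
    import sys
    nid = int(sys.argv[1])
    extract_type_id = int(sys.argv[2])
    code_system_id = int(sys.argv[3])
    launch_set_id = int(sys.argv[4])
    remove_decimal = sys.argv[5] == "True"  # assumed convention
    main(nid, extract_type_id, code_system_id, launch_set_id, remove_decimal)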
Example #7
def pull_vr_data_for_rdp_reg(reg_spec, location_hierarchy, data_id, small_test=False,
                             vr_pull_timestamp=None):
    """Pull vr used to make redistribution proportions.

    If vr_pull_timestamp is passed, and it does in fact exist, then this will
    just read that. Otherwise, it runs a custom get_claude_data based on
    the passed regression specification.
    """

    shared_package_id = reg_spec['shared_package_id']

    if vr_pull_timestamp is not None:
        timestamp = vr_pull_timestamp
    else:
        timestamp = cod_timestamp()

    outdir = "FILEPATH".format(RDP_REG_DIR, shared_package_id)
    outpath = "FILEPATH".format(outdir, data_id, timestamp)

    if vr_pull_timestamp is not None:
        print_log_message("Reading VR data pulled on {}".format(vr_pull_timestamp))
        if not os.path.exists(outpath):
            raise ValueError(
                "Passed [vr_pull_timestamp={}], but {} does not exist. "
                "Need to either pass a different version that does exist, or"
                " run a new vr pull by passing vr_pull_timestamp=None.".format(
                    vr_pull_timestamp, outpath)
            )
        df = pd.read_csv(outpath)

    else:
        print_log_message(
            "Pulling a fresh version of VR with timestamp {}".format(
                timestamp)
        )
        # regressions only use detailed code systems
        code_system_ids = MODEL_DATA_CODE_SYSTEMS

        # regressions only use national-level data to avoid biasing the sample
        # toward subnational datasets
        country_loc_map = get_country_loc_id_map(location_hierarchy)

        if small_test:
            year_id = [2010, 2011]
            print("Pulling data for year subset: {}".format(year_id))
        else:
            year_id = range(1980, 2018)

        dfs = []
        for code_system_id in code_system_ids:
            print_log_message("Code system id: {}".format(code_system_id))
            garbage_code_ids = get_package_code_ids(reg_spec, code_system_id)
            target_cause_ids = reg_spec['target_cause_ids']
            df = get_claude_data(
                "disaggregation",
                data_type_id=9, code_system_id=code_system_id,
                is_active=True, year_id=year_id, location_set_id=35,
                exec_function=collapse_to_reg_df,
                exec_function_args=[garbage_code_ids, target_cause_ids, country_loc_map],
                attach_launch_set_id=True
            )
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)

        df['vr_pull_timestamp'] = timestamp

        df.to_csv(outpath, index=False)

    return df
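cod_timestamp is imported from elsewhere. A minimal stand-in, assuming all it must do is produce a unique, filesystem-safe version string:

from datetime import datetime

def cod_timestamp():
    # Hypothetical stand-in: the exact format is an assumption.
    return datetime.now().strftime("%Y_%m_%d_%H%M%S")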
Example #8
def run_phase(nid, extract_type_id, launch_set_id, data_type_id, source,
              model_group, malaria_model_group):
    """Chain together computational elements for NR."""
    # types that take noise reduction
    # maternal logic: noise reduce household surveys (in Other_Maternal source)
    # and sources in maternal_nr_sources
    run_noise_reduction = True
    run_by_cause = False

    # determine above values using the source and model group
    if model_group == "NO_NR":
        run_noise_reduction = False

    if model_group_is_run_by_cause(model_group):
        run_by_cause = True

    if run_noise_reduction:
        if run_by_cause:

            filepath = "FILEPATH".format(nr=NR_DIR,
                                         model_group=model_group,
                                         lsid=launch_set_id)
            # NR ran by cause, so pick up the causes that ran and read them in
            causes = sorted(list(pd.read_csv(filepath)['cause_id'].unique()))
            print_log_message("Reading cause-specific files".format(
                len(causes)))
        else:
            # cause_id = None means all causes
            causes = [None]
            print_log_message("Reducing cause-appended file")

        # when causes = [None], this will not actually run by cause: there
        # will be one entry in the loop and it will do all the causes. The
        # pd.concat only adds ~0.7 seconds to the total time compared with
        # running outside of a loop (from some testing in a notebook)
        dfs = []
        for cause_id in causes:
            df = get_noise_reduction_model_result(nid,
                                                  extract_type_id,
                                                  launch_set_id,
                                                  model_group,
                                                  malaria_model_group,
                                                  cause_id=cause_id)
            if 'Unnamed: 0' in df.columns:
                df = df.drop('Unnamed: 0', axis=1)
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)

        print_log_message(
            "Running Bayesian noise reduction algorithm using fitted priors")
        noise_reducer = NoiseReducer(data_type_id, source)
        df = noise_reducer.get_computed_dataframe(df)

    else:
        # simply get the aggregated result
        print_log_message("Skipping noise reduction for source {} and model "
                          "group {}".format(source, model_group))
        df = get_claude_data("aggregation",
                             nid=nid,
                             extract_type_id=extract_type_id)

    return df
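model_group_is_run_by_cause is not shown in this example. A minimal sketch of the idea; the VR-prefix rule below is an assumption, not the project's actual logic:

def model_group_is_run_by_cause(model_group):
    # Hypothetical: big VR model groups get noise-reduced one cause at a
    # time, so their results come back as per-cause files.
    return model_group.startswith("VR-")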
Example #9
def get_model_data(model_group, location_hierarchy,
                   location_set_version_id, cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()

    # need to be strings for the later test that what comes after "VA-" is a
    # super region (otherwise, we would have to compare ints, and what's after
    # "VA-" might not be convertible to an int)
    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')

    # location id here is the region id, and parent id is the super region id
    # becomes a dictionary from super region id to list of region ids
    super_region_to_region_ids = (
        super_region_to_region_ids[['location_id', 'parent_id']].groupby(
            'parent_id'
        ).apply(lambda df: list(set(df['location_id']))).to_dict()
    )

    regions_to_ids = location_hierarchy.query(
        'level == 2').set_index('ihme_loc_id')['region_id']

    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()

    model_group_filters = {}

    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = [8, 12]
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-Nepal-Burden":
            model_group_filters['source'] = "Nepal_Burden_VA"
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            # potential bug from GBD2016 - super region 158 keeps only
            # Pakistan, Nepal, and Bangladesh, and doesn't get India data.
            # Also keep Bhutan in case we ever have VA there
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD', 'BTN']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True

    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"

    # keep data by source/iso3/survey type
    # model groups follow MATERNAL-{source}-{iso3} format
    # except for the household surveys within Other_Maternal
    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = ["DHS", "RHS", "AHS",
                                                  "DLHS", "NFHS"]
        model_group_filters['iso3'] = model_group[-3:]

    # special malaria model groups for VA data
    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = [8, 12]
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    elif model_group == "CHAMPS":
        model_group_filters['data_type_id'] = [12]
    else:
        bad_model_group = True
    if bad_model_group:
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group)
        )

    model_df = get_claude_data(
        phase="aggregation",
        is_active=True,
        is_dropped=False,
        location_set_id=35,
        year_id=range(1980, 2050),
        assert_all_available=True,
        location_set_version_id=location_set_version_id,
        **model_group_filters
    )

    add_cols = ['code_system_id']

    if model_group.startswith(("VA", "MATERNAL", "malaria", "CHAMPS")) or \
            model_group in ["VR-RUS", "VR-R9"]:
        add_cols.append('source')

    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)

    # add on code_system_id
    model_df = add_nid_metadata(
        model_df, add_cols, force_rerun=False, block_rerun=True,
        cache_dir='standard', cache_results=False
    )
    if model_group == "VR-RUS" or model_group == "VR-R9":
        # treat this like Russia_FMD_1989_1998 for purpose of cause list,
        # as it has now been bridge mapped that way
        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message(
            "Setting code system to {cs} for {s} "
            "source: {n} rows changed".format(
                cs=replace_csid, s=replace_source, n=num_replace)
        )
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid

    report_if_merge_fail(
        model_df, 'code_system_id', ['nid', 'extract_type_id']
    )

    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)

    return model_df
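restrict_to_location_ids is wired in above as an exec_function for regional VR groups. A plausible sketch, assuming it receives the dataframe plus exec_function_args (the allowed national location IDs):

def restrict_to_location_ids(df, location_ids):
    # Hypothetical sketch: keep only level-3 (national) locations so a
    # regional model group is not dominated by subnational rows.
    return df[df['location_id'].isin(location_ids)].reset_index(drop=True)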
Example #10
def get_model_data(model_group, location_hierarchy, location_set_version_id,
                   cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()

    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')

    super_region_to_region_ids = (super_region_to_region_ids[[
        'location_id', 'parent_id'
    ]].groupby('parent_id').apply(
        lambda df: list(set(df['location_id']))).to_dict())

    regions_to_ids = location_hierarchy.query('level == 2').set_index(
        'ihme_loc_id')['region_id']

    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()

    model_group_filters = {}

    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = 8
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True

    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"

    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = [
                "DHS", "RHS", "AHS", "DLHS", "NFHS"
            ]
        model_group_filters['iso3'] = model_group[-3:]

    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = 8
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    else:
        bad_model_group = True
    if bad_model_group:
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group))

    model_df = get_claude_data(phase="aggregation",
                               is_active=True,
                               is_dropped=False,
                               location_set_id=35,
                               year_id=range(1980, 2050),
                               assert_all_available=True,
                               location_set_version_id=location_set_version_id,
                               **model_group_filters)

    add_cols = ['code_system_id']

    if model_group.startswith("VA") or model_group.startswith("MATERNAL") or \
            model_group in ["VR-RUS", "VR-R9"] or model_group.startswith('malaria'):
        add_cols.append('source')

    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)

    # add on code_system_id
    model_df = add_nid_metadata(model_df,
                                add_cols,
                                force_rerun=False,
                                block_rerun=True,
                                cache_dir='standard',
                                cache_results=False)
    if model_group == "VR-RUS" or model_group == "VR-R9":

        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message("Setting code system to {cs} for {s} "
                          "source: {n} rows changed".format(cs=replace_csid,
                                                            s=replace_source,
                                                            n=num_replace))
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid

    report_if_merge_fail(model_df, 'code_system_id',
                         ['nid', 'extract_type_id'])

    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)

    return model_df
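Both versions of get_model_data build super_region_to_region_ids with the same groupby-then-apply trick. A self-contained illustration on a toy hierarchy (the IDs are made up):

import pandas as pd

# Toy level-2 rows: location_id is a region, parent_id its super region.
regions = pd.DataFrame({
    'location_id': [64, 100, 73, 96],
    'parent_id': [4, 4, 31, 31],
})
super_region_to_region_ids = (
    regions.groupby('parent_id')
    .apply(lambda df: list(set(df['location_id'])))
    .to_dict()
)
print(super_region_to_region_ids)  # {4: [64, 100], 31: [73, 96]} (order may vary)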