예제 #1
0
    def __init__(self):
        """Set up configuration, cached reference tables and naming constants.

        Cached resources are requested with ``force_rerun=True`` and
        ``block_rerun=False``, using the configurator's ``db_cache``
        directory as the cache location.
        """
        config = Configurator('standard')
        self.cg = config
        self.cache_dir = config.get_directory('db_cache')
        # set test to "True" if you do not want to write any output files
        self.test = False

        # options forwarded to every cached getter called below
        cache_options = dict(
            force_rerun=True,
            block_rerun=False,
            cache_dir=self.cache_dir,
        )
        self.cache_options = cache_options
        self.dataset_filters = dict(
            data_type_id=[8, 9, 10, 12],
            location_set_id=35,
            is_active=True,
            year_id=range(1980, 2050),
        )
        self.national_nids = config.get_resource("nid_replacements")

        # reference tables
        self.completeness = config.get_resource("completeness")
        env_run_id = config.get_id('env_run')
        self.env_meta_df = get_env(env_run_id=env_run_id, **cache_options)
        location_set_version_id = config.get_id('location_set_version')
        self.location_meta_df = get_current_location_hierarchy(
            location_set_version_id=location_set_version_id, **cache_options)
        cod_age_df = get_cod_ages(**cache_options)
        self.cod_ages = list(cod_age_df['age_group_id'].unique())

        # column groupings and other identifiers
        self.source_cols = ["source", "nid", "data_type_id"]
        self.geo_cols = ["location_id", "year_id"]
        self.meta_cols = ["nationally_representative", "detail_level_id"]
        self.value_cols = ['deaths']
        self.year_end = config.get_id('year_end')
        self.full_time_series = "full_time_series"

        # output locations; the archive lives directly under out_dir
        out_dir = "FILEPATH"
        self.current_best_version = "2018_04_03_151739"
        self.out_dir = out_dir
        self.arch_dir = "{}/_archive".format(out_dir)
        self.timestamp = cod_timestamp()
예제 #2
0
    def get_age_weight_df(self):
        """Build age weights from the mortality envelope.

        We have shifted to pulling age weights based on mortality information after a
        decision by USERNAME and USERNAME. The method below replaces pulling the population
        based weights out of the db with the "get_age_weights" function. - 07/10/2019

        Returns:
            DataFrame with the columns ``age_group_id`` and
            ``age_group_weight_value``. Each weight is the age group's
            ``mean_env`` (summed over years >= 2010 for location_id 1,
            sex_id 3) divided by the age_group_id 22 total.

        Raises:
            AssertionError: if the summed deaths of the retained age groups
                (excluding age groups 21 and 28) are not within 1% of the
                age_group_id 22 total.
        """
        df = get_env(env_run_id=self.cg.get_id('env_run'),
                     force_rerun=False,
                     block_rerun=True)
        # get global, both sex, for 2010 onward (the filter is inclusive)
        df = df.query("location_id == 1 & sex_id == 3 & year_id >= 2010")
        # collapse out year
        df = df.groupby(['age_group_id', 'location_id', 'sex_id'],
                        as_index=False).mean_env.sum()
        # total deaths for weights
        total = df.loc[df.age_group_id == 22]['mean_env'].iloc[0]
        # get the ages we care about (cod ages, under 1, and 80+)
        age_df = get_cod_ages()
        ages = age_df.age_group_id.unique().tolist()
        ages += [21, 28]
        # limit env df to relevant ages
        df = df.loc[df.age_group_id.isin(ages)]
        # group by age, and then make weights
        df = df.groupby('age_group_id', as_index=False).mean_env.sum()
        df['weight'] = df['mean_env'] / total
        # some renaming
        df.rename(columns={'weight': 'age_group_weight_value'}, inplace=True)

        # do a quick check to make sure the death totals used to create weights are sensible
        # just making sure age specific totals are within 1% of the all age total
        age_specific_total = df.loc[
            ~df.age_group_id.isin([21, 28])].mean_env.sum()
        check_val = abs((age_specific_total / total) - 1)
        # Raise explicitly instead of using a bare ``assert`` so the check
        # still runs under ``python -O``; ``not (x < y)`` also fails on NaN.
        if not (check_val < 0.01):
            raise AssertionError(
                "Age specific deaths differ from the all age total by "
                "{}; expected a relative difference below 0.01".format(
                    check_val))

        df = df[['age_group_id', 'age_group_weight_value']]
        return df
def run_pipeline(nid,
                 extract_type_id,
                 launch_set_id,
                 df,
                 code_system_id,
                 cause_set_version_id,
                 location_set_version_id,
                 pop_run_id,
                 env_run_id,
                 distribution_set_version_id,
                 diagnostic=False):
    """Run overrides, scope filtering, mapping, splitting and corrections.

    The steps run in this order: cause overrides, dropping out-of-scope
    data, cause mapping, age-sex splitting, restrictions correction, and
    finally two source-specific adjustments (cc_code calculation and the
    Maori/non-Maori correction). If nothing is left after the scope filter,
    the empty frame is returned and the remaining steps are skipped.

    When ``diagnostic`` is true, the mapped frame and the splitter's
    diagnostic frame are written with ``write_phase_output`` under the
    'diagnostic' sub directory.

    Returns the processed dataframe.
    """
    cache_options = dict(
        force_rerun=False,
        block_rerun=True,
        cache_results=False,
        cache_dir=CONF.get_directory('FILEPATH'),
        verbose=False,
    )

    def save_diagnostic(frame, phase_name):
        # shared writer for the per-phase diagnostic outputs
        write_phase_output(frame,
                           phase_name,
                           nid,
                           extract_type_id,
                           launch_set_id,
                           sub_dirs='diagnostic')

    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)
    code_map = get_cause_map(code_system_id=code_system_id, **cache_options)
    source = get_value_from_nid(nid, "source", extract_type_id)

    print("Overriding causes when necessary")
    df = overrides(df, location_meta_df)

    print("Dropping data out of scope")
    df = drop_data_out_of_scope(df, location_meta_df, source)
    if len(df) == 0:
        # nothing survived the scope filter; skip every later phase
        return df

    # make sure six minor territories are grouped correctly
    assert_no_six_minor_territories(df)

    # cause mapping
    print("\nDeaths before MAPPING: {}".format(df.deaths.sum()))
    cause_mapper = GBDCauseMapper(cause_set_version_id, code_map)
    df = cause_mapper.get_computed_dataframe(df, code_system_id)
    if diagnostic:
        save_diagnostic(df, 'mapping')

    # age sex splitting
    print("\nDeaths before AGESEXSPLIT: {}".format(df.deaths.sum()))
    # NOTE(review): collect_diagnostics is hard-coded to False even though
    # the diagnostic frame is requested below when `diagnostic` is true --
    # confirm get_diagnostic_dataframe() still returns something useful.
    splitter = AgeSexSplitter(cause_set_version_id,
                              pop_run_id,
                              distribution_set_version_id,
                              verbose=True,
                              collect_diagnostics=False)
    df = splitter.get_computed_dataframe(df, location_meta_df)
    if diagnostic:
        save_diagnostic(splitter.get_diagnostic_dataframe(), 'agesexsplit')

    # restrictions corrections
    print("\nDeaths before CORRECTIONS: {}".format(df.deaths.sum()))
    corrector = RestrictionsCorrector(code_system_id,
                                      cause_set_version_id,
                                      collect_diagnostics=False,
                                      verbose=True)
    df = corrector.get_computed_dataframe(df)

    # calculate cc_code for some sources
    if source in ('Iran_maternal_surveillance', 'Iran_forensic'):
        env_meta_df = get_env(env_run_id=env_run_id, **cache_options)
        df = calculate_cc_code(df, env_meta_df, code_map)
        print("\nDeaths after adding cc_code: {}".format(df.deaths.sum()))

    # adjust deaths for New Zealand by maori/non-maori ethnicities
    if source in ("NZL_MOH_ICD9", "NZL_MOH_ICD10"):
        df = correct_maori_non_maori_deaths(df)
        print("\nDeaths after Maori/non-Maori adjustment: {}".format(
            df.deaths.sum()))

    print("\nDeaths at END: {}".format(df.deaths.sum()))
    return df
예제 #4
0
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    """Rake, filter, floor and aggregate a dataset, then build final metrics.

    Steps, in order: optional raking of sub national estimates (double
    raking for the 'MATERNAL-HH_SURVEYS-IND' model group), dropping rows
    with zero sample size or year_id before 1980, enforcing age/sex
    restrictions, fitting to a non-zero floor, age aggregation, optional
    redistribution-variance metrics, and clamping the cause fraction
    columns to the range [0, 1].

    Args:
        df: dataframe to process.
        nid, extract_type_id: identify the dataset; used to look up its
            source, data type and (for Other_Maternal) model group.
        env_run_id, pop_run_id, location_set_version_id,
        cause_set_version_id: versions of the cached resources to load.

    Returns:
        The processed dataframe restricted to the phase output columns
        (plus the redistribution variance columns when they were computed).

    Raises:
        AssertionError: if an Other_Maternal dataset does not have exactly
            one model group.
    """
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid,
        'source',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid,
        'data_type_id',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)

    # ************************************************************
    # RAKING
    # ************************************************************
    # Rake if appropriate based on this logic
    if ((data_type_id in [8, 9, 10] and (source != 'Other_Maternal'))
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)

    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(nid,
                                    extract_type_id,
                                    block_rerun=True,
                                    force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]

        if "HH_SURVEYS" in model_group:
            if model_group == 'MATERNAL-HH_SURVEYS-IND':
                print_log_message(
                    "Raking sub national estimates,"
                    " applying double raking for India Maternal")
                raker = Raker(df, source, double=True)
                df = raker.get_computed_dataframe(location_hierarchy)
            else:
                print_log_message("Raking sub national estimates")
                raker = Raker(df, source)
                df = raker.get_computed_dataframe(location_hierarchy)

    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************

    # data with zero sample size is almost certainly some anomalous result
    # of a program generating data it shouldn't have, and it shouldn't be
    # included in codem models. Was probably already dropped, anyway, before
    # running noise reduction.
    df = df.query('sample_size != 0')

    # uploading data before 1980 is a waste of space because neither codem
    # nor codviz use it
    df = df.loc[df['year_id'] >= 1980]

    print_log_message("Enforcing age sex restrictions")
    # this actually drops data from the dataframe if it violates age/sex
    # restrictions (e.g. male maternity disorders)
    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************

    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df, cause_meta_df)

    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************

    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()

    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw', 'cf_rd',
        'extract_type_id', 'location_id', 'nid', 'sample_size', 'sex_id',
        'site_id', 'year_id'
    ]
    # Use draws to make metrics for uncertainty to
    # be used by CODEm and CoDViz
    # also creates cf_final from mean of draws
    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]

    # we did this in the old code-- no cfs over 1 nor below 0
    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0

    df = df[final_cols]

    return df
예제 #5
0
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    """Rake, filter, floor and aggregate a dataset, then build final metrics.

    Loads the cached reference tables, optionally rakes sub national
    estimates, drops rows with zero sample size or year_id before 1980,
    enforces age/sex restrictions, fits to a non-zero floor, aggregates
    ages, optionally adds redistribution variance metrics, clamps the cause
    fraction columns to [0, 1] and returns only the phase output columns.
    """
    db_cache_dir = CONF.get_directory('db_cache')

    # both nid lookups share the same keyword arguments
    nid_lookup_kwargs = dict(
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id,
    )
    source = get_value_from_nid(nid, 'source', **nid_lookup_kwargs)
    data_type_id = get_value_from_nid(nid, 'data_type_id',
                                      **nid_lookup_kwargs)

    cache_kwargs = dict(
        force_rerun=False,
        block_rerun=True,
        cache_dir=db_cache_dir,
        cache_results=False,
    )

    # ------------------------------------------------------------
    # cached metadata
    # ------------------------------------------------------------
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_kwargs)
    pop_df = get_pop(pop_run_id=pop_run_id, **cache_kwargs)
    env_df = get_env(env_run_id=env_run_id, **cache_kwargs)
    age_weight_df = get_age_weights(**cache_kwargs)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **cache_kwargs)
    age_meta_df = get_ages(**cache_kwargs)

    # ------------------------------------------------------------
    # raking: decide first, then rake in a single place
    # ------------------------------------------------------------
    should_rake = False
    if ((data_type_id in [8, 9, 10] and source != "Other_Maternal")
            or source in MATERNAL_NR_SOURCES):
        should_rake = source not in NOT_RAKED_SOURCES
    elif source == "Other_Maternal":
        # for the Other_Maternal source we only rake household surveys
        datasets = get_datasets(nid,
                                extract_type_id,
                                block_rerun=True,
                                force_rerun=False)
        model_groups = datasets.model_group.unique()
        assert len(model_groups) == 1
        should_rake = "HH_SURVEYS" in model_groups[0]

    if should_rake:
        print_log_message("Raking sub national estimates")
        df = Raker(df, source).get_computed_dataframe(location_hierarchy)

    # ------------------------------------------------------------
    # drop zero sample size, pre-1980 and restricted age/sex data
    # ------------------------------------------------------------
    nonzero_sample = df.query('sample_size != 0')
    df = nonzero_sample.loc[nonzero_sample['year_id'] >= 1980]

    print_log_message("Enforcing age sex restrictions")
    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ------------------------------------------------------------
    # fit each draw to non-zero floor
    # ------------------------------------------------------------
    print_log_message("Fitting to non-zero floor...")
    df = NonZeroFloorer(df).get_computed_dataframe(pop_df, env_df,
                                                   cause_meta_df)

    # ------------------------------------------------------------
    # age aggregation
    # ------------------------------------------------------------
    print_log_message("Creating age standardized and all ages groups")
    df = AgeAggregator(df, pop_df, env_df,
                       age_weight_df).get_computed_dataframe()

    # ------------------------------------------------------------
    # CODEm and CoDViz metrics for uncertainty
    # ------------------------------------------------------------
    # columns that should be present in the phase output
    output_columns = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw', 'cf_rd',
        'extract_type_id', 'location_id', 'nid', 'sample_size', 'sex_id',
        'site_id', 'year_id'
    ]

    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        output_columns.extend([
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ])

    # clamp every cause fraction column into [0, 1]
    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        above_one = df[cf_col] > 1
        df.loc[above_one, cf_col] = 1
        below_zero = df[cf_col] < 0
        df.loc[below_zero, cf_col] = 0

    return df[output_columns]