def __init__(self):
    """Set up configuration, cached DB resources, and run identifiers.

    All attributes assigned here are inputs for the rest of the run:
    cache settings, dataset filters, cached DB metadata, column-name
    groups, and output directory paths.
    """
    self.cg = Configurator('standard')
    self.cache_dir = self.cg.get_directory('db_cache')
    # if you do not want to write any output files then set test to "True"
    self.test = False
    # options passed through to the cached-DB helper functions
    self.cache_options = {
        'force_rerun': True,
        'block_rerun': False,
        'cache_dir': self.cache_dir
    }
    # filters used to select which datasets are in scope for this run
    # NOTE(review): data_type_id values 8, 9, 10, 12 — meaning of each id
    # comes from the CoD database; confirm against the data_type table
    self.dataset_filters = {
        'data_type_id': [8, 9, 10, 12],
        'location_set_id': 35,
        'is_active': True,
        'year_id': range(1980, 2050)
    }
    # resources
    self.national_nids = self.cg.get_resource("nid_replacements")
    self.completeness = self.cg.get_resource("completeness")
    # cached DB metadata: mortality envelope and location hierarchy
    self.env_meta_df = get_env(env_run_id=self.cg.get_id('env_run'),
                               **self.cache_options)
    self.location_meta_df = get_current_location_hierarchy(
        location_set_version_id=self.cg.get_id('location_set_version'),
        **self.cache_options)
    # unique age_group_ids used by CoD
    self.cod_ages = list(
        get_cod_ages(**self.cache_options)['age_group_id'].unique())
    # identifiers
    self.source_cols = ["source", "nid", "data_type_id"]
    self.geo_cols = ["location_id", "year_id"]
    self.meta_cols = ["nationally_representative", "detail_level_id"]
    self.value_cols = ['deaths']
    self.year_end = self.cg.get_id('year_end')
    self.full_time_series = "full_time_series"
    # directories
    # NOTE(review): current_best_version is a hard-coded timestamp of a
    # prior run — presumably updated by hand between runs; confirm
    self.current_best_version = "2018_04_03_151739"
    self.out_dir = "FILEPATH"
    self.arch_dir = "{}/_archive".format(self.out_dir)
    self.timestamp = cod_timestamp()
def get_age_weight_df(self):
    """Build age weights from mortality-envelope deaths.

    We have shifted to pulling age weights based on mortality information
    after a decision by USERNAME and USERNAME. The method below replaces
    pulling the population based weights out of the db with the
    "get_age_weights" function. - 07/10/2019

    Returns:
        DataFrame with columns ['age_group_id', 'age_group_weight_value'],
        one row per relevant age group (cod ages, under 1, and 80+), where
        the weight is each age's share of all-age envelope deaths.

    Raises:
        ValueError: if the all-ages envelope row is missing, or if the
            age-specific totals differ from the all-age total by 1% or
            more (sanity check on the envelope pull).
    """
    df = get_env(env_run_id=self.cg.get_id('env_run'),
                 force_rerun=False, block_rerun=True)
    # get global, both sex, for all years after 2010
    df = df.query("location_id == 1 & sex_id == 3 & year_id >= 2010")
    # collapse out year
    df = df.groupby(['age_group_id', 'location_id', 'sex_id'],
                    as_index=False).mean_env.sum()
    # total deaths for weights (age_group_id 22 = all ages); guard the
    # lookup so a missing row raises a clear error instead of IndexError
    all_age_rows = df.loc[df.age_group_id == 22]['mean_env']
    if len(all_age_rows) == 0:
        raise ValueError(
            "Envelope pull is missing the all-ages row (age_group_id 22); "
            "cannot build age weights.")
    total = all_age_rows.iloc[0]
    # get the ages we care about (cod ages, under 1, and 80+)
    age_df = get_cod_ages()
    ages = age_df.age_group_id.unique().tolist()
    ages += [21, 28]
    # limit env df to relevant ages
    df = df.loc[df.age_group_id.isin(ages)]
    # group by age, and then make weights
    df = df.groupby('age_group_id', as_index=False).mean_env.sum()
    df['weight'] = df['mean_env'] / total
    # some renaming (plain assignment instead of inplace=True)
    df = df.rename(columns={'weight': 'age_group_weight_value'})
    # do a quick check to make sure the death totals used to create weights
    # are sensible: age specific totals within 1% of the all age total.
    # Raise explicitly rather than assert so the check survives `python -O`.
    check_val = abs(
        (df.loc[~df.age_group_id.isin([21, 28])].mean_env.sum() / total) - 1)
    if check_val >= 0.01:
        raise ValueError(
            "Age-specific death totals differ from the all-age total by "
            "{:.2%}; expected less than 1%".format(check_val))
    df = df[['age_group_id', 'age_group_weight_value']]
    return df
def run_pipeline(nid, extract_type_id, launch_set_id, df, code_system_id,
                 cause_set_version_id, location_set_version_id, pop_run_id,
                 env_run_id, distribution_set_version_id, diagnostic=False):
    """Run the full pipeline: mapping, age/sex splitting, and corrections.

    Takes raw deaths data for one (nid, extract_type_id) and pushes it
    through cause mapping, age-sex splitting, restrictions corrections,
    and source-specific adjustments (cc_code for Iran sources,
    Maori/non-Maori correction for New Zealand).

    Args:
        nid: dataset NID being processed.
        extract_type_id: extraction id within the NID.
        launch_set_id: id used when writing diagnostic phase output.
        df: input deaths DataFrame; must have a 'deaths' column.
        code_system_id / cause_set_version_id / location_set_version_id /
            pop_run_id / env_run_id / distribution_set_version_id:
            versioned DB ids passed through to the helpers below.
        diagnostic: when True, writes intermediate mapping and
            agesexsplit outputs via write_phase_output.

    Returns:
        The processed DataFrame (may be empty if everything was dropped
        as out of scope; the processing steps are skipped in that case).
    """
    cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_results': False,
        'cache_dir': CONF.get_directory('FILEPATH'),
        'verbose': False
    }
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)
    code_map = get_cause_map(code_system_id=code_system_id, **cache_options)
    source = get_value_from_nid(nid, "source", extract_type_id)
    print("Overriding causes when necessary")
    df = overrides(df, location_meta_df)
    print("Dropping data out of scope")
    df = drop_data_out_of_scope(df, location_meta_df, source)
    if len(df) > 0:
        # make sure six minor territories are grouped correctly
        assert_no_six_minor_territories(df)
        # run mapping
        print("\nDeaths before MAPPING: {}".format(df.deaths.sum()))
        Mapper = GBDCauseMapper(cause_set_version_id, code_map)
        df = Mapper.get_computed_dataframe(df, code_system_id)
        if diagnostic:
            write_phase_output(df, 'mapping', nid, extract_type_id,
                               launch_set_id, sub_dirs='diagnostic')
        print("\nDeaths before AGESEXSPLIT: {}".format(df.deaths.sum()))
        # run age sex splitting
        MySplitter = AgeSexSplitter(cause_set_version_id, pop_run_id,
                                    distribution_set_version_id,
                                    verbose=True, collect_diagnostics=False)
        df = MySplitter.get_computed_dataframe(df, location_meta_df)
        if diagnostic:
            diag_df = MySplitter.get_diagnostic_dataframe()
            write_phase_output(diag_df, 'agesexsplit', nid, extract_type_id,
                               launch_set_id, sub_dirs='diagnostic')
        print("\nDeaths before CORRECTIONS: {}".format(df.deaths.sum()))
        # run restrictions corrections
        Corrector = RestrictionsCorrector(code_system_id,
                                          cause_set_version_id,
                                          collect_diagnostics=False,
                                          verbose=True)
        df = Corrector.get_computed_dataframe(df)
        # calculate cc_code for some sources
        if source in ['Iran_maternal_surveillance', 'Iran_forensic']:
            env_meta_df = get_env(env_run_id=env_run_id, **cache_options)
            df = calculate_cc_code(df, env_meta_df, code_map)
            print("\nDeaths after adding cc_code: {}".format(df.deaths.sum()))
        # adjust deaths for New Zealand by maori/non-maori ethnicities
        if source in ["NZL_MOH_ICD9", "NZL_MOH_ICD10"]:
            df = correct_maori_non_maori_deaths(df)
            print("\nDeaths after Maori/non-Maori adjustment: {}".format(
                df.deaths.sum()))
        print("\nDeaths at END: {}".format(df.deaths.sum()))
    return df
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    """Rake, clean, floor, and age-aggregate one dataset's estimates.

    Steps: rake subnational estimates where appropriate (with double
    raking for India maternal household surveys), drop zero-sample-size
    and pre-1980 rows, enforce age/sex restrictions, fit draws to the
    non-zero floor, add age-standardized / all-ages groups, then build
    CODEm/CoDViz uncertainty metrics and clamp cause fractions to [0, 1].

    Args:
        df: deaths/cf DataFrame for one (nid, extract_type_id).
        nid / extract_type_id: identify the dataset being processed.
        env_run_id / pop_run_id / location_set_version_id /
            cause_set_version_id: versioned DB ids for cached metadata.

    Returns:
        DataFrame restricted to the final phase-output columns.
    """
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid, 'source', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    # NOTE(review): iso3 is fetched but never used below — confirm whether
    # it can be removed or is needed by code outside this view
    iso3 = get_value_from_nid(nid, 'iso3', extract_type_id=extract_type_id,
                              location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }
    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)
    # ************************************************************
    # RAKING
    # ************************************************************
    # Rake if appropriate based on this logic
    if ((data_type_id in [8, 9, 10] and (source != 'Other_Maternal'))
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)
    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(nid, extract_type_id,
                                    block_rerun=True,
                                    force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]
        if "HH_SURVEYS" in model_group:
            if model_group == 'MATERNAL-HH_SURVEYS-IND':
                print_log_message("Raking sub national estimates," \
                                  " applying double raking for India Maternal")
                raker = Raker(df, source, double=True)
                df = raker.get_computed_dataframe(location_hierarchy)
            else:
                print_log_message("Raking sub national estimates")
                raker = Raker(df, source)
                df = raker.get_computed_dataframe(location_hierarchy)
    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************
    # data with zero sample size is almost certaintly some anomolous result
    # of a program generating data it shouldn't have, and it shouldn't be
    # included in codem models. Was probably already dropped, anyway, before
    # running noise reduction.
    df = df.query('sample_size != 0')
    # uploading data before 1980 is a waste of space because neither codem
    # nor codviz use it
    df = df.loc[df['year_id'] >= 1980]
    print_log_message("Enforcing age sex restrictions")
    # this actually drops data from the dataframe if it violates age/sex
    # restrictions (e.g. male maternity disorders)
    df = enforce_asr(df, cause_meta_df, age_meta_df)
    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************
    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df,
                                                cause_meta_df)
    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************
    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()
    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw',
        'cf_rd', 'extract_type_id', 'location_id', 'nid', 'sample_size',
        'sex_id', 'site_id', 'year_id'
    ]
    # Use draws to make metrics for uncertainty to
    # be used by CODEm and CoDViz
    # also creates cf_final from mean of draws
    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]
    # we did this in the old code-- no cfs over 1 nor below 0
    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0
    df = df[final_cols]
    return df
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    """Rake, clean, floor, and age-aggregate one dataset's estimates.

    NOTE(review): SOURCE contains another `run_phase` (with an extra iso3
    lookup and India double-raking) earlier in this view — these are
    presumably chunks of different files; if both live in one module the
    later definition shadows the earlier one. Confirm file boundaries.

    Args:
        df: deaths/cf DataFrame for one (nid, extract_type_id).
        nid / extract_type_id: identify the dataset being processed.
        env_run_id / pop_run_id / location_set_version_id /
            cause_set_version_id: versioned DB ids for cached metadata.

    Returns:
        DataFrame restricted to the final phase-output columns.
    """
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid, 'source', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }
    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)
    # ************************************************************
    # RAKING
    # ************************************************************
    # rake VR-like data (data_type_id 8/9/10) and maternal NR sources,
    # unless the source is explicitly excluded from raking
    if ((data_type_id in [8, 9, 10] and (source != "Other_Maternal"))
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)
    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(nid, extract_type_id,
                                    block_rerun=True,
                                    force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]
        if "HH_SURVEYS" in model_group:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)
    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************
    df = df.query('sample_size != 0')
    # pre-1980 data is not used downstream
    df = df.loc[df['year_id'] >= 1980]
    print_log_message("Enforcing age sex restrictions")
    df = enforce_asr(df, cause_meta_df, age_meta_df)
    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************
    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df,
                                                cause_meta_df)
    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************
    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()
    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw',
        'cf_rd', 'extract_type_id', 'location_id', 'nid', 'sample_size',
        'sex_id', 'site_id', 'year_id'
    ]
    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]
    # clamp cause fractions into [0, 1]
    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0
    df = df[final_cols]
    return df