def open_artifact(output_path: Path, location: str) -> Artifact:
    """Creates or opens the artifact stored at the output path.

    The artifact is opened with a filter term built from ``location``. If
    the locations metadata key is not yet present in the artifact, it is
    written with ``[location]`` as its value.

    Parameters
    ----------
    output_path
        Fully resolved path to the artifact file.
    location
        Proper GBD location name represented by the artifact.

    Returns
    -------
        The opened artifact.

    """
    if output_path.exists():
        logger.debug(f"Opening artifact at {str(output_path)} for appending.")
    else:
        logger.debug(f"Creating artifact at {str(output_path)}.")

    location_filter = get_location_term(location)
    artifact = Artifact(output_path, filter_terms=[location_filter])

    # Membership is checked against the string form of the key, while the
    # write uses the key object itself.
    locations_key = EntityKey(project_globals.METADATA_LOCATIONS)
    if str(locations_key) in artifact:
        return artifact

    artifact.write(locations_key, [location])
    return artifact
def open_artifact(output_path: Path, location: str) -> Artifact:
    """Creates or opens the artifact stored at the output path.

    If the locations metadata key is not yet present in the artifact, it is
    written with ``[location]`` as its value.

    Parameters
    ----------
    output_path
        Fully resolved path to the artifact file.
    location
        Proper GBD location name represented by the artifact.

    Returns
    -------
        The opened artifact.

    """
    artifact_exists = output_path.exists()
    if artifact_exists:
        logger.debug(f"Opening artifact at {str(output_path)} for appending.")
    else:
        logger.debug(f"Creating artifact at {str(output_path)}.")

    artifact = Artifact(output_path)

    # Record the location only once; an existing entry is left untouched.
    locations_key = data_keys.METADATA_LOCATIONS
    already_recorded = locations_key in artifact
    if not already_recorded:
        artifact.write(locations_key, [location])

    return artifact
def calc_hypertensive(location, draw):
    """Compute the proportion above the hypertension threshold for one draw.

    Loads the systolic blood pressure exposure mean and standard deviation
    from the location's artifact (filtered to ``draw``), builds an ensemble
    distribution from them, and evaluates one minus its CDF at
    ``HYPERTENSION_THRESHOLD``.

    Parameters
    ----------
    location
        Name of the folder under ``HYPERTENSION_DATA_FOLDER`` that contains
        the ``data.hdf`` artifact.
    draw
        Draw number; selects the ``draw_{draw}`` column and is used as the
        artifact filter term.

    Returns
    -------
        The proportions, named ``draw_{draw}``, carrying the index of the
        loaded exposure data with the ``parameter`` level dropped. Missing
        values are replaced by 0.

    """
    draw_column = f'draw_{draw}'

    artifact_file = HYPERTENSION_DATA_FOLDER / f'{location}/data.hdf'
    artifact = Artifact(str(artifact_file), filter_terms=[f'draw=={draw}'])

    # Per the original author's note: the indices can be dropped because the
    # means/sds/weights stay aligned -- the data is sorted in vivarium_inputs.
    exposure_mean = artifact.load(
        'risk_factor.high_systolic_blood_pressure.exposure')
    # Kept aside so it can be restored on the resulting proportions.
    demographic_index = exposure_mean.index
    exposure_mean = exposure_mean.reset_index(drop=True)

    exposure_sd = artifact.load(
        'risk_factor.high_systolic_blood_pressure.exposure_standard_deviation')
    exposure_sd = exposure_sd.reset_index(drop=True)

    # Per the original author's note: the weights are the same for all draws.
    weights = prep_weights(artifact)

    threshold = pd.Series(HYPERTENSION_THRESHOLD, index=exposure_mean.index)
    distribution = EnsembleDistribution(
        weights=weights,
        mean=exposure_mean[draw_column],
        sd=exposure_sd[draw_column],
    )

    # We want the proportion *above* the threshold, hence 1 - CDF.
    at_or_below = distribution.cdf(threshold)
    props = (1 - at_or_below).fillna(0)
    props.index = demographic_index
    props.name = draw_column

    # NOTE(review): this second fillna looks redundant after the one above;
    # it is kept to preserve the original behaviour exactly.
    return props.droplevel('parameter').fillna(0)
def assemble_tobacco_artifacts(num_draws, output_path: Path, seed: int = RANDOM_SEED):
    """
    Assemble the data artifacts required to simulate the various tobacco
    interventions.

    Four artifact files are written into ``output_path``: one for each of
    the non-Maori and Maori populations, for each of the 20-year and 0-year
    recovery-from-smoking scenarios. An existing file with the same name is
    deleted before the artifact is written. The path of each artifact is
    printed once it has been written.

    The same unit-interval samples are used for both populations and both
    recovery scenarios, so that the sampled rates are correlated across all
    four artifacts.

    Parameters
    ----------
    num_draws
        The number of random draws to sample for each rate and quantity,
        for the uncertainty analysis.
    output_path
        The directory in which the artifact files are created.
    seed
        The seed for the pseudo-random number generator used to generate the
        random samples.

    """
    data_dir_non_maori = get_data_dir('non-maori')
    data_dir_maori = get_data_dir('maori')
    prng = np.random.RandomState(seed=seed)
    logger = logging.getLogger(__name__)

    def log_step(message):
        # Every progress message is prefixed with the current wall-clock time.
        logger.info('{} {}'.format(
            datetime.datetime.now().strftime("%H:%M:%S"), message))

    # Instantiate components for the non-Maori population.
    p_nm = Population(data_dir_non_maori, YEAR_START)
    l_nm = Diseases(data_dir_non_maori, YEAR_START, p_nm.year_end)
    t_nm = Tobacco(data_dir_non_maori, YEAR_START, p_nm.year_end)

    # Instantiate components for the Maori population.
    p_m = Population(data_dir_maori, YEAR_START)
    l_m = Diseases(data_dir_maori, YEAR_START, p_m.year_end)
    t_m = Tobacco(data_dir_maori, YEAR_START, p_m.year_end)

    # Define data structures to record the samples from the unit interval that
    # are used to sample each rate/quantity, so that they can be correlated
    # across both populations.
    # NOTE: the order of the draws below determines the samples obtained for
    # a given seed; do not reorder them.
    smp_yld = prng.random_sample(num_draws)
    smp_chronic_apc = {}
    smp_chronic_i = {}
    smp_chronic_r = {}
    smp_chronic_f = {}
    smp_chronic_yld = {}
    smp_chronic_prev = {}
    smp_acute_f = {}
    smp_acute_yld = {}
    smp_tob_dis_tbl = {}
    smp_tob_i = prng.random_sample(num_draws)
    smp_tob_r = prng.random_sample(num_draws)
    smp_tob_elast = prng.random_sample(num_draws)
    smp_tob_elast_maori = prng.random_sample(num_draws)

    # Define the sampling distributions in terms of their family and their
    # *relative* standard deviation; they will be used to draw samples for
    # both populations.
    dist_yld = LogNormal(sd_pcnt=10)
    dist_chronic_apc = Normal(sd_pcnt=0.5)
    dist_chronic_i = Normal(sd_pcnt=5)
    dist_chronic_r = Normal(sd_pcnt=5)
    dist_chronic_f = Normal(sd_pcnt=5)
    dist_chronic_yld = Normal(sd_pcnt=10)
    dist_chronic_prev = Normal(sd_pcnt=5)
    dist_acute_f = Normal(sd_pcnt=10)
    dist_acute_yld = Normal(sd_pcnt=10)
    dist_tob_i = Beta(sd_pcnt=20)
    dist_tob_r = Beta(sd_pcnt=20)
    dist_tob_elast = Normal(sd_pcnt=20)
    dist_tob_elast_maori_scale = Normal(sd_pcnt=10)
    tob_elast_maori_scale = 1.2

    log_step('Generating samples')

    for name in l_nm.chronic:
        # Draw samples for each rate/quantity for this disease.
        smp_chronic_apc[name] = prng.random_sample(num_draws)
        smp_chronic_i[name] = prng.random_sample(num_draws)
        smp_chronic_r[name] = prng.random_sample(num_draws)
        smp_chronic_f[name] = prng.random_sample(num_draws)
        smp_chronic_yld[name] = prng.random_sample(num_draws)
        smp_chronic_prev[name] = prng.random_sample(num_draws)
        # Also draw samples for the RR associated with tobacco smoking.
        smp_tob_dis_tbl[name] = prng.random_sample(num_draws)

    for name in l_nm.acute:
        # Draw samples for each rate/quantity for this disease.
        smp_acute_f[name] = prng.random_sample(num_draws)
        smp_acute_yld[name] = prng.random_sample(num_draws)
        # Also draw samples for the RR associated with tobacco smoking.
        smp_tob_dis_tbl[name] = prng.random_sample(num_draws)

    # Now write all of the required tables for:
    #
    # - Both the Maori and non-Maori populations; and
    # - Both the 20-year and 0-year recovery from smoking.
    #
    # So there are 4 data artifacts to write.
    nm_artifact_fmt = 'mslt_tobacco_non-maori_{}-years.hdf'
    m_artifact_fmt = 'mslt_tobacco_maori_{}-years.hdf'
    exposure = 'tobacco'
    risk_key = 'risk_factor.{}'.format(exposure)

    log_step('Generating artifacts')

    for recovery in [20, 0]:
        nm_artifact_file = output_path / nm_artifact_fmt.format(recovery)
        m_artifact_file = output_path / m_artifact_fmt.format(recovery)

        # Initialise each artifact file.
        for path in [nm_artifact_file, m_artifact_file]:
            if path.exists():
                path.unlink()

        # Write the data tables to each artifact file.
        art_nm = Artifact(str(nm_artifact_file))
        art_m = Artifact(str(m_artifact_file))

        # Each table is written for the non-Maori population first and for
        # the Maori population second; only one table is held at a time.
        populations = ((art_nm, p_nm), (art_m, p_m))
        tobacco = ((art_nm, t_nm), (art_m, t_m))

        log_step('Writing population tables')

        # Write the main population tables.
        for art, pop in populations:
            write_table(art, 'population.structure', pop.get_population())
        for art, pop in populations:
            write_table(art, 'cause.all_causes.disability_rate',
                        pop.sample_disability_rate_from(dist_yld, smp_yld))
        for art, pop in populations:
            write_table(art, 'cause.all_causes.mortality',
                        pop.get_mortality_rate())

        # Write the chronic disease tables.
        for name, disease_nm in l_nm.chronic.items():
            log_step('Writing tables for {}'.format(name))

            diseases = ((art_nm, disease_nm), (art_m, l_m.chronic[name]))
            chronic_key = 'chronic_disease.{}'.format(name)
            smp_apc = smp_chronic_apc[name]

            for art, disease in diseases:
                write_table(
                    art, chronic_key + '.incidence',
                    disease.sample_i_from(dist_chronic_i, dist_chronic_apc,
                                          smp_chronic_i[name], smp_apc))
            for art, disease in diseases:
                write_table(
                    art, chronic_key + '.remission',
                    disease.sample_r_from(dist_chronic_r, dist_chronic_apc,
                                          smp_chronic_r[name], smp_apc))
            for art, disease in diseases:
                write_table(
                    art, chronic_key + '.mortality',
                    disease.sample_f_from(dist_chronic_f, dist_chronic_apc,
                                          smp_chronic_f[name], smp_apc))
            for art, disease in diseases:
                write_table(
                    art, chronic_key + '.morbidity',
                    disease.sample_yld_from(dist_chronic_yld, dist_chronic_apc,
                                            smp_chronic_yld[name], smp_apc))
            for art, disease in diseases:
                write_table(
                    art, chronic_key + '.prevalence',
                    disease.sample_prevalence_from(dist_chronic_prev,
                                                   smp_chronic_prev[name]))

        # Write the acute disease tables.
        for name, disease_nm in l_nm.acute.items():
            log_step('Writing tables for {}'.format(name))

            diseases = ((art_nm, disease_nm), (art_m, l_m.acute[name]))
            acute_key = 'acute_disease.{}'.format(name)

            for art, disease in diseases:
                write_table(
                    art, acute_key + '.mortality',
                    disease.sample_excess_mortality_from(dist_acute_f,
                                                         smp_acute_f[name]))
            for art, disease in diseases:
                write_table(
                    art, acute_key + '.morbidity',
                    disease.sample_disability_from(dist_acute_yld,
                                                   smp_acute_yld[name]))

        # Write the risk factor tables.
        log_step('Writing tables for {}'.format(exposure))

        for art, tob in tobacco:
            write_table(art, risk_key + '.incidence',
                        tob.sample_i_from(dist_tob_i, smp_tob_i))
        for art, tob in tobacco:
            write_table(art, risk_key + '.remission',
                        tob.sample_r_from(dist_tob_r, smp_tob_r))

        if recovery == 0:
            # Cessation confers immediate recovery, so the tables are
            # collapsed before being written.
            for art, tob in tobacco:
                write_table(
                    art, risk_key + '.mortality_relative_risk',
                    collapse_tobacco_mortality_rr(
                        tob.get_expected_mortality_rr(), exposure))
            for art, tob in tobacco:
                write_table(
                    art, risk_key + '.disease_relative_risk',
                    collapse_tobacco_disease_rr(
                        tob.sample_disease_rr_from(smp_tob_dis_tbl)))
            for art, tob in tobacco:
                write_table(
                    art, risk_key + '.prevalence',
                    collapse_tobacco_prevalence(
                        tob.get_expected_prevalence(), exposure))
        else:
            for art, tob in tobacco:
                write_table(art, risk_key + '.mortality_relative_risk',
                            tob.get_expected_mortality_rr())
            for art, tob in tobacco:
                write_table(art, risk_key + '.disease_relative_risk',
                            tob.sample_disease_rr_from(smp_tob_dis_tbl))
            for art, tob in tobacco:
                write_table(art, risk_key + '.prevalence',
                            tob.get_expected_prevalence())

        log_step('Tax effects (non-Maori)')
        tob_elast_nm = t_nm.sample_price_elasticity_from(
            dist_tob_elast, smp_tob_elast)
        tob_tax_nm = t_nm.sample_tax_effects_from_elasticity_wide(
            tob_elast_nm)
        _write_tobacco_tax_effect_tables(art_nm, exposure, tob_tax_nm)
        # Release the wide table before building the Maori one.
        del tob_tax_nm

        log_step('Tax effects (Maori)')
        # The Maori elasticity is derived by scaling the non-Maori one.
        tob_elast_m = t_m.scale_price_elasticity_from(
            tob_elast_nm, tob_elast_maori_scale,
            dist_tob_elast_maori_scale, smp_tob_elast_maori)
        tob_tax_m = t_m.sample_tax_effects_from_elasticity_wide(
            tob_elast_m)
        _write_tobacco_tax_effect_tables(art_m, exposure, tob_tax_m)
        del tob_tax_m
        del tob_elast_nm
        del tob_elast_m

        print(nm_artifact_file)
        print(m_artifact_file)


def _write_tobacco_tax_effect_tables(artifact, exposure, tax_effects):
    """Split a wide tax-effects table into incidence and remission tables,
    rename the respective effect column to ``value``, and write both tables
    to the artifact."""
    incidence_effect_col = 'incidence_effect'
    remission_effect_col = 'remission_effect'

    # Each table keeps every column except the *other* table's effect column.
    incidence_cols = [
        c for c in tax_effects.columns if c != remission_effect_col
    ]
    remission_cols = [
        c for c in tax_effects.columns if c != incidence_effect_col
    ]

    df = tax_effects.loc[:, incidence_cols].rename(
        columns={incidence_effect_col: 'value'})
    write_table(artifact,
                'risk_factor.{}.tax_effect_incidence'.format(exposure), df)

    df = tax_effects.loc[:, remission_cols].rename(
        columns={remission_effect_col: 'value'})
    write_table(artifact,
                'risk_factor.{}.tax_effect_remission'.format(exposure), df)
def assemble_artifacts(num_draws, output_path: Path, seed: int = RANDOM_SEED):
    """
    Assemble the data artifact for a single population.

    The artifact is written to ``pmslt_artifact.hdf`` inside ``output_path``
    and contains the population tables and the chronic and acute disease
    tables. An existing file with that name is deleted first. The path of
    the artifact is printed once it has been written.

    Parameters
    ----------
    num_draws
        The number of random draws to sample for each rate and quantity,
        for the uncertainty analysis.
    output_path
        The directory in which the artifact file is created.
    seed
        The seed for the pseudo-random number generator used to generate the
        random samples.

    """
    data_dir = get_data_dir('data')
    prng = np.random.RandomState(seed=seed)
    logger = logging.getLogger(__name__)

    def log_step(message):
        # Every progress message is prefixed with the current wall-clock time.
        logger.info('{} {}'.format(
            datetime.datetime.now().strftime("%H:%M:%S"), message))

    # Instantiate the population and disease components.
    pop = Population(data_dir, YEAR_START)
    diseases = Diseases(data_dir, YEAR_START, pop.year_end)

    # Define data structures to record the samples from the unit interval that
    # are used to sample each rate/quantity.
    # NOTE: the order of the draws below determines the samples obtained for
    # a given seed; do not reorder them.
    smp_yld = prng.random_sample(num_draws)
    smp_chronic_apc = {}
    smp_chronic_i = {}
    smp_chronic_r = {}
    smp_chronic_f = {}
    smp_chronic_yld = {}
    smp_chronic_prev = {}
    smp_acute_f = {}
    smp_acute_yld = {}

    # Define the sampling distributions in terms of their family and their
    # *relative* standard deviation.
    dist_yld = LogNormal(sd_pcnt=10)
    dist_chronic_apc = Normal(sd_pcnt=0.5)
    dist_chronic_i = Normal(sd_pcnt=5)
    dist_chronic_r = Normal(sd_pcnt=5)
    dist_chronic_f = Normal(sd_pcnt=5)
    dist_chronic_yld = Normal(sd_pcnt=10)
    dist_chronic_prev = Normal(sd_pcnt=5)
    dist_acute_f = Normal(sd_pcnt=10)
    dist_acute_yld = Normal(sd_pcnt=10)

    log_step('Generating samples')

    for name in diseases.chronic:
        # Draw samples for each rate/quantity for this disease.
        smp_chronic_apc[name] = prng.random_sample(num_draws)
        smp_chronic_i[name] = prng.random_sample(num_draws)
        smp_chronic_r[name] = prng.random_sample(num_draws)
        smp_chronic_f[name] = prng.random_sample(num_draws)
        smp_chronic_yld[name] = prng.random_sample(num_draws)
        smp_chronic_prev[name] = prng.random_sample(num_draws)
        # One further draw per disease was historically made here for a
        # tobacco relative-risk table that this function never writes. The
        # draw is kept, and its result discarded, so that the random stream
        # (and hence every sample for a given seed) is unchanged.
        prng.random_sample(num_draws)

    for name in diseases.acute:
        # Draw samples for each rate/quantity for this disease.
        smp_acute_f[name] = prng.random_sample(num_draws)
        smp_acute_yld[name] = prng.random_sample(num_draws)
        # Unused draw, kept to preserve the random stream (see above).
        prng.random_sample(num_draws)

    # Now write all of the required tables.
    artifact_file = output_path / 'pmslt_artifact.hdf'

    log_step('Generating artifacts')

    # Start from a fresh artifact file.
    if artifact_file.exists():
        artifact_file.unlink()

    artifact = Artifact(str(artifact_file))

    log_step('Writing population tables')

    # Write the main population tables.
    write_table(artifact, 'population.structure', pop.get_population())
    write_table(artifact, 'cause.all_causes.disability_rate',
                pop.sample_disability_rate_from(dist_yld, smp_yld))
    write_table(artifact, 'cause.all_causes.mortality',
                pop.get_mortality_rate())

    # Write the chronic disease tables.
    for name, disease in diseases.chronic.items():
        log_step('Writing tables for {}'.format(name))

        write_table(
            artifact, 'chronic_disease.{}.incidence'.format(name),
            disease.sample_i_from(dist_chronic_i, dist_chronic_apc,
                                  smp_chronic_i[name], smp_chronic_apc[name]))
        write_table(
            artifact, 'chronic_disease.{}.remission'.format(name),
            disease.sample_r_from(dist_chronic_r, dist_chronic_apc,
                                  smp_chronic_r[name], smp_chronic_apc[name]))
        write_table(
            artifact, 'chronic_disease.{}.mortality'.format(name),
            disease.sample_f_from(dist_chronic_f, dist_chronic_apc,
                                  smp_chronic_f[name], smp_chronic_apc[name]))
        write_table(
            artifact, 'chronic_disease.{}.morbidity'.format(name),
            disease.sample_yld_from(dist_chronic_yld, dist_chronic_apc,
                                    smp_chronic_yld[name],
                                    smp_chronic_apc[name]))
        write_table(
            artifact, 'chronic_disease.{}.prevalence'.format(name),
            disease.sample_prevalence_from(dist_chronic_prev,
                                           smp_chronic_prev[name]))

    # Write the acute disease tables.
    for name, disease in diseases.acute.items():
        log_step('Writing tables for {}'.format(name))

        write_table(
            artifact, 'acute_disease.{}.mortality'.format(name),
            disease.sample_excess_mortality_from(dist_acute_f,
                                                 smp_acute_f[name]))
        write_table(
            artifact, 'acute_disease.{}.morbidity'.format(name),
            disease.sample_disability_from(dist_acute_yld,
                                           smp_acute_yld[name]))

    print(artifact_file)