def open_artifact(output_path: Path, location: str) -> Artifact:
    """Create a new artifact, or open an existing one for appending.

    Parameters
    ----------
    output_path
        Fully resolved path to the artifact file.
    location
        Proper GBD location name represented by the artifact.

    Returns
    -------
        The artifact, guaranteed to contain the metadata locations key.

    """
    if output_path.exists():
        logger.debug(f"Opening artifact at {str(output_path)} for appending.")
    else:
        logger.debug(f"Creating artifact at {str(output_path)}.")

    art = Artifact(output_path)

    # Stamp the artifact with its location metadata on first creation.
    metadata_key = data_keys.METADATA_LOCATIONS
    if metadata_key not in art:
        art.write(metadata_key, [location])

    return art
def write_data(artifact: Artifact, key: EntityKey, data: pd.DataFrame):
    """Write ``data`` to the artifact under ``key`` if not already present.

    Parameters
    ----------
    artifact
        The artifact to write to.
    key
        The entity key associated with the data to write.
    data
        The data to write.

    Returns
    -------
        The data stored in the artifact for ``key``.

    """
    key_str = str(key)
    if key_str in artifact:
        logger.debug(f'Data for {key} already in artifact. Skipping...')
    else:
        logger.debug(f'Writing data for {key} to artifact.')
        artifact.write(key_str, data)
    # Always return what the artifact actually holds for the key.
    return artifact.load(key_str)
def load_and_write_data(artifact: Artifact, key: EntityKey, location: str):
    """Load data for ``key`` and store it in the artifact, skipping keys
    that are already present.

    Parameters
    ----------
    artifact
        The artifact to write to.
    key
        The entity key associated with the data to write.
    location
        The location associated with the data to load and the artifact to
        write to.

    Notes
    -----
    This function supports simple remapping of keys.  Complex tailoring of
    input data should not use this function.  To support appending, such
    code should check for the write key in the artifact manually, then load
    any relevant data and transform it as necessary to write out using
    ``artifact.write``.

    """
    key_str = str(key)
    if key_str in artifact:
        logger.debug(f'Data for {key} already in artifact. Skipping...')
    else:
        logger.debug(f'Loading data for {key} for location {location}.')
        data = loader.get_data(key, location)
        logger.debug(f'Writing data for {key} to artifact.')
        artifact.write(key_str, data)
    # Always return what the artifact actually holds for the key.
    return artifact.load(key_str)
def open_artifact(output_path: Path, location: str) -> Artifact:
    """Create a new artifact, or open an existing one for appending.

    Parameters
    ----------
    output_path
        Fully resolved path to the artifact file.
    location
        Proper GBD location name represented by the artifact.

    Returns
    -------
        The artifact, filtered to the given location and guaranteed to
        contain the metadata locations key.

    """
    if output_path.exists():
        logger.debug(f"Opening artifact at {str(output_path)} for appending.")
    else:
        logger.debug(f"Creating artifact at {str(output_path)}.")

    art = Artifact(output_path, filter_terms=[get_location_term(location)])

    # Stamp the artifact with its location metadata on first creation.
    metadata_key = EntityKey(project_globals.METADATA_LOCATIONS)
    if str(metadata_key) not in art:
        art.write(metadata_key, [location])

    return art
def calc_hypertensive(location, draw):
    """Compute, for one draw, the proportion of each demographic group whose
    systolic blood pressure exceeds the hypertension threshold.

    Parameters
    ----------
    location
        Location name used to locate the per-location artifact file.
    draw
        Draw number; selects the ``draw_{draw}`` column and filters the
        artifact load.

    Returns
    -------
        A series of proportions named ``draw_{draw}``, indexed by the
        demographic index of the exposure data (with the ``parameter``
        level dropped).
    """
    art_path = HYPERTENSION_DATA_FOLDER / f'{location}/data.hdf'
    art = Artifact(str(art_path), filter_terms=[f'draw=={draw}'])
    # I can drop indices and know that the means/sds/weights will be aligned
    # b/c we sort the data in vivarium_inputs
    mean = art.load('risk_factor.high_systolic_blood_pressure.exposure')
    demographic_index = mean.index  # but we'll need it later for the proportions
    mean = mean.reset_index(drop=True)
    sd = art.load(
        'risk_factor.high_systolic_blood_pressure.exposure_standard_deviation'
    ).reset_index(drop=True)
    # these will be the same for all draws
    weights = prep_weights(art)
    # Constant threshold aligned to the (reset) positional index of the means.
    threshold = pd.Series(HYPERTENSION_THRESHOLD, index=mean.index)
    dist = EnsembleDistribution(weights=weights, mean=mean[f'draw_{draw}'],
                                sd=sd[f'draw_{draw}'])
    # we want the proportion above the threshold
    props = (1 - dist.cdf(threshold)).fillna(0)
    props.index = demographic_index
    props.name = f'draw_{draw}'
    # NOTE(review): this second fillna(0) appears redundant with the one
    # above unless droplevel can reintroduce NaNs — confirm before removing.
    props = props.droplevel('parameter').fillna(0)
    return props
def load_and_write_data(artifact: Artifact, key: str, location: str, replace: bool):
    """Load data and store it in the artifact, optionally overwriting any
    existing entry.

    Parameters
    ----------
    artifact
        The artifact to write to.
    key
        The entity key associated with the data to write.
    location
        The location associated with the data to load and the artifact to
        write to.
    replace
        Flag which determines whether or not to overwrite existing data

    Returns
    -------
        The data stored in the artifact for ``key``.

    """
    already_present = key in artifact
    if already_present and not replace:
        logger.debug(f'Data for {key} already in artifact. Skipping...')
        return artifact.load(key)

    logger.debug(f'Loading data for {key} for location {location}.')
    data = loader.get_data(key, location)
    if already_present:
        # key is in artifact, but should be replaced
        logger.debug(f'Replacing data for {key} in artifact.')
        artifact.replace(key, data)
    else:
        logger.debug(f'Writing data for {key} to artifact.')
        artifact.write(key, data)
    return artifact.load(key)
def write_data(artifact: Artifact, key: str, data: pd.DataFrame):
    """Store ``data`` under ``key``, leaving any existing entry untouched.

    Parameters
    ----------
    artifact
        The artifact to write to.
    key
        The entity key associated with the data to write.
    data
        The data to write.

    Returns
    -------
        The data stored in the artifact for ``key``.

    """
    if key not in artifact:
        logger.debug(f'Writing data for {key} to artifact.')
        artifact.write(key, data)
    else:
        logger.debug(f'Data for {key} already in artifact. Skipping...')
    return artifact.load(key)
def load_and_write_data(artifact: Artifact, key: str, location: str):
    """Load data for ``key`` and store it in the artifact, skipping keys
    that are already present.

    Parameters
    ----------
    artifact
        The artifact to write to.
    key
        The entity key associated with the data to write.
    location
        The location associated with the data to load and the artifact to
        write to.

    Returns
    -------
        The data stored in the artifact for ``key``.

    """
    if key in artifact:
        logger.debug(f'Data for {key} already in artifact. Skipping...')
        return artifact.load(key)

    logger.debug(f'Loading data for {key} for location {location}.')
    data = loader.get_data(key, location)
    logger.debug(f'Writing data for {key} to artifact.')
    artifact.write(key, data)
    return artifact.load(key)
def assemble_tobacco_artifacts(num_draws, output_path: Path, seed: int = RANDOM_SEED):
    """
    Assemble the data artifacts required to simulate the various tobacco
    interventions.

    Four artifacts are written: one for each combination of population
    (Maori, non-Maori) and recovery period after smoking cessation
    (20 years, 0 years).

    Parameters
    ----------
    num_draws
        The number of random draws to sample for each rate and quantity,
        for the uncertainty analysis.
    output_path
        The path to the artifact being assembled.
    seed
        The seed for the pseudo-random number generator used to generate
        the random samples.
    """
    data_dir_non_maori = get_data_dir('non-maori')
    data_dir_maori = get_data_dir('maori')
    prng = np.random.RandomState(seed=seed)
    logger = logging.getLogger(__name__)

    # Instantiate components for the non-Maori population.
    p_nm = Population(data_dir_non_maori, YEAR_START)
    l_nm = Diseases(data_dir_non_maori, YEAR_START, p_nm.year_end)
    t_nm = Tobacco(data_dir_non_maori, YEAR_START, p_nm.year_end)

    # Instantiate components for the Maori population.
    p_m = Population(data_dir_maori, YEAR_START)
    l_m = Diseases(data_dir_maori, YEAR_START, p_m.year_end)
    t_m = Tobacco(data_dir_maori, YEAR_START, p_m.year_end)

    # Define data structures to record the samples from the unit interval that
    # are used to sample each rate/quantity, so that they can be correlated
    # across both populations.
    smp_yld = prng.random_sample(num_draws)
    smp_chronic_apc = {}
    smp_chronic_i = {}
    smp_chronic_r = {}
    smp_chronic_f = {}
    smp_chronic_yld = {}
    smp_chronic_prev = {}
    smp_acute_f = {}
    smp_acute_yld = {}
    smp_tob_dis_tbl = {}
    smp_tob_i = prng.random_sample(num_draws)
    smp_tob_r = prng.random_sample(num_draws)
    smp_tob_elast = prng.random_sample(num_draws)
    smp_tob_elast_maori = prng.random_sample(num_draws)

    # Define the sampling distributions in terms of their family and their
    # *relative* standard deviation; they will be used to draw samples for
    # both populations.
    dist_yld = LogNormal(sd_pcnt=10)
    dist_chronic_apc = Normal(sd_pcnt=0.5)
    dist_chronic_i = Normal(sd_pcnt=5)
    dist_chronic_r = Normal(sd_pcnt=5)
    dist_chronic_f = Normal(sd_pcnt=5)
    dist_chronic_yld = Normal(sd_pcnt=10)
    dist_chronic_prev = Normal(sd_pcnt=5)
    dist_acute_f = Normal(sd_pcnt=10)
    dist_acute_yld = Normal(sd_pcnt=10)
    dist_tob_i = Beta(sd_pcnt=20)
    dist_tob_r = Beta(sd_pcnt=20)
    dist_tob_elast = Normal(sd_pcnt=20)
    dist_tob_elast_maori_scale = Normal(sd_pcnt=10)
    tob_elast_maori_scale = 1.2

    logger.info('{} Generating samples'.format(
        datetime.datetime.now().strftime("%H:%M:%S")))

    for name, disease_nm in l_nm.chronic.items():
        # Draw samples for each rate/quantity for this disease.
        smp_chronic_apc[name] = prng.random_sample(num_draws)
        smp_chronic_i[name] = prng.random_sample(num_draws)
        smp_chronic_r[name] = prng.random_sample(num_draws)
        smp_chronic_f[name] = prng.random_sample(num_draws)
        smp_chronic_yld[name] = prng.random_sample(num_draws)
        smp_chronic_prev[name] = prng.random_sample(num_draws)
        # Also draw samples for the RR associated with tobacco smoking.
        smp_tob_dis_tbl[name] = prng.random_sample(num_draws)

    for name, disease_nm in l_nm.acute.items():
        # Draw samples for each rate/quantity for this disease.
        smp_acute_f[name] = prng.random_sample(num_draws)
        smp_acute_yld[name] = prng.random_sample(num_draws)
        # Also draw samples for the RR associated with tobacco smoking.
        smp_tob_dis_tbl[name] = prng.random_sample(num_draws)

    # Now write all of the required tables for:
    #
    # - Both the Maori and non-Maori populations; and
    # - Both the 20-year and 0-year recovery from smoking.
    #
    # So there are 4 data artifacts to write.
    nm_artifact_fmt = 'mslt_tobacco_non-maori_{}-years.hdf'
    m_artifact_fmt = 'mslt_tobacco_maori_{}-years.hdf'

    logger.info('{} Generating artifacts'.format(
        datetime.datetime.now().strftime("%H:%M:%S")))

    for recovery in [20, 0]:
        nm_artifact_file = output_path / nm_artifact_fmt.format(recovery)
        m_artifact_file = output_path / m_artifact_fmt.format(recovery)
        exposure = 'tobacco'

        # Initialise each artifact file.
        for path in [nm_artifact_file, m_artifact_file]:
            if path.exists():
                path.unlink()

        # Write the data tables to each artifact file.
        art_nm = Artifact(str(nm_artifact_file))
        art_m = Artifact(str(m_artifact_file))

        logger.info('{} Writing population tables'.format(
            datetime.datetime.now().strftime("%H:%M:%S")))

        # Write the main population tables.
        write_table(art_nm, 'population.structure', p_nm.get_population())
        write_table(art_m, 'population.structure', p_m.get_population())
        write_table(art_nm, 'cause.all_causes.disability_rate',
                    p_nm.sample_disability_rate_from(dist_yld, smp_yld))
        write_table(art_m, 'cause.all_causes.disability_rate',
                    p_m.sample_disability_rate_from(dist_yld, smp_yld))
        write_table(art_nm, 'cause.all_causes.mortality',
                    p_nm.get_mortality_rate())
        write_table(art_m, 'cause.all_causes.mortality',
                    p_m.get_mortality_rate())

        # Write the chronic disease tables.
        for name, disease_nm in l_nm.chronic.items():
            logger.info('{} Writing tables for {}'.format(
                datetime.datetime.now().strftime("%H:%M:%S"), name))
            disease_m = l_m.chronic[name]
            write_table(
                art_nm, 'chronic_disease.{}.incidence'.format(name),
                disease_nm.sample_i_from(dist_chronic_i, dist_chronic_apc,
                                         smp_chronic_i[name],
                                         smp_chronic_apc[name]))
            write_table(
                art_m, 'chronic_disease.{}.incidence'.format(name),
                disease_m.sample_i_from(dist_chronic_i, dist_chronic_apc,
                                        smp_chronic_i[name],
                                        smp_chronic_apc[name]))
            write_table(
                art_nm, 'chronic_disease.{}.remission'.format(name),
                disease_nm.sample_r_from(dist_chronic_r, dist_chronic_apc,
                                         smp_chronic_r[name],
                                         smp_chronic_apc[name]))
            write_table(
                art_m, 'chronic_disease.{}.remission'.format(name),
                disease_m.sample_r_from(dist_chronic_r, dist_chronic_apc,
                                        smp_chronic_r[name],
                                        smp_chronic_apc[name]))
            write_table(
                art_nm, 'chronic_disease.{}.mortality'.format(name),
                disease_nm.sample_f_from(dist_chronic_f, dist_chronic_apc,
                                         smp_chronic_f[name],
                                         smp_chronic_apc[name]))
            write_table(
                art_m, 'chronic_disease.{}.mortality'.format(name),
                disease_m.sample_f_from(dist_chronic_f, dist_chronic_apc,
                                        smp_chronic_f[name],
                                        smp_chronic_apc[name]))
            write_table(
                art_nm, 'chronic_disease.{}.morbidity'.format(name),
                disease_nm.sample_yld_from(dist_chronic_yld, dist_chronic_apc,
                                           smp_chronic_yld[name],
                                           smp_chronic_apc[name]))
            write_table(
                art_m, 'chronic_disease.{}.morbidity'.format(name),
                disease_m.sample_yld_from(dist_chronic_yld, dist_chronic_apc,
                                          smp_chronic_yld[name],
                                          smp_chronic_apc[name]))
            write_table(
                art_nm, 'chronic_disease.{}.prevalence'.format(name),
                disease_nm.sample_prevalence_from(dist_chronic_prev,
                                                  smp_chronic_prev[name]))
            write_table(
                art_m, 'chronic_disease.{}.prevalence'.format(name),
                disease_m.sample_prevalence_from(dist_chronic_prev,
                                                 smp_chronic_prev[name]))

        # Write the acute disease tables.
        for name, disease_nm in l_nm.acute.items():
            logger.info('{} Writing tables for {}'.format(
                datetime.datetime.now().strftime("%H:%M:%S"), name))
            disease_m = l_m.acute[name]
            write_table(
                art_nm, 'acute_disease.{}.mortality'.format(name),
                disease_nm.sample_excess_mortality_from(
                    dist_acute_f, smp_acute_f[name]))
            write_table(
                art_m, 'acute_disease.{}.mortality'.format(name),
                disease_m.sample_excess_mortality_from(dist_acute_f,
                                                       smp_acute_f[name]))
            write_table(
                art_nm, 'acute_disease.{}.morbidity'.format(name),
                disease_nm.sample_disability_from(dist_acute_yld,
                                                  smp_acute_yld[name]))
            write_table(
                art_m, 'acute_disease.{}.morbidity'.format(name),
                disease_m.sample_disability_from(dist_acute_yld,
                                                 smp_acute_yld[name]))

        # Write the risk factor tables.
        for name in [exposure]:
            logger.info('{} Writing tables for {}'.format(
                datetime.datetime.now().strftime("%H:%M:%S"), name))
            write_table(art_nm, 'risk_factor.{}.incidence'.format(name),
                        t_nm.sample_i_from(dist_tob_i, smp_tob_i))
            write_table(art_m, 'risk_factor.{}.incidence'.format(name),
                        t_m.sample_i_from(dist_tob_i, smp_tob_i))
            write_table(art_nm, 'risk_factor.{}.remission'.format(name),
                        t_nm.sample_r_from(dist_tob_r, smp_tob_r))
            write_table(art_m, 'risk_factor.{}.remission'.format(name),
                        t_m.sample_r_from(dist_tob_r, smp_tob_r))

            if recovery == 0:
                # Cessation confers immediate recovery.
                write_table(
                    art_nm,
                    'risk_factor.{}.mortality_relative_risk'.format(name),
                    collapse_tobacco_mortality_rr(
                        t_nm.get_expected_mortality_rr(), name))
                write_table(
                    art_m,
                    'risk_factor.{}.mortality_relative_risk'.format(name),
                    collapse_tobacco_mortality_rr(
                        t_m.get_expected_mortality_rr(), name))
                write_table(
                    art_nm,
                    'risk_factor.{}.disease_relative_risk'.format(name),
                    collapse_tobacco_disease_rr(
                        t_nm.sample_disease_rr_from(smp_tob_dis_tbl)))
                write_table(
                    art_m,
                    'risk_factor.{}.disease_relative_risk'.format(name),
                    collapse_tobacco_disease_rr(
                        t_m.sample_disease_rr_from(smp_tob_dis_tbl)))
                write_table(
                    art_nm, 'risk_factor.{}.prevalence'.format(name),
                    collapse_tobacco_prevalence(
                        t_nm.get_expected_prevalence(), name))
                write_table(
                    art_m, 'risk_factor.{}.prevalence'.format(name),
                    collapse_tobacco_prevalence(
                        t_m.get_expected_prevalence(), name))
            else:
                write_table(
                    art_nm,
                    'risk_factor.{}.mortality_relative_risk'.format(name),
                    t_nm.get_expected_mortality_rr())
                write_table(
                    art_m,
                    'risk_factor.{}.mortality_relative_risk'.format(name),
                    t_m.get_expected_mortality_rr())
                write_table(
                    art_nm,
                    'risk_factor.{}.disease_relative_risk'.format(name),
                    t_nm.sample_disease_rr_from(smp_tob_dis_tbl))
                write_table(
                    art_m,
                    'risk_factor.{}.disease_relative_risk'.format(name),
                    t_m.sample_disease_rr_from(smp_tob_dis_tbl))
                write_table(art_nm, 'risk_factor.{}.prevalence'.format(name),
                            t_nm.get_expected_prevalence())
                write_table(art_m, 'risk_factor.{}.prevalence'.format(name),
                            t_m.get_expected_prevalence())

            logger.info('{} Tax effects (non-Maori)'.format(
                datetime.datetime.now().strftime("%H:%M:%S")))
            tob_elast_nm = t_nm.sample_price_elasticity_from(
                dist_tob_elast, smp_tob_elast)
            incidence_effect_col = 'incidence_effect'
            remission_effect_col = 'remission_effect'
            tob_tax_nm = t_nm.sample_tax_effects_from_elasticity_wide(
                tob_elast_nm)
            incidence_cols = [
                c for c in tob_tax_nm.columns if c != remission_effect_col
            ]
            remission_cols = [
                c for c in tob_tax_nm.columns if c != incidence_effect_col
            ]
            df = tob_tax_nm.loc[:, incidence_cols].rename(
                columns={incidence_effect_col: 'value'})
            write_table(art_nm,
                        'risk_factor.{}.tax_effect_incidence'.format(name),
                        df)
            df = tob_tax_nm.loc[:, remission_cols].rename(
                columns={remission_effect_col: 'value'})
            write_table(art_nm,
                        'risk_factor.{}.tax_effect_remission'.format(name),
                        df)
            del tob_tax_nm

            logger.info('{} Tax effects (Maori)'.format(
                datetime.datetime.now().strftime("%H:%M:%S")))
            # The Maori elasticity is the non-Maori elasticity scaled by
            # tob_elast_maori_scale, with uncertainty drawn from the scale
            # distribution defined above (previously an identical inline
            # Normal(sd_pcnt=10) duplicate was used here).
            tob_elast_m = t_m.scale_price_elasticity_from(
                tob_elast_nm, tob_elast_maori_scale,
                dist_tob_elast_maori_scale, smp_tob_elast_maori)
            tob_tax_m = t_m.sample_tax_effects_from_elasticity_wide(
                tob_elast_m)
            incidence_cols = [
                c for c in tob_tax_m.columns if c != remission_effect_col
            ]
            remission_cols = [
                c for c in tob_tax_m.columns if c != incidence_effect_col
            ]
            df = tob_tax_m.loc[:, incidence_cols].rename(
                columns={incidence_effect_col: 'value'})
            write_table(art_m,
                        'risk_factor.{}.tax_effect_incidence'.format(name),
                        df)
            df = tob_tax_m.loc[:, remission_cols].rename(
                columns={remission_effect_col: 'value'})
            write_table(art_m,
                        'risk_factor.{}.tax_effect_remission'.format(name),
                        df)
            del tob_tax_m
            del tob_elast_nm
            del tob_elast_m

        print(nm_artifact_file)
        print(m_artifact_file)
def assemble_artifacts(num_draws, output_path: Path, seed: int = RANDOM_SEED):
    """
    Assemble the data artifact required for the simulations.

    A single artifact file, ``pmslt_artifact.hdf``, is written to
    ``output_path``.

    Parameters
    ----------
    num_draws
        The number of random draws to sample for each rate and quantity,
        for the uncertainty analysis.
    output_path
        The path to the artifact being assembled.
    seed
        The seed for the pseudo-random number generator used to generate
        the random samples.
    """
    data_dir = get_data_dir('data')
    prng = np.random.RandomState(seed=seed)
    logger = logging.getLogger(__name__)

    # Instantiate the population and disease components.
    pop = Population(data_dir, YEAR_START)
    diseases = Diseases(data_dir, YEAR_START, pop.year_end)

    # Define data structures to record the samples from the unit interval
    # that are used to sample each rate/quantity, so that they can be
    # correlated across quantities.
    smp_yld = prng.random_sample(num_draws)
    smp_chronic_apc = {}
    smp_chronic_i = {}
    smp_chronic_r = {}
    smp_chronic_f = {}
    smp_chronic_yld = {}
    smp_chronic_prev = {}
    smp_acute_f = {}
    smp_acute_yld = {}
    smp_tob_dis_tbl = {}

    # Define the sampling distributions in terms of their family and their
    # *relative* standard deviation.
    dist_yld = LogNormal(sd_pcnt=10)
    dist_chronic_apc = Normal(sd_pcnt=0.5)
    dist_chronic_i = Normal(sd_pcnt=5)
    dist_chronic_r = Normal(sd_pcnt=5)
    dist_chronic_f = Normal(sd_pcnt=5)
    dist_chronic_yld = Normal(sd_pcnt=10)
    dist_chronic_prev = Normal(sd_pcnt=5)
    dist_acute_f = Normal(sd_pcnt=10)
    dist_acute_yld = Normal(sd_pcnt=10)

    logger.info('{} Generating samples'.format(
        datetime.datetime.now().strftime("%H:%M:%S")))

    for name, disease in diseases.chronic.items():
        # Draw samples for each rate/quantity for this disease.
        smp_chronic_apc[name] = prng.random_sample(num_draws)
        smp_chronic_i[name] = prng.random_sample(num_draws)
        smp_chronic_r[name] = prng.random_sample(num_draws)
        smp_chronic_f[name] = prng.random_sample(num_draws)
        smp_chronic_yld[name] = prng.random_sample(num_draws)
        smp_chronic_prev[name] = prng.random_sample(num_draws)
        # Also draw samples for the RR associated with tobacco smoking.
        smp_tob_dis_tbl[name] = prng.random_sample(num_draws)

    for name, disease in diseases.acute.items():
        # Draw samples for each rate/quantity for this disease.
        smp_acute_f[name] = prng.random_sample(num_draws)
        smp_acute_yld[name] = prng.random_sample(num_draws)
        # Also draw samples for the RR associated with tobacco smoking.
        smp_tob_dis_tbl[name] = prng.random_sample(num_draws)

    # Now write all of the required tables.
    artifact_fmt = 'pmslt_artifact.hdf'
    artifact_file = output_path / artifact_fmt

    logger.info('{} Generating artifacts'.format(
        datetime.datetime.now().strftime("%H:%M:%S")))

    # Initialise the artifact file (remove any stale copy).
    if artifact_file.exists():
        artifact_file.unlink()

    # Write the data tables to the artifact file.
    art = Artifact(str(artifact_file))

    logger.info('{} Writing population tables'.format(
        datetime.datetime.now().strftime("%H:%M:%S")))

    # Write the main population tables.
    write_table(art, 'population.structure', pop.get_population())
    write_table(art, 'cause.all_causes.disability_rate',
                pop.sample_disability_rate_from(dist_yld, smp_yld))
    write_table(art, 'cause.all_causes.mortality', pop.get_mortality_rate())

    # Write the chronic disease tables.
    for name, disease in diseases.chronic.items():
        logger.info('{} Writing tables for {}'.format(
            datetime.datetime.now().strftime("%H:%M:%S"), name))
        write_table(
            art, 'chronic_disease.{}.incidence'.format(name),
            disease.sample_i_from(dist_chronic_i, dist_chronic_apc,
                                  smp_chronic_i[name],
                                  smp_chronic_apc[name]))
        write_table(
            art, 'chronic_disease.{}.remission'.format(name),
            disease.sample_r_from(dist_chronic_r, dist_chronic_apc,
                                  smp_chronic_r[name],
                                  smp_chronic_apc[name]))
        write_table(
            art, 'chronic_disease.{}.mortality'.format(name),
            disease.sample_f_from(dist_chronic_f, dist_chronic_apc,
                                  smp_chronic_f[name],
                                  smp_chronic_apc[name]))
        write_table(
            art, 'chronic_disease.{}.morbidity'.format(name),
            disease.sample_yld_from(dist_chronic_yld, dist_chronic_apc,
                                    smp_chronic_yld[name],
                                    smp_chronic_apc[name]))
        write_table(
            art, 'chronic_disease.{}.prevalence'.format(name),
            disease.sample_prevalence_from(dist_chronic_prev,
                                           smp_chronic_prev[name]))

    # Write the acute disease tables.
    for name, disease in diseases.acute.items():
        logger.info('{} Writing tables for {}'.format(
            datetime.datetime.now().strftime("%H:%M:%S"), name))
        write_table(
            art, 'acute_disease.{}.mortality'.format(name),
            disease.sample_excess_mortality_from(dist_acute_f,
                                                 smp_acute_f[name]))
        write_table(
            art, 'acute_disease.{}.morbidity'.format(name),
            disease.sample_disability_from(dist_acute_yld,
                                           smp_acute_yld[name]))

    print(artifact_file)