def _get_cause_risk_pairs(gbd_round_id):
    """Return most-detailed cause-risk pairs for the scalars pipeline.

    For GBD round 4 the work is delegated to
    ``_get_cause_risk_pairs_gbd2016``. For every other round the cause,
    risk, and calculated-PAF strategy sets are read from the forecasting
    database, the calculated set is stacked with the directly-modeled PAF
    set, and the resulting pairs are annotated with ``acause`` and ``rei``.

    Args:
        gbd_round_id (int): gbd round id.

    Returns:
        pd.DataFrame: Most detailed cause-risk pairs, with columns
            ``cause_id``, ``rei_id``, ``acause`` and ``rei``.

    Raises:
        RuntimeError: If the sets of directly-modeled and calculated PAFs
            are not mutually exclusive.
    """
    if gbd_round_id == 4:
        return _get_cause_risk_pairs_gbd2016()

    engine = db.db_engine("fbd-dev-read", database="forecasting")
    session = sessionmaker(bind=engine)()
    try:
        cause_hierarchy_version_id = get_hierarchy_version_id(
            session, entity_type="cause", entity_set_id=CAUSE_SET_ID,
            gbd_round_id=gbd_round_id)
        causes = get_strategy_set(
            session, strategy_id=CAUSE_STRATEGY_ID,
            hierarchy_id=cause_hierarchy_version_id)

        rei_hierarchy_version_id = get_hierarchy_version_id(
            session, entity_type="risk", entity_set_id=REI_SET_ID,
            gbd_round_id=gbd_round_id)
        risks = get_strategy_set(
            session, strategy_id=REI_STRATEGY_ID,
            hierarchy_id=rei_hierarchy_version_id)

        cr_hierarchy_version_id = get_hierarchy_version_id(
            session, entity_type="cause_risk_pair",
            entity_set_id=CAUSE_RISK_SET_ID, gbd_round_id=gbd_round_id)
        calculated_paf_set = get_strategy_set(
            session, strategy_id=CAUSE_RISK_STRATEGY_ID,
            hierarchy_id=cr_hierarchy_version_id)

        directly_modeled_paf_set = get_directly_modeled_pafs(gbd_round_id)

        crs = pd.concat([calculated_paf_set, directly_modeled_paf_set])

        # Compare on the pair identifiers only. The directly-modeled frame
        # carries extra name columns (acause/rei), so a whole-row comparison
        # would see NaN-vs-name differences and never flag an overlap.
        if crs.duplicated(subset=["cause_id", "rei_id"]).any():
            err_msg = ("The sets of directly-modeled and calculated PAFs are "
                       "not mutually exclusive")
            LOGGER.error(err_msg)
            raise RuntimeError(err_msg)

        # Inner merges: pairs whose cause or risk is absent from the
        # respective strategy set are dropped.
        cause_risk_pairs = (
            crs[["cause_id", "rei_id"]]
            .merge(causes[["cause_id", "acause"]], on="cause_id")
            .merge(risks[["rei_id", "rei"]], on="rei_id")
        )
    finally:
        # Release the connection even when the exclusivity check raises.
        session.close()
        engine.dispose()

    return cause_risk_pairs
def get_vaccine_reis(gbd_round_id):
    """Return the risks that are interventions, e.g. vaccines such as dtp3.

    GBD round 4 uses a hard-coded tuple; every other round reads the
    intervention strategy set from the forecasting database.

    Args:
        gbd_round_id (int): Numeric ID for the GBD round.

    Returns:
        tuple: The reis of the risks that are interventions.
    """
    if gbd_round_id == 4:
        return "dtp3", "measles", "rota", "pcv", "hib"

    engine = db.db_engine("fbd-dev-read", database="forecasting")
    session = sessionmaker(bind=engine)()
    try:
        rei_hierarchy_version_id = get_hierarchy_version_id(
            session, entity_type="risk", entity_set_id=REI_SET_ID,
            gbd_round_id=gbd_round_id)
        intervention_set = get_strategy_set(
            session, strategy_id=REI_INTERVENTION_ID,
            hierarchy_id=rei_hierarchy_version_id)
        return tuple(intervention_set["rei"])
    finally:
        # Previously the session and engine were never released.
        session.close()
        engine.dispose()
def get_maybe_negative_paf_pairs(gbd_round_id):
    """Get cause-risk pairs that *can* have negative PAFs.

    These are the pairs for which the risk *can* be protective.

    Args:
        gbd_round_id (int): Numeric ID for the GBD round. Round 4 returns a
            hard-coded table; other rounds are read from the forecasting
            database.

    Returns:
        pd.DataFrame: One row per cause-risk pair. Includes ``cause_id``,
            ``rei_id``, ``acause`` and ``rei``.

    Raises:
        AssertionError: If any cause id has no acause, or any rei id has no
            rei, after the name lookups.
    """
    if gbd_round_id == 4:
        # Unfortunately these have to be hard-coded because we don't have
        # strategy sets for GBD 2016.
        return pd.DataFrame({
            "acause": ["cvd_ihd", "cvd_stroke_isch", "diabetes",
                       "neo_breast", "neuro_parkinsons"],
            "rei": ["drugs_alcohol", "drugs_alcohol", "drugs_alcohol",
                    "metab_bmi", "smoking_direct_prev"],
            "cause_id": [493, 495, 587, 429, 544],
            "rei_id": [102, 102, 102, 108, 166],
        })

    # NOTE(review): only reads happen here, yet the write credentials are
    # used -- confirm whether "fbd-dev-read" would suffice.
    engine = db.db_engine("fbd-dev-write", database="forecasting")
    session = sessionmaker(bind=engine)()
    try:
        cause_risk_hierarchy_version_id = get_hierarchy_version_id(
            session, entity_type="cause_risk_pair",
            entity_set_id=CAUSE_RISK_SET_ID, gbd_round_id=gbd_round_id)
        maybe_negative_paf_set = get_strategy_set(
            session, strategy_id=CAUSE_RISK_MAYBE_NEGATIVE_PAF_SET_ID,
            hierarchy_id=cause_risk_hierarchy_version_id)

        # Set only has cause_ids and rei_ids, so get acauses
        acause_cause_id_map = _acauses(
            maybe_negative_paf_set["cause_id"].unique())
        maybe_negative_paf_set_with_acause = maybe_negative_paf_set.merge(
            acause_cause_id_map, how="left")

        # Ensure that *all* cause-ids have acauses. A left merge leaves NaN
        # for unmapped ids, so every row must be non-null (``any`` would
        # pass as soon as a single row was mapped). AssertionError is raised
        # explicitly so the check is not stripped under ``python -O``.
        if not maybe_negative_paf_set_with_acause["acause"].notnull().all():
            raise AssertionError("Some causes don't have acauses")

        # ... and get reis.
        rei_rei_id_map = _reis(
            maybe_negative_paf_set_with_acause["rei_id"].unique())
        maybe_negative_paf_set_with_rei = (
            maybe_negative_paf_set_with_acause.merge(
                rei_rei_id_map, how="left"))

        # Ensure that *all* rei-ids have reis
        if not maybe_negative_paf_set_with_rei["rei"].notnull().all():
            raise AssertionError("Some reis don't have reis")
    finally:
        # Release the connection even when a validation check fails.
        session.close()
        engine.dispose()

    return maybe_negative_paf_set_with_rei
def _get_y_hat(acause, input_version, agg_version, measure, period, draws,
               gbd_round_id):
    """Gets expected value of cause specific mortality or yld rates.

    For modeled causes, if the data is split by sex, then it is assumed that
    it is in log rate space. If the data is not split by sex, then it is
    assumed that it is in normal rate space.

    For aggregate causes, it is assumed that the data is not split by sex and
    is saved in log rate space.

    The resulting y_hat is in log rate space.

    :param str acause: name of the target acause to aggregate to.
    :param str input_version: name of the mortality or yld version the
        aggregate is based on; used when ``acause`` is a modeled cause.
    :param str agg_version: name of the aggregate version; used when
        ``acause`` is an aggregated cause.
    :param measure: forwarded to the modeled/aggregated y_hat readers.
    :param period: forwarded to the modeled/aggregated y_hat readers.
    :param draws: forwarded to the modeled y_hat reader only.
    :param gbd_round_id: forwarded to the modeled/aggregated y_hat readers.
    :return xarray.DataArray: The expected value of the cause specific
        mortality or yld rate.
    """
    # read GK modeled-level (most-detailed) causes from database
    engine = db.db_engine(NAME, database=DATABASE)
    session = sessionmaker(bind=engine)()
    try:
        gk_causes = get_strategy_set(
            session, FATAL_GK_STRATEGY_ID, CAUSE_HIERARCHY_ID
        )["acause"].values
    finally:
        # The connection is only needed for the lookup above.
        session.close()
        engine.dispose()

    if acause in gk_causes:
        logger.info("{} is a modeled cause.".format(acause))
        y_hat = _get_modeled_y_hat(acause, input_version, measure, period,
                                   gbd_round_id, draws)
    else:
        logger.info("{} is an aggregated cause.".format(acause))
        y_hat = _get_aggregated_y_hat(acause, agg_version, measure, period,
                                      gbd_round_id)

    if isinstance(y_hat, xr.Dataset):
        # ``rename`` returns a new object; re-binding avoids the ``inplace``
        # keyword, which newer xarray releases no longer accept.
        if len(y_hat.data_vars) == 1:
            only_var = list(y_hat.data_vars.keys())[0]
            return y_hat.rename({only_var: "value"})["value"]
        logger.info("Using __xarray_dataarray_variable__, "
                    "but other data_vars are present! (probably just acause)")
        # NOTE(review): this path returns the whole Dataset (with a "value"
        # variable), not a DataArray as the docstring promises. Kept as-is
        # because callers may rely on it -- confirm.
        y_hat = y_hat.rename({"__xarray_dataarray_variable__": "value"})
    else:
        y_hat.name = "value"
    return y_hat
def get_directly_modeled_pafs(gbd_round_id):
    """Get cause-risk pairs that have directly-modeled PAFs.

    Args:
        gbd_round_id (int): Numeric ID for the GBD round. Round 4 is mapped
            to round 5 (GBD 2017 data) before the lookup.

    Returns:
        pd.DataFrame: The directly-modeled cause-risk strategy set, with
            ``acause`` and ``rei`` merged onto ``cause_id`` / ``rei_id``.

    Raises:
        AssertionError: If any cause id has no acause, or any rei id has no
            rei, after the name lookups.
    """
    # NOTE(review): only reads happen here, yet the write credentials are
    # used -- confirm whether "fbd-dev-read" would suffice.
    engine = db.db_engine("fbd-dev-write", database="forecasting")
    session = sessionmaker(bind=engine)()
    try:
        if gbd_round_id == 4:
            gbd_round_id = 5  # use gbd2017 data

        cause_risk_hierarchy_version_id = get_hierarchy_version_id(
            session, entity_type="cause_risk_pair",
            entity_set_id=CAUSE_RISK_SET_ID, gbd_round_id=gbd_round_id)
        directly_modeled_paf_set = get_strategy_set(
            session, strategy_id=CAUSE_RISK_DIRECTLY_MODELED_SET_ID,
            hierarchy_id=cause_risk_hierarchy_version_id)

        # Set only has cause_ids and rei_ids, so get acauses
        acause_cause_id_map = _acauses(
            directly_modeled_paf_set["cause_id"].unique())
        directly_modeled_paf_set_with_acause = directly_modeled_paf_set.merge(
            acause_cause_id_map, how="left")

        # Ensure that *all* cause-ids have acauses. A left merge leaves NaN
        # for unmapped ids, so every row must be non-null (``any`` would
        # pass as soon as a single row was mapped). AssertionError is raised
        # explicitly so the check is not stripped under ``python -O``.
        if not directly_modeled_paf_set_with_acause["acause"].notnull().all():
            raise AssertionError("Some causes don't have acauses")

        # ... and get reis.
        rei_rei_id_map = _reis(
            directly_modeled_paf_set_with_acause["rei_id"].unique())
        directly_modeled_paf_set_with_rei = (
            directly_modeled_paf_set_with_acause.merge(
                rei_rei_id_map, how="left"))

        # Ensure that *all* rei-ids have reis
        if not directly_modeled_paf_set_with_rei["rei"].notnull().all():
            raise AssertionError("Some reis don't have reis")
    finally:
        # Release the connection even when a validation check fails.
        session.close()
        engine.dispose()

    return directly_modeled_paf_set_with_rei
def main(version, years, draws, oos, subnational, dryrun=False):
    """Run all cause-specific cod models with the current modeling framework.

    One ``qsub`` job is submitted per (fatal GK cause, sex) combination for
    which the strategy set marks that sex as modeled. This file must be run
    from the `scripts` directory of fbd_cod.

    Args:
        version (str): version name to use for the current mortality run.
        years (fbd_core.argparse.YearRange): years to load and model, e.g.
            (1990:2017:2040).
        draws (int): how many draws to run through the pipeline
        oos (bool): whether to hold a time series out of sample for
            validation and comparison
        subnational (bool): whether or not to include the 93 subnational
            csu locations in the model
        dryrun (bool): when true, ``--dryrun`` is passed to each job (and
            appended to the job name).
    """
    # NOTE: This wont work if you install fbd_cod and attempt to run
    # this script from the command line, but it never did
    file_dir = os.path.dirname(os.path.realpath(__file__))
    execf_ = os.path.join(file_dir, "run_cod_model.py")
    make_run_log_file(version)

    years_str = str(years)
    threads = 30
    qsub_template = (
        "qsub -b y -l m_mem_free={memory}G -l fthread={threads} -q all.q "
        "-now no -P proj_forecasting "
        "-N {acause}_{sex}_{version}{dryrun} "
        "{python} {exec_file} -v "
        "-c {acause} -s {sex} --version {version} --years {years} "
        "--draws {draws} "
        "{sdi_interaction} {oos} {subnat} {dryrun}")

    # These flags do not depend on the cause or sex, so build them once.
    oos_arg = "--oos" if oos else ""
    subnat_arg = "--subnational" if subnational else ""
    dryrun_arg = "--dryrun" if dryrun else ""

    # create a db connection to get the strategy set for fatal GK causes
    engine = db.db_engine("fbd-dev-read", database="forecasting")
    session = sessionmaker(bind=engine)()
    try:
        cols_to_keep = ["acause", "male", "female"]
        fatal_gk_causes = get_strategy_set(session, 18, 303)[cols_to_keep]
    finally:
        # The connection is only needed for the lookup above.
        session.close()
        engine.dispose()

    for _, row in fatal_gk_causes.iterrows():
        acause = row["acause"]
        if acause in settings.INTERACTION_CAUSES:
            sdi_interaction = "--sdi-interaction"
        else:
            sdi_interaction = ""

        # Memory request in GB: only full 1000-draw runs need the large
        # allocations; ckd* and nutrition_pem need the most.
        if acause.startswith("ckd") or acause == "nutrition_pem":
            memory = 500 if draws == 1000 else 75
        else:
            memory = 400 if draws == 1000 else 75

        for sex_id, sex_name in settings.SEX_DICT.items():
            # Skip sexes that the strategy set does not flag for this cause.
            if row[sex_name] != 1:
                continue

            qsub = qsub_template.format(
                memory=memory, threads=threads, acause=acause, sex=sex_id,
                years=years_str, version=version, exec_file=execf_,
                draws=draws, python=settings.PYTHON_EXEC,
                sdi_interaction=sdi_interaction, oos=oos_arg,
                subnat=subnat_arg, dryrun=dryrun_arg)
            print(qsub)
            # Drain and close the pipe so the qsub process is reaped and the
            # file descriptor is released, instead of leaking one per job.
            with os.popen(qsub) as pipe:
                pipe.read()
def _get_aggregated_y_hat(acause, version, measure, period, gbd_round_id):
    """Gets expected value of cause specific mortality rates.

    For aggregate causes, it is assumed that the data is not split by sex and
    is saved in log rate space.

    When the children are added to form the aggregated acause result, the
    summation happens in normal space. Therefore, we must exponentiate the
    children's rates, add them up, and log them to get an aggregated y_hat in
    log rate space.

    The resulting y_hat is in log rate space.

    :param str acause: name of the target acause to aggregate to.
    :param str version: name of the aggregation version.
    :param measure: path component used to locate the children's files.
    :param period: path component used to locate the children's files.
    :param gbd_round_id: path component used to locate the children's files.
    :return xarray.DataArray: The expected value of the cause specific
        mortality rate.
    :raises IndexError: if ``acause`` is not in the fatal cause hierarchy.
    :raises Exception: if any child's ``*_hat.nc`` file is missing.
    :raises ValueError: if ``acause`` has no children to sum.
    """
    # connect to db and read in cause hierarchy
    engine = db.db_engine(NAME, database=DATABASE)
    session = sessionmaker(bind=engine)()
    try:
        # One fetch serves both the column subset and the tree construction.
        cause_hierarchy = get_hierarchy(session, "cause", CAUSE_HIERARCHY_ID)
        cause_strategy_set = get_strategy_set(
            session, FATAL_GK_STRATEGY_ID, CAUSE_HIERARCHY_ID)
    finally:
        # Nothing below touches the database.
        session.close()
        engine.dispose()

    all_causes = cause_hierarchy[["acause", "cause_id", "parent_id"]]

    # subset to just fatal causes
    cause_tree, node_map = subset_fatal.make_hierarchy_tree(
        cause_hierarchy, 294, "cause_id")
    fatal_subset = subset_fatal.include_up_hierarchy(
        cause_tree, node_map, cause_strategy_set["cause_id"].values)
    fatal_causes = all_causes[all_causes.cause_id.isin(fatal_subset)]

    cause_id = fatal_causes[fatal_causes.acause == acause].cause_id.values[0]
    children = fatal_causes.query(
        "parent_id == {}".format(cause_id))["acause"].values
    logger.info("y_hat is a sum of children: {}".format(children))

    # Create a list of child acause files which are not external causes and
    # check to make sure all the ones we want to sum up are actually present.
    potential_child_files = [
        FBDPath("/{gri}/{p}/{m}/{v}/{c}_hat.nc".format(
            gri=gbd_round_id, p=period, m=measure, v=version, c=child),
            root_dir="scratch")
        for child in children
        if child not in ("_all", "_none")
    ]
    child_files = [
        str(child_file)
        for child_file in potential_child_files
        if child_file.exists()
    ]
    if len(potential_child_files) != len(child_files):
        logger.error("You are missing files, bud. {} vs {}".format(
            potential_child_files, child_files))
        raise Exception("Missing y_hat files!")
    logger.debug("Summing these files: {}".format(child_files))

    exp_y_hat_sum = None
    for child_file in child_files:
        logger.info("Adding {}".format(child_file))
        exp_y_hat = xr.ufuncs.exp(
            xr.open_dataarray(child_file, drop_variables=["measure", "cov"]))
        if exp_y_hat_sum is None:
            exp_y_hat_sum = exp_y_hat
        else:
            # Align the running sum and the new child on the union of their
            # dimensions; cells absent from one side contribute zero.
            exp_y_hat_broadcasted = xr.broadcast(exp_y_hat_sum, exp_y_hat)
            exp_y_hat_broadcasted = [
                data.fillna(0.) for data in exp_y_hat_broadcasted
            ]
            exp_y_hat_sum = sum(exp_y_hat_broadcasted)

    if exp_y_hat_sum is None:
        # Without this guard the log below would be applied to None.
        raise ValueError(
            "No child y_hat files to aggregate for {}".format(acause))

    y_hat = xr.ufuncs.log(exp_y_hat_sum)
    y_hat.coords["acause"] = acause
    return y_hat
smoothing = { 0: ["location_id", "sex_id", "age_group_id"], 1: ["location_id", "sex_id", "age_group_id"], 2: ["region_id", "sex_id", "age_group_id"], 3: ["super_region_id", "sex_id", "age_group_id"], "modeled": ["super_region_id", "sex_id", "age_group_id"] } # create a session to read in the relevant causes engine = db.db_engine(NAME, DATABASE) session = sessionmaker(bind=engine)() all_causes = get_hierarchy(session, "cause", CAUSE_HIERARCHY_ID)[[ "acause", "cause_id", "parent_id", "level" ]] cause_strategy_set = get_strategy_set(session, GK_STRATEGY_SET_ID, CAUSE_HIERARCHY_ID) cause_hierarchy = get_hierarchy(session, "cause", CAUSE_HIERARCHY_ID) cause_tree, node_map = subset_fatal.make_hierarchy_tree( cause_hierarchy, 294, "cause_id") fatal_subset = subset_fatal.include_up_hierarchy( cause_tree, node_map, cause_strategy_set["cause_id"].values) fatal_causes = all_causes[all_causes.cause_id.isin(fatal_subset)] modeled_causes = fatal_causes.query("cause_id not in parent_id.values") aggregate_causes = fatal_causes.query("cause_id in parent_id.values") # Grab causes that are not modeled. There should be no level 4 causes. assert len(aggregate_causes.query("level == 4")) == 0 hold_jids = ["1"] arima_jids = ["1"] for level in ["modeled", 3, 2, 1, 0]: