def run_full_models_for_mcmc(
    burn_in: int, src_db_path: str, dest_db_path: str, build_model, params: dict
):
    """
    Run the full baseline model and all scenarios for all accepted MCMC runs
    in the source database.
    """
    src_db = Database(src_db_path)
    dest_db = Database(dest_db_path)
    logger.info("Copying mcmc_run table to %s", dest_db_path)
    mcmc_run_df = src_db.query("mcmc_run")

    # Apply burn-in and save the remaining runs to the destination database.
    burned_runs_str = ", ".join(mcmc_run_df[:burn_in].idx)
    logger.info("Burned MCMC runs %s", burned_runs_str)
    mcmc_run_df = mcmc_run_df[burn_in:]
    dest_db.dump_df("mcmc_run", mcmc_run_df)
    mcmc_runs = list(mcmc_run_df.T.to_dict().values())
    for mcmc_run in mcmc_runs:
        meta = {k: v for k, v in mcmc_run.items() if k in META_COLS}
        if not meta["accept"]:
            logger.info("Ignoring non-accepted MCMC run %s", meta["idx"])
            continue

        logger.info("Running full model for MCMC run %s", meta["idx"])
        param_updates = {k: v for k, v in mcmc_run.items() if k not in META_COLS}
        run_idx = meta["idx"].split("_")[-1]

        def update_func(ps: dict):
            return update_params(ps, param_updates)

        with Timer("Running model scenarios"):
            num_scenarios = 1 + len(params["scenarios"].keys())
            scenarios = []
            for scenario_idx in range(num_scenarios):
                scenario = Scenario(build_model, scenario_idx, params)
                scenarios.append(scenario)

            # Run the baseline scenario.
            baseline_scenario = scenarios[0]
            baseline_scenario.run(update_func=update_func)
            baseline_model = baseline_scenario.model

            # Run all the other scenarios, starting from the baseline model.
            for scenario in scenarios[1:]:
                scenario.run(base_model=baseline_model, update_func=update_func)

        with Timer("Saving model outputs to the database"):
            models = [s.model for s in scenarios]
            store_run_models(models, dest_db_path, run_idx=run_idx)

    logger.info("Finished running full models for all accepted MCMC runs.")

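# A minimal usage sketch for run_full_models_for_mcmc, assuming a calibration
# database already exists on disk. The paths, the `build_covid_model` factory
# and the `params` dict below are illustrative assumptions, not part of this
# module; the only structure the function itself relies on is the "scenarios" key.
#
#   params = {
#       "default": {"contact_rate": 0.05},
#       "scenarios": {1: {"contact_rate": 0.02}},  # scenario 1, plus baseline 0
#   }
#   run_full_models_for_mcmc(
#       burn_in=50,
#       src_db_path="data/calibration-chain-0.db",
#       dest_db_path="data/full-model-runs.db",
#       build_model=build_covid_model,
#       params=params,
#   )
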
def preprocess_demography(input_db: Database):
    loc_df = read_location_df()
    pop_df = read_population_df(loc_df)
    birth_df = read_crude_birth_df(loc_df)
    death_df = read_death_df(loc_df)
    expect_df = read_life_expectancy_df(loc_df)
    input_db.dump_df("countries", loc_df)
    input_db.dump_df("population", pop_df)
    input_db.dump_df("birth_rates", birth_df)
    input_db.dump_df("deaths", death_df)
    input_db.dump_df("life_expectancy", expect_df)
    return loc_df

def preprocess_mobility(input_db: Database, country_df):
    """
    Read Google Mobility data from CSV into the input database.
    """
    mob_df = pd.read_csv(MOBILITY_CSV_PATH)
    dhhs_cluster_mobility = reshape_to_clusters(mob_df)

    # Drop all sub-region 2 data: it is too detailed.
    major_region_mask = mob_df["sub_region_2"].isnull() & mob_df["metro_area"].isnull()
    davao_mask = mob_df.metro_area == "Davao City Metropolitan Area"
    mob_df = mob_df[major_region_mask | davao_mask].copy()

    # These regions are the same place under different names.
    mob_df.loc[mob_df.sub_region_1 == "National Capital Region", "sub_region_1"] = "Metro Manila"
    mob_df.loc[mob_df.metro_area == "Davao City Metropolitan Area", "sub_region_1"] = "Davao City"
    mob_df.loc[
        mob_df.sub_region_1 == "Federal Territory of Kuala Lumpur", "sub_region_1"
    ] = "Kuala Lumpur"

    # DataFrame.append is removed in pandas 2.x; concat is the equivalent.
    mob_df = pd.concat([mob_df, dhhs_cluster_mobility])

    # Drop all rows that have NA values in 1 or more mobility columns.
    mob_cols = [c for c in mob_df.columns if c.endswith(MOBILITY_SUFFIX)]
    mask = False
    for c in mob_cols:
        mask = mask | mob_df[c].isnull()

    mob_df = mob_df[~mask].copy()
    for c in mob_cols:
        # Convert percent change values to a decimal multiplier: 1.0 is no change.
        mob_df[c] = mob_df[c].apply(lambda x: 1 + x / 100)

    # Drop unused columns and rename the kept columns.
    cols_to_keep = [*mob_cols, "country_region", "sub_region_1", "date"]
    cols_to_drop = [c for c in mob_df.columns if c not in cols_to_keep]
    mob_df = mob_df.drop(columns=cols_to_drop)
    mob_col_rename = {c: c.replace(MOBILITY_SUFFIX, "") for c in mob_cols}
    mob_df.rename(columns={**mob_col_rename, "sub_region_1": "region"}, inplace=True)

    # Convert country names to ISO3 codes.
    countries = mob_df["country_region"].unique().tolist()
    iso3s = {c: get_iso3(c, country_df) for c in countries}
    iso3_series = mob_df["country_region"].apply(lambda c: iso3s[c])
    mob_df.insert(0, "iso3", iso3_series)
    mob_df = mob_df.drop(columns=["country_region"])
    mob_df = mob_df.sort_values(["iso3", "region", "date"])
    input_db.dump_df("mobility", mob_df)

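# A self-contained sketch of the percent-to-decimal conversion above, using a toy
# frame in place of the Google Mobility CSV. The column name is a hypothetical
# stand-in; only the `1 + x / 100` conversion mirrors the real code, mapping e.g.
# a -50% change in visits to a 0.5 multiplier and 0% to 1.0 (no change).
def _demo_percent_to_multiplier():
    import pandas as pd

    toy = pd.DataFrame({"workplaces_percent_change_from_baseline": [-50.0, 0.0, 25.0]})
    toy["workplaces"] = toy["workplaces_percent_change_from_baseline"].apply(
        lambda x: 1 + x / 100
    )
    assert toy["workplaces"].tolist() == [0.5, 1.0, 1.25]
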
def test_plot_uncertainty(tmp_path):
    """
    Ensure uncertainty plotting code works.
    """
    output_dir = tmp_path
    powerbi_db_path = os.path.join(tmp_path, "powerbi.db")
    targets = {
        "incidence": {
            "output_key": "incidence",
            "title": "incidence",
            "times": [],
            "values": [],
            "quantiles": [0.25, 0.5, 0.75],
        },
        "foo": {
            "output_key": "foo",
            "title": "foo",
            "times": [],
            "values": [],
            "quantiles": [0.25, 0.5, 0.75],
        },
    }
    funcs = [
        lambda t: 2 * t + random.random(),
        lambda t: t ** 3 + random.random(),
    ]
    # Build data for plotting.
    do_df, mcmc_df, _ = build_synthetic_calibration(
        targets, funcs, chains=2, runs=20, times=20
    )
    unc_df = calculate_mcmc_uncertainty(mcmc_df, do_df, targets)

    # Create database for plotting.
    db = Database(powerbi_db_path)
    db.dump_df("mcmc_run", mcmc_df)
    db.dump_df("derived_outputs", do_df)
    db.dump_df("uncertainty", unc_df)

    # Create plots.
    plot_uncertainty(targets, powerbi_db_path, output_dir)

    # Check that the expected plot files were created.
    expected_foo_path = os.path.join(tmp_path, "foo", "uncertainty-foo-0.png")
    expected_incidence_path = os.path.join(tmp_path, "incidence", "uncertainty-incidence-0.png")
    assert os.path.exists(expected_foo_path)
    assert os.path.exists(expected_incidence_path)

def preprocess_our_world_in_data(input_db: Database):
    df = pd.read_csv(OUR_WORLD_IN_DATA_CSV_PATH)
    # Replace the one strange value for test numbers in Malaysia.
    df.loc[(df.iso_code == "MYS") & (df.new_tests > 1e5), "new_tests"] = np.nan
    input_db.dump_df("owid", df)

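# A minimal sketch of the masked-assignment pattern used above, on a toy frame:
# only rows matching both conditions have their value replaced with NaN; everything
# else is untouched. The data here is illustrative.
def _demo_masked_nan_replacement():
    import numpy as np
    import pandas as pd

    toy = pd.DataFrame({"iso_code": ["MYS", "MYS", "AUS"], "new_tests": [2e5, 5e3, 2e5]})
    toy.loc[(toy.iso_code == "MYS") & (toy.new_tests > 1e5), "new_tests"] = np.nan
    assert toy["new_tests"].isnull().sum() == 1  # only the first MYS row is cleared
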
def preprocess_covid_phl(input_db: Database):
    df = pd.read_csv(COVID_PHL_CSV_PATH)
    df = create_region_aggregates(df)
    input_db.dump_df("covid_phl", df)

def preprocess_covid_au(input_db: Database):
    df = pd.read_csv(COVID_AU_CSV_PATH)
    input_db.dump_df("covid_au", df)
    df = pd.read_csv(COVID_LGA_CSV_PATH)
    df = reshape_to_clusters(df)
    input_db.dump_df("covid_dhhs_test", df)

def preprocess_social_mixing(input_db: Database, country_df):
    for location in LOCATIONS:
        for sheet_number, header_arg in SHEET_NUMBERS:
            sheet_name = f"MUestimates_{location}_{sheet_number}.xlsx"
            sheet_path = os.path.join(MIXING_DIRPATH, sheet_name)
            xl = pd.ExcelFile(sheet_path)
            sheet_names = xl.sheet_names
            iso3s = [get_iso3(n, country_df) for n in sheet_names]
            for idx, sheet_name in enumerate(sheet_names):
                iso3 = iso3s[idx]
                mix_df = pd.read_excel(xl, header=header_arg, sheet_name=sheet_name)
                if sheet_number == "2":
                    # These sheets have no header row, so name the columns X1..X16.
                    renames = {n - 1: f"X{n}" for n in range(1, 17)}
                    mix_df.rename(columns=renames, inplace=True)

                mix_df.insert(0, "location", [location for _ in range(len(mix_df))])
                mix_df.insert(0, "iso3", [iso3 for _ in range(len(mix_df))])
                input_db.dump_df("social_mixing", mix_df)

    # Next-gen social mixing data.
    original_mm = input_db.query("social_mixing")
    df = pd.read_csv(os.path.join(MIXING_DIRPATH, "synthetic_contacts_2020.csv"))
    df = df[df.setting == "overall"]
    df.drop(columns="setting", inplace=True)
    df.replace(
        {
            "0 to 4": "00 to 04",
            "5 to 9": "05 to 09",
            "all": "all_locations",
            "others": "other_locations",
        },
        inplace=True,
    )
    # The contactor is in j (columns) and the contactee is in i (rows).
    # N.B. "age_cotactee" (sic) matches the column spelling in the source CSV.
    df = df.pivot_table(
        index=["iso3c", "location_contact", "age_cotactee"],
        columns="age_contactor",
        values="mean_number_of_contacts",
    )
    df = df.reset_index()
    df.drop(columns="age_cotactee", inplace=True)

    # Rename the age columns to X1..X16 to match the original mixing table.
    cols = list(df.columns[2:])
    new_col = ["X" + str(x) for x in range(1, len(cols) + 1)]
    replace_col = dict(zip(cols, new_col))
    df.rename(columns=replace_col, inplace=True)
    df.rename(columns={"iso3c": "iso3", "location_contact": "location"}, inplace=True)

    # Fall back to the original mixing matrices for countries missing from the
    # 2020 data. DataFrame.append is removed in pandas 2.x; concat is the equivalent.
    iso3_diff = set(original_mm.iso3).difference(df.iso3)
    iso3_mask = original_mm.iso3.isin(iso3_diff)
    df = pd.concat([df, original_mm[iso3_mask]], ignore_index=True)
    input_db.dump_df("social_mixing_2020", df)

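# A self-contained sketch of the pivot step above: long-format contact data is
# reshaped so contactee ages index the rows and contactor ages become the columns.
# The toy values and the two age groups are illustrative assumptions.
def _demo_contact_pivot():
    import pandas as pd

    long_df = pd.DataFrame(
        {
            "iso3c": ["AUS"] * 4,
            "location_contact": ["all_locations"] * 4,
            "age_cotactee": ["00 to 04", "00 to 04", "05 to 09", "05 to 09"],
            "age_contactor": ["00 to 04", "05 to 09", "00 to 04", "05 to 09"],
            "mean_number_of_contacts": [1.2, 0.4, 0.5, 2.1],
        }
    )
    wide_df = long_df.pivot_table(
        index=["iso3c", "location_contact", "age_cotactee"],
        columns="age_contactor",
        values="mean_number_of_contacts",
    ).reset_index()
    # One row per contactee age group; one column per contactor age group.
    assert wide_df.shape == (2, 5)  # 3 id columns + 2 contactor age columns
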
def test_plot_post_calibration(tmp_path):
    plot_dir = tmp_path
    mcmc_dir_path = os.path.join(tmp_path, "mcmc")
    os.makedirs(mcmc_dir_path)
    targets = {
        "incidence": {
            "output_key": "incidence",
            "title": "incidence",
            "times": [],
            "values": [],
            "quantiles": [0.25, 0.5, 0.75],
        },
        "foo": {
            "output_key": "foo",
            "title": "foo",
            "times": [],
            "values": [],
            "quantiles": [0.25, 0.5, 0.75],
        },
    }
    # A dummy prior to pass posterior checks.
    priors = [
        {
            "param_name": "contact_rate",
            "distribution": "uniform",
            "distri_params": [0.01, 0.03],
        }
    ]
    funcs = [
        lambda t: 2 * t + random.random(),
        lambda t: t ** 3 + random.random(),
    ]
    # Build data for plotting.
    do_df, mcmc_df, params_df = build_synthetic_calibration(
        targets, funcs, chains=2, runs=20, times=20
    )
    chains = set(mcmc_df["chain"].tolist())

    # Create one database per chain for plotting.
    for chain in chains:
        db_path = os.path.join(mcmc_dir_path, f"chain-{chain}.db")
        db = Database(db_path)
        db.dump_df("mcmc_run", mcmc_df[mcmc_df["chain"] == chain])
        db.dump_df("mcmc_params", params_df[params_df["chain"] == chain])
        db.dump_df("derived_outputs", do_df[do_df["chain"] == chain])

    # Create plots.
    plot_post_calibration(targets, mcmc_dir_path, plot_dir, priors)

    # Check plots - do a super basic check that outputs exist and are non-empty.
    expected_files = [
        "burn-in.png",
        "loglikelihood-traces.png",
        "acceptance_ratio.png",
        "params-traces",
        "calibration-fit",
        "params-vs-loglikelihood",
        "posteriors",
    ]
    for fname in expected_files:
        p = os.path.join(plot_dir, fname)
        assert os.path.exists(p)
        if os.path.isdir(p):
            assert len(os.listdir(p)) > 0