def assemble_time_series(df):
    """ build a contiguous daily time series of deltas and cumulative totals from raw case data """
    ts = get_time_series(df)
    deltas = ts[schema.keys()]\
        .rename(columns = schema)
    # reindex over the full date range so days with no reports appear as zeros
    deltas = deltas.reindex(pd.date_range(deltas.index.min(), deltas.index.max()), fill_value = 0)
    # merge daily deltas with running totals; renaming by column[1] maps e.g. "dT" -> "T"
    merged = deltas.merge(deltas.cumsum(axis = 0).rename(columns = lambda _: _[1]), left_index = True, right_index = True).astype(int)
    merged.index.name   = "date"
    merged.columns.name = None
    return merged
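# usage sketch (hedged): `schema` is assumed to be a module-level mapping from
# raw status columns to delta names, e.g. {"Hospitalized": "dT", "Recovered": "dR", "Deceased": "dD"},
# so the cumsum rename above yields cumulative columns "T", "R", "D"
# full_ts = assemble_time_series(df)
# full_ts[["dT", "T"]].tail()  # daily deltas alongside cumulative confirmed cases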
def get_state_timeseries(states=["Tamil Nadu"], download: bool = False) -> pd.DataFrame: paths = { "v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, 25)] } if download: for target in paths['v3'] + paths['v4']: download_data(data, target) return load_all_data(v3_paths = [data/filepath for filepath in paths['v3']], v4_paths = [data/filepath for filepath in paths['v4']])\ .query("detected_state in @states" if states != "*" else "detected_state != 'NULL'", engine = "python")\ .pipe(lambda _: get_time_series(_, ["detected_state", "detected_district"]))\ .drop(columns = ["date", "time", "delta", "logdelta"])\ .rename(columns = { "Deceased": "dD", "Hospitalized": "dT", "Recovered": "dR" })
def get_state_timeseries( states = "*", download: bool = False, aggregation_cols = ["detected_state", "detected_district"], last_API_file: int = 27) -> pd.DataFrame: """ load state- and district-level data, downloading source files if specified """ paths = {"v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, last_API_file)]} if download: for target in paths['v3'] + paths['v4']: download_data(data, target) return load_all_data(v3_paths = [data/filepath for filepath in paths['v3']], v4_paths = [data/filepath for filepath in paths['v4']])\ .query("detected_state in @states" if states != "*" else "detected_state != 'NULL'")\ .pipe(lambda _: get_time_series(_, aggregation_cols))\ .drop(columns = ["date", "time", "delta", "logdelta"])\ .rename(columns = { "Deceased": "dD", "Hospitalized": "dT", "Recovered": "dR" })
plt.set_theme("substack") # define data versions for api files paths = { "v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, 26)] } # for target in paths['v3'] + paths['v4']: # download_data(data, target) df = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']], v4_paths=[data / filepath for filepath in paths['v4']]) data_recency = str(df["date_announced"].max()).split()[0] run_date = str(pd.Timestamp.now()).split()[0] ts = get_time_series(df, "detected_state") states = [ "Maharashtra", "Punjab", "West Bengal", "Bihar", "Delhi", "Andhra Pradesh", "Telangana", "Tamil Nadu", "Madhya Pradesh" ] for state in states[:1]: print(state) print(" + running estimation...") (inf_dates, inf_Rt_pred, inf_Rt_CI_upper, inf_Rt_CI_lower, inf_T_pred, inf_T_CI_upper, inf_T_CI_lower, inf_total_cases, inf_new_cases_ts, inf_anomalies, inf_anomaly_dates) = analytical_MPVS( ts.loc[state].Hospitalized, CI=CI,
plt.sca(lax.twinx())
plt.plot(df["TT"][:, "delta", "confirmed"].index, smoothed(df["TT"][:, "delta", "confirmed"].values), label = "Daily Cases", color = plt.PRED_PURPLE)
plt.legend(loc = 'upper right')
plt.PlotDevice().ylabel("new cases", rotation = -90, labelpad = 50)
plt.ylim(bottom = 0)
plt.sca(lax)
plt.show()

# cases vs deaths
from pathlib import Path
data = Path("./data")
paths = {"v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, 27)]}
for target in paths['v3'] + paths['v4']:
    download_data(data, target)
df = load_all_data(v3_paths = [data/filepath for filepath in paths['v3']], v4_paths = [data/filepath for filepath in paths['v4']])\
    .pipe(lambda _: get_time_series(_, ["detected_state"]))\
    .drop(columns = ["date", "time", "delta", "logdelta"])\
    .rename(columns = {"Deceased": "dD", "Hospitalized": "dT", "Recovered": "dR"})\
    .sum(level = -1).sort_index()
plt.plot(df.index, smoothed(df.dD.values), label = "Daily Deaths", color = plt.RED)
plt.text(s = "national lockdown", x = pd.to_datetime("April 27, 2020"), y = 200, fontdict = plt.theme.note, ha = "center", va = "top")
plt.legend(loc = 'upper left')
plt.ylim(bottom = 0)
lax = plt.gca()
plt.sca(lax.twinx())
plt.plot(df.index, smoothed(df.dT.values), label = "Daily Cases", color = plt.PRED_PURPLE)
plt.legend(loc = 'upper right')
CI = 0.95

paths = {"v3": [data_path(_) for _ in (1, 2)], "v4": [data_path(_) for _ in range(3, 18)]}
for target in paths['v3'] + paths['v4']:
    download_data(data, target)
dfn = load_all_data(
    v3_paths = [data / filepath for filepath in paths['v3']],
    v4_paths = [data / filepath for filepath in paths['v4']])
delay = pd.read_csv(data / "bihar_delay.csv").set_index("delay")

state_ts = get_time_series(dfn, "detected_state").loc["Bihar"].Hospitalized
# state_ts = delay_adjust(state_ts, np.squeeze(delay.values))
state_ts = state_ts[state_ts.index >= "2020-03-26"]
district_names, population_counts, _ = etl.district_migration_matrix(data / "Migration Matrix - District.csv")
populations = dict(zip(district_names, population_counts))

# first, look at state level predictions
(dates, Rt_pred, Rt_CI_upper, Rt_CI_lower, T_pred, T_CI_upper, T_CI_lower,
 total_cases, new_cases_ts, anomalies, anomaly_dates) = analytical_MPVS(
    state_ts, CI = CI, smoothing = notched_smoothing(window = smoothing), totals = False)
plt.Rt(dates, Rt_pred[1:], Rt_CI_upper[1:], Rt_CI_lower[1:], CI, ymin = 0, ymax = 3)
# define data versions for API files
paths = {"v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in (3, 4, 5, 6, 7, 8)]}

# download data from the covid19india.org API
for target in paths['v3'] + paths['v4']:
    download_data(data, target)
df = load_all_data(
    v3_paths = [data / filepath for filepath in paths['v3']],
    v4_paths = [data / filepath for filepath in paths['v4']])
data_recency = str(df["date_announced"].max()).split()[0]
run_date     = str(pd.Timestamp.now()).split()[0]

ts = get_time_series(df[df.detected_state == "Delhi"])
(dates, RR_pred, RR_CI_upper, RR_CI_lower, T_pred, T_CI_upper, T_CI_lower,
 total_cases, new_cases_ts, anomalies, anomaly_dates) = analytical_MPVS(
    ts.delta[ts.delta > 0], CI = CI, smoothing = convolution(window = smoothing))
# = analytical_MPVS(ts.Hospitalized[ts.Hospitalized > 0], CI = CI, smoothing = lambda ts: box_filter(ts, smoothing, 10))

np.random.seed(33)
delhi = Model([ModelUnit("Delhi", 18_000_000, I0 = T_pred[-1], RR0 = RR_pred[-1], mobility = 0)])
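# forward-simulation sketch (hedged: mirrors the Bihar.run(...) call later in
# this collection): 14-day horizon with a 1x1 zero migration matrix
# delhi.run(14, np.zeros((1, 1)))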
"v4": [ "raw_data3.csv", "raw_data4.csv", "raw_data5.csv", "raw_data6.csv", "raw_data7.csv", "raw_data8.csv", "raw_data9.csv", "raw_data10.csv", "raw_data11.csv" ] } # download data from india covid 19 api for target in paths['v3'] + paths['v4']: download_data(data, target) # run rolling regressions on historical national case data dfn = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']], v4_paths=[data / filepath for filepath in paths['v4']]) data_recency = str(dfn["date_announced"].max()).split()[0] tsn = get_time_series(dfn) grn = estimate(tsn, smoothing) # disaggregate down to states tss = get_time_series(dfn, 'detected_state').loc[states] grs = tss.groupby(level=0).apply(lambda ts: estimate(ts, smoothing)) # voluntary and mandatory reproductive numbers Rvn = np.mean(grn["2020-03-24":"2020-03-31"].R) Rmn = np.mean(grn["2020-04-01":].R) Rvs = { s: np.mean(grs.loc[s].loc["2020-03-24":"2020-03-31"].R) if s in grs.index else Rvn for s in states
"v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, 26)] } for target in paths['v3'] + paths['v4']: try: download_data(data, target) except: pass df = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']], v4_paths=[data / filepath for filepath in paths['v4']]) data_recency = str(df["date_announced"].max()).split()[0] run_date = str(pd.Timestamp.now()).split()[0] ts = get_time_series(df, ["detected_state", "detected_district"]) focus = ts.loc[[ "Maharashtra", "Madhya Pradesh", "Gujarat", "West Bengal", "Tamil Nadu" ]] district_estimates = [] for (state, district) in focus.index.droplevel(-1).unique(): if district in ["Unknown", "Other State"]: continue print(state, district) try: (dates, Rt_pred, RR_CI_upper, RR_CI_lower, T_pred, T_CI_upper, T_CI_lower, total_cases, new_cases_ts, anomalies, anomaly_dates) = analytical_MPVS( focus.loc[state, district].Hospitalized,
"v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, 26)] } # for target in paths['v3'] + paths['v4']: # # try: # # download_data(data, target) # # except: # # pass df = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']], v4_paths=[data / filepath for filepath in paths['v4']]) data_recency = str(df["date_announced"].max()).split()[0] run_date = str(pd.Timestamp.now()).split()[0] ts = get_time_series(df) #, ["detected_state", "detected_district"]) one_day = pd.Timedelta(days=1) # fig 1 infections = ts[ ts.date >= "May 01, 2020"].Hospitalized #.sum(level = 2).sort_index() smoothed = convolution("uniform") scatter = plt.scatter(infections.index[:-7], infections.values[:-7], color="#CC4C75", marker="s", s=5, alpha=0.5) lineplot, = plt.plot(infections.index[:-7],
# set to cloud temp directory if not explicitly told to run locally
root = cwd() if len(sys.argv) > 1 and sys.argv[1] == "--local" else Path("/tmp")
data = root / "data"

# model details
gamma     = 0.2
smoothing = 10
CI        = 0.95

download_data(data, 'state_wise_daily.csv')
state_df = load_statewise_data(data / "state_wise_daily.csv")
country_time_series = get_time_series(state_df)

estimates  = []
timeseries = []

# country level
(dates, RR_pred, RR_CI_upper, RR_CI_lower, T_pred, T_CI_upper, T_CI_lower,
 total_cases, new_cases_ts, anomalies, anomaly_dates) = analytical_MPVS(
    country_time_series["Hospitalized"].iloc[:-1], CI = CI, smoothing = notched_smoothing(window = smoothing))
country_code = state_name_lookup["India"]
for row in zip(dates, RR_pred, RR_CI_upper, RR_CI_lower):
    timeseries.append((country_code, *row))
"v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, 27)] } for target in paths['v3'] + paths['v4']: try: download_data(data, target) except: pass df = load_all_data(v3_paths=[data / filepath for filepath in paths['v3']], v4_paths=[data / filepath for filepath in paths['v4']]) data_recency = str(df["date_announced"].max()).split()[0] run_date = str(pd.Timestamp.now()).split()[0] ts = get_time_series(df, "detected_state") states = [ "Tamil Nadu", "Karnataka" ] #["Maharashtra", "Punjab", "West Bengal", "Bihar", "Delhi", "Andhra Pradesh", "Telangana", "Tamil Nadu", "Madhya Pradesh"] for state in states: print(state) print(" + running estimation...") (dates, Rt_pred, RR_CI_upper, RR_CI_lower, T_pred, T_CI_upper, T_CI_lower, total_cases, new_cases_ts, anomalies, anomaly_dates) = analytical_MPVS( ts.loc[state].Hospitalized, CI=CI, smoothing=notched_smoothing(window=smoothing), totals=False) estimates = pd.DataFrame(
from scipy.signal import convolve, filtfilt, iirnotch

def notch_filter(ts):
    # (function head reconstructed as an assumption: two IIR notch filters at
    #  the weekly and twice-weekly frequencies remove day-of-week reporting cycles)
    fs, f0, Q = 1, 1/7, 1
    b1, a1 = iirnotch(f0, Q, fs)
    b2, a2 = iirnotch(2 * f0, 2 * Q, fs)
    b = convolve(b1, b2)
    a = convolve(a1, a2)
    notched = pd.Series(filtfilt(b, a, ts))
    notched.index = ts.index
    return notched

root = cwd()
data = mkdir(root / "data")
figs = mkdir(root / "figs")

###########################################################
# download latest case data
download_data(data, 'state_wise_daily.csv')
df = load_statewise_data(data / "state_wise_daily.csv")
ts = get_time_series(df, "state")

###########################################################
# load delay data
api_diff = pd.read_csv(data / "daily_diff.csv", parse_dates = ["status_change_date", "report_date"], dayfirst = True)
delay = api_diff[(api_diff.current_status == "Hospitalized") & (api_diff.report_date > "2020-08-02")].copy()
delay = delay.drop(columns = [col for col in delay.columns if col.startswith("Unnamed")] + ["rowhash"])
# rehash rows on the identifying fields so records can be matched across API versions
delay["newhash"] = delay[[
    "patient_number", "date_announced", "detected_district", "detected_state",
    "current_status", "status_change_date", "num_cases"
]].apply(lambda x: hash(tuple(x)), axis = 1)
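# dedup sketch (hedged): `newhash` lets rows be matched across API file versions,
# so duplicate records can be dropped before computing reporting delays
# delay = delay.drop_duplicates(subset = "newhash", keep = "first")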
from scipy.fft import fft
from scipy.signal.windows import blackman

def plot_spectrum(ts, label):
    # (function head reconstructed as an assumption: windowed FFT of a daily
    #  series to surface day-of-week periodicity; T = sampling interval in days)
    y = ts.values
    N = len(y)
    T = 1.0
    w = blackman(N)
    ywf = fft(y * w)
    xf = np.linspace(0.0, 1.0 / (2.0 * T), N // 2)
    plt.plot(xf[1:N // 2], 2 / N * np.abs(ywf[1:N // 2]), ".", alpha = 0.7, label = label)

root = cwd()
data = root / "data"
figs = root / "figs"

download_data(data, 'state_wise_daily.csv')
state_df = load_statewise_data(data / "state_wise_daily.csv")
natl_time_series = get_time_series(state_df)
time_series = get_time_series(state_df, 'state')

# is there chunking in reporting?
print("checking average infection differentials...")
time_series["delta_I"] = time_series.groupby(level = 0)['Hospitalized'].diff()
time_series["dow"] = time_series.index.get_level_values(1).dayofweek
plot_average_change(time_series, "(All India)", filename = figs / "avg_delta_I_DoW_India.png")
for state in tqdm(time_series.index.get_level_values(0).unique()):
    plot_average_change(time_series.loc[state], f"({state})", filename = figs / f"avg_delta_I_DoW_{state}.png")

# are anomalies falling on certain days?
# public data
paths = {
    "v3": [data_path(_) for _ in (1, 2)],
    "v4": [data_path(_) for _ in range(3, 13)]
}
for target in paths['v3'] + paths['v4']:
    download_data(data, target)
dfn = load_all_data(
    v3_paths = [data/filepath for filepath in paths['v3']],
    v4_paths = [data/filepath for filepath in paths['v4']]
)
state_ts = get_time_series(dfn, "detected_state").loc["Bihar"]
district_names, population_counts, _ = etl.district_migration_matrix(data/"Migration Matrix - District.csv")
populations = dict(zip(district_names, population_counts))

# first, look at state level predictions
(dates_public, RR_pred_public, RR_CI_upper_public, RR_CI_lower_public,
 T_pred_public, T_CI_upper_public, T_CI_lower_public, total_cases_public,
 new_cases_ts_public, anomalies_public, anomaly_dates_public) = analytical_MPVS(
    state_ts.Hospitalized, CI = CI, smoothing = convolution(window = smoothing))
plt.plot(dates_public, RR_pred_public, label = "Estimated $R_t$", color = "midnightblue")
plt.fill_between(dates_public, RR_CI_lower_public, RR_CI_upper_public, label = f"{100*CI}% CI", color = "midnightblue", alpha = 0.3)
plt.legend(["private data estimate", "public data estimate"])
plt.show()

np.random.seed(33)
# seed the model from the latest public-data estimates
Bihar = Model([ModelUnit("Bihar", 99_000_000, I0 = T_pred_public[-1], RR0 = RR_pred_public[-1], mobility = 0)])
Bihar.run(14, np.zeros((1, 1)))
# cutoff = None
# cutoff = "April 7, 2021"
cutoff = "April 14, 2021"
if cutoff:
    df = df[df.date_announced <= cutoff]

data_recency = str(df["date_announced"].max()).split()[0]
run_date     = str(pd.Timestamp.now()).split()[0]

ts = get_time_series(
        df[df.detected_state == "Tamil Nadu"],
        ["detected_state", "detected_district"])\
    .drop(columns = ["date", "time", "delta", "logdelta"])\
    .rename(columns = {"Deceased": "dD", "Hospitalized": "dT", "Recovered": "dR"})\
    .droplevel(0)\
    .drop(labels = ["Other State", "Railway Quarantine", "Airport Quarantine"])

district_estimates = []

simulation_initial_conditions = pd.read_csv(data/f"all_india_coalesced_initial_conditions{simulation_start.strftime('%b%d')}.csv")\
    .drop(columns = ["Unnamed: 0"])\
    .set_index(["state", "district"])\
    .loc["Tamil Nadu"]

def setup(district) -> Tuple[Callable[[str], SIR], pd.DataFrame]:
sero["hr"] = sero.hom_region.map(hom_regions_numeric) # pull down COVID 19 India data paths = { "v3": [data_path(i) for i in (1, 2)], "v4": [data_path(i) for i in range(3, 19)] } # for target in paths['v3'] + paths['v4']: # download_data(data, target) df = load_all_data(v3_paths = [data/filepath for filepath in paths['v3']], v4_paths = [data/filepath for filepath in paths['v4']])\ .query("detected_state == 'Karnataka'") # get all deaths in KA on Aug 29 by district get_time_series(df, "detected_district")\ .query("status_change_date <= 'Aug 29, 2020'", engine = "python")\ .Deceased.sum(level = 0)\ .drop("Other State")\ .astype(int)\ .to_csv(data/"ka_cumulative_deaths_aug29.csv") # aggregate time series by hom_region df["detected_region"] = df.detected_district.map(hom_regions_rev) ka_ts = get_time_series(df.dropna(subset=["detected_region"]), "detected_region").rename(columns={ "Deceased": "dD", "Hospitalized": "dT", "Recovered": "dR" }).unstack(1).fillna(0).stack() cols = ["dD", "dT", "dR"] ka_ts_all = pd.concat([ka_ts, ka_ts[cols].cumsum().rename(columns = {col: col[1:] for col in cols})], axis = 1)\ .drop(columns = ["date", "time", "delta", "logdelta"])\