def load_national_timeseries(download: bool = False) -> pd.DataFrame:
    print(":: loading case timeseries data")
    if download:
        download_data(data, 'timeseries.json', "https://api.covid19india.org/v3/")
    with (data / 'timeseries.json').open("rb") as fp:
        df = flat_table.normalize(pd.read_json(fp)).fillna(0)
    # split the flattened "state.series.metric" column names into a column MultiIndex
    df.columns = df.columns.str.split('.', expand=True)
    dates = np.squeeze(df["index"][None].values)
    # index by date, move the (series, metric) levels into the rows, and drop the "UN" (unassigned) column
    return df.drop(columns="index", level=0).set_index(dates).stack([1, 2]).drop("UN", axis=1)
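# usage sketch (assumes the module-level `data` directory and the pd/np/flat_table
# imports used above are in scope): pass download=True once to refresh the cached
# data/timeseries.json, then reuse the local copy on subsequent runs
natl = load_national_timeseries(download = True)
print(natl.head())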
def get_state_timeseries(states = ["Tamil Nadu"], download: bool = False) -> pd.DataFrame:
    paths = {
        "v3": [data_path(i) for i in (1, 2)],
        "v4": [data_path(i) for i in range(3, 25)]
    }
    if download:
        for target in paths['v3'] + paths['v4']:
            download_data(data, target)
    return load_all_data(
            v3_paths = [data/filepath for filepath in paths['v3']],
            v4_paths = [data/filepath for filepath in paths['v4']])\
        .query("detected_state in @states" if states != "*" else "detected_state != 'NULL'", engine = "python")\
        .pipe(lambda _: get_time_series(_, ["detected_state", "detected_district"]))\
        .drop(columns = ["date", "time", "delta", "logdelta"])\
        .rename(columns = {
            "Deceased":     "dD",
            "Hospitalized": "dT",
            "Recovered":    "dR"
        })
def run_download(_):
    run_date = pd.Timestamp.now().strftime("%d-%m-%Y")
    print(f"Starting download of API files on {run_date}")

    # set up
    root = Path("/tmp")
    data = mkdir(root/"data")

    # download aggregated CSVs as well
    download_data(data, "states.csv")
    download_data(data, "districts.csv")

    print("Uploading time series to storage bucket.")
    bucket = storage.Client().bucket(bucket_name)
    bucket.blob("pipeline/raw/districts.csv")\
        .upload_from_filename(str(data/"districts.csv"), content_type = "text/csv")
    bucket.blob("pipeline/raw/states.csv")\
        .upload_from_filename(str(data/"states.csv"), content_type = "text/csv")
    return 'OK!'
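# `run_download` ignores its argument so it matches the signature of an
# HTTP-triggered Google Cloud Function; a hypothetical deployment (names and
# runtime illustrative, not part of this repo):
#   gcloud functions deploy run_download --runtime python38 --trigger-http --entry-point run_download
# locally, it can be exercised directly, assuming `bucket_name` and GCS credentials
# are configured:
#   run_download(None)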
def get_state_timeseries(
        states = "*",
        download: bool = False,
        aggregation_cols = ["detected_state", "detected_district"],
        last_API_file: int = 27) -> pd.DataFrame:
    """ load state- and district-level data, downloading source files if specified """
    paths = {
        "v3": [data_path(i) for i in (1, 2)],
        "v4": [data_path(i) for i in range(3, last_API_file)]
    }
    if download:
        for target in paths['v3'] + paths['v4']:
            download_data(data, target)
    return load_all_data(
            v3_paths = [data/filepath for filepath in paths['v3']],
            v4_paths = [data/filepath for filepath in paths['v4']])\
        .query("detected_state in @states" if states != "*" else "detected_state != 'NULL'")\
        .pipe(lambda _: get_time_series(_, aggregation_cols))\
        .drop(columns = ["date", "time", "delta", "logdelta"])\
        .rename(columns = {
            "Deceased":     "dD",
            "Hospitalized": "dT",
            "Recovered":    "dR"
        })
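# example calls (a sketch; assumes the module-level `data` directory is set up):
tn_timeseries  = get_state_timeseries(["Tamil Nadu"], download = True)            # one state, fresh download
all_timeseries = get_state_timeseries("*", aggregation_cols = ["detected_state"]) # all states, state-level aggregation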
import pandas as pd

from epimargin.etl.commons import download_data
from epimargin.etl.covid19india import data_path, get_time_series, load_all_data
from epimargin.utils import setup

data, _ = setup()

paths = {
    "v3": [data_path(i) for i in (1, 2)],
    "v4": [data_path(i) for i in range(3, 18)]
}

for target in paths['v3'] + paths['v4']:
    download_data(data, target)

df = load_all_data(
    v3_paths = [data / filepath for filepath in paths['v3']],
    v4_paths = [data / filepath for filepath in paths['v4']])

schema = { "Deceased": "dD", "Recovered": "dR", "Hospitalized": "dT" }

def assemble_time_series(df):
    ts = get_time_series(df)
    deltas = ts[schema.keys()]\
        .rename(columns = schema)
    # fill calendar gaps with zero-delta days
    deltas = deltas.reindex(pd.date_range(deltas.index.min(), deltas.index.max()), fill_value = 0)
    # append cumulative totals, renaming "dX" -> "X" via the second character of each delta column name
    merged = deltas.merge(
        deltas.cumsum(axis = 0).rename(columns = lambda _: _[1]),
        left_index = True, right_index = True).astype(int)
    return merged
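# usage sketch: build the merged daily/cumulative table and persist it
# (the output filename here is illustrative, not part of the pipeline)
timeseries = assemble_time_series(df)
timeseries.to_csv(data / "india_timeseries.csv")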
"India", stringency = stringency) plt.PlotDevice()\ .title("\nIndia: Mobility & Lockdown Trends")\ .annotate("Google Mobility Data (baseline mobility measured from Jan 3 - Feb 6, 2020) + Oxford COVID Policy Tracker") plt.show() # mobility vs cases from pathlib import Path import flat_table from epimargin.etl.commons import download_data data = Path("./data") download_data(data, 'timeseries.json', "https://api.covid19india.org/v3/") # data prep with (data/'timeseries.json').open("rb") as fp: df = flat_table.normalize(pd.read_json(fp)).fillna(0) df.columns = df.columns.str.split('.', expand = True) dates = np.squeeze(df["index"][None].values) df = df.drop(columns = "index").set_index(dates).stack([1, 2]).drop("UN", axis = 1) series = mobility[mobility.sub_region_1.isna()] plt.plot(series.date, smoothed(series.retail_and_recreation_percent_change_from_baseline), label = "Retail/Recreation") plt.fill_betweenx((-100, 60), pd.to_datetime("March 24, 2020"), pd.to_datetime("June 1, 2020"), color = "black", alpha = 0.05, zorder = -1) plt.text(s = "national lockdown", x = pd.to_datetime("April 27, 2020"), y = -20, fontdict = plt.note_font, ha = "center", va = "top") plt.ylim(-100, 10) plt.xlim(series.date.min(), series.date.max()) plt.legend(loc = 'upper right')
        smoothing = smooth,
        totals = True)
    return pd.DataFrame(data = {
        "date":        estimates[0],
        "Rt":          estimates[1],
        "Rt_upper":    estimates[2],
        "Rt_lower":    estimates[3],
        "total_cases": estimates[-4][2:],
        "new_cases":   estimates[-3],
    })

data, figs = setup()

download_data(data, 'timeseries.json', "https://api.covid19india.org/v3/")
download_data(data, 'state_wise.csv',  "https://api.covid19india.org/v3/")
download_data(data, 'states.csv',      "https://api.covid19india.org/v3/")
download_data(data, 'districts.csv',   "https://api.covid19india.org/v3/")

# data prep
with (data / 'timeseries.json').open("rb") as fp:
    df = flat_table.normalize(pd.read_json(fp)).fillna(0)
df.columns = df.columns.str.split('.', expand = True)
dates = np.squeeze(df["index"][None].values)
df = df.drop(columns = "index")\
    .set_index(dates)\
    .stack([1, 2])\
    .drop("UN", axis = 1)\
    .fillna(0)
    )\
    .fit()\
    .predict([1, julian_dates[-1] + period])[0]

# set to cloud temp directory if not explicitly told to run locally
root = cwd() if len(sys.argv) > 1 and sys.argv[1] == "--local" else Path("/tmp")
data = root / "data"

# model details
gamma     = 0.2
smoothing = 10
CI        = 0.95

download_data(data, 'state_wise_daily.csv')

state_df = load_statewise_data(data / "state_wise_daily.csv")
country_time_series = get_time_series(state_df)

estimates  = []
timeseries = []

# country level
(dates, RR_pred, RR_CI_upper, RR_CI_lower,
 T_pred, T_CI_upper, T_CI_lower,
 total_cases, new_cases_ts,
 anomalies, anomaly_dates) = analytical_MPVS(
    country_time_series["Hospitalized"].iloc[:-1],
    CI = CI,
    smoothing = notched_smoothing(window = smoothing))

country_code = state_name_lookup["India"]
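# a sketch of collecting the country-level results into the `estimates` list;
# the column names are illustrative, mirroring the tuple unpacked above:
estimates.append(pd.DataFrame({
    "date":        dates,
    "Rt_pred":     RR_pred,
    "Rt_CI_upper": RR_CI_upper,
    "Rt_CI_lower": RR_CI_lower,
}).assign(state = country_code))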
        top_level = json.load(fp)
    df = pd.DataFrame(
        [(_[date], _[total_cases]) for _ in top_level[timeseries]],
        columns = ["date", "total_cases"])
    df["date"] = (date_scale * df["date"]).apply(pd.Timestamp)
    df.set_index("date", inplace = True)
    if start_date:
        return df[df.index >= start_date]
    return df

(data, figs) = setup(level = "INFO")

for province in provinces:
    logger.info("downloading data for %s", province)
    download_data(data, filename(province), base_url = "https://data.covid19.go.id/public/api/")

province_cases = {
    province: load_province_timeseries(data, province, "Apr 1, 2020")
    for province in provinces
}
bgn = min(cases.index.min() for cases in province_cases.values())
end = max(cases.index.max() for cases in province_cases.values())
idx = pd.date_range(bgn, end)
province_cases = {
    province: cases.reindex(idx, method = "pad").fillna(0)
    for (province, cases) in province_cases.items()
}

prediction_period = 14 * days
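# the reindex(method = "pad") above forward-fills each province's cumulative counts
# across calendar gaps, then zero-fills dates before the series starts, e.g.:
#   s = pd.Series([1, 4], index = pd.to_datetime(["2020-04-01", "2020-04-04"]))
#   s.reindex(pd.date_range("2020-04-01", "2020-04-05"), method = "pad").fillna(0)
#   # -> 1, 1, 1, 4, 4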
def load_vax_data(download = False):
    if download:
        download_data(data, "vaccine_doses_statewise.csv")
    # transpose so rows are dates and columns are states; values are cumulative doses
    vax = pd.read_csv(data/"vaccine_doses_statewise.csv").set_index("State").T
    vax.columns = vax.columns.str.title()
    return vax.set_index(pd.to_datetime(vax.index, format = "%d/%m/%Y"))
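# usage sketch: convert cumulative doses to daily administered doses
# (the clip guards against occasional downward revisions in the source data)
vax = load_vax_data(download = True)
daily_doses = vax.diff().clip(lower = 0)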