def load(self, df: pd.DataFrame, output_path: str) -> None:
    """Export `df` to `output_path`.

    Args:
        df: Data to export.
        output_path: Destination. A path starting with "s3://" is handed to
            `obj_to_s3` with `public=False`; any other path is written as a
            CSV file without the index column.
    """
    is_s3_target = output_path.startswith("s3://")
    if not is_s3_target:
        # Plain path: write a CSV directly.
        df.to_csv(output_path, index=False)
        return
    obj_to_s3(df, s3_path=output_path, public=False)
def create_subnational():
    """Build the subnational cases/deaths table and upload it as a zipped CSV.

    Confirmed and deaths frames are produced by `clean_global_subnational`
    and `clean_us_subnational`, outer-merged pairwise on the location/date
    keys, concatenated, sorted by those keys and reduced to the exported
    columns. Rows whose `total_cases` is not greater than zero are dropped
    before the upload through `obj_to_s3` with `public=True`.
    """
    merge_keys = ["location1", "location2", "location3", "date"]
    exported_columns = [
        "location1",
        "location2",
        "location3",
        "date",
        "total_cases",
        "new_cases",
        "new_cases_smoothed",
        "total_deaths",
        "new_deaths",
        "new_deaths_smoothed",
    ]

    global_cases = clean_global_subnational("confirmed")
    global_deaths = clean_global_subnational("deaths")
    us_cases = clean_us_subnational("confirmed")
    us_deaths = clean_us_subnational("deaths")

    df = pd.concat(
        [
            pd.merge(global_cases, global_deaths, on=merge_keys, how="outer"),
            pd.merge(us_cases, us_deaths, on=merge_keys, how="outer"),
        ]
    ).sort_values(merge_keys)[exported_columns]
    df = df[df.total_cases > 0]

    filename = "subnational_cases_deaths"
    compression = {"method": "zip", "archive_name": f"{filename}.csv"}
    # The destination must be an f-string: without the prefix the object was
    # uploaded under the literal key "{filename}.zip".
    obj_to_s3(
        df,
        s3_path=f"s3://covid-19/public/jhu/{filename}.zip",
        compression=compression,
        public=True,
    )
def create_latest(df):
    """Export the latest data point of each location as CSV, XLSX and JSON.

    Only rows dated within the last two weeks are considered. For every
    location the rows are forward-filled in date order and the final row,
    rounded to three decimals, is kept. The `date` column is exported as
    `last_updated_date`.
    """
    cutoff = str(date.today() - timedelta(weeks=2))
    recent = df[df.date >= cutoff]
    recent = recent.sort_values("date")

    # One single-row frame per location: its last row after forward-filling.
    rows = []
    for loc in set(recent.location):
        per_location = recent[recent.location == loc]
        rows.append(per_location.ffill().tail(1).round(3))

    latest = pd.concat(rows)
    latest = latest.sort_values("location")
    latest = latest.rename(columns={"date": "last_updated_date"})

    print("Writing latest version…")

    # CSV: written locally first, then uploaded.
    csv_path = os.path.join(DATA_DIR, "latest", "owid-covid-latest.csv")
    latest.to_csv(csv_path, index=False)
    S3().upload_to_s3(
        csv_path,
        "s3://covid-19/public/latest/owid-covid-latest.csv",
        public=True,
    )

    # XLSX: passed straight to obj_to_s3.
    obj_to_s3(
        latest,
        s3_path="s3://covid-19/public/latest/owid-covid-latest.xlsx",
        public=True,
    )

    # JSON: rows without an iso_code are skipped; records are keyed by iso_code.
    json_path = os.path.join(DATA_DIR, "latest", "owid-covid-latest.json")
    by_iso_code = latest.dropna(subset=["iso_code"]).set_index("iso_code")
    by_iso_code.to_json(json_path, orient="index")
    S3().upload_to_s3(
        json_path,
        "s3://covid-19/public/latest/owid-covid-latest.json",
        public=True,
    )
def _export_log_info(df_exec, t_sec_1, t_sec_2):
    """Store the timing logs of this run, replacing same-day entries.

    Nothing is exported unless `df_exec` has exactly one row per entry in
    `MODULES_NAME`.

    Args:
        df_exec: Per-module execution frame; its index is reset and `date`
            and `machine` columns are added before it is stored.
        t_sec_1: Value stored as `t_sec` in the overall timing report.
        t_sec_2: Value stored as `t_sec_retry` in the overall timing report.
    """
    if len(df_exec) != len(MODULES_NAME):
        return

    print("EXPORTING LOG DETAILS")
    details = system_details()
    date_now = localdate(force_today=True)
    machine = details["id"]
    # Rows are matched on the concatenation of date and machine id, so a
    # rerun on the same day and machine replaces its earlier rows.
    run_key = date_now + machine

    # Export timings per country
    df_exec = df_exec.reset_index().assign(date=date_now, machine=machine)
    df = obj_from_s3(LOG_GET_COUNTRIES)
    df = df[df.date + df.machine != run_key]
    df = pd.concat([df, df_exec])
    obj_to_s3(df, LOG_GET_COUNTRIES)

    # Export machine info
    # NOTE(review): the stored object is presumably a mapping of machine id
    # to machine info, as the membership test suggests; confirm.
    data = obj_from_s3(LOG_MACHINES)
    if machine not in data:
        # Extend the loaded registry. Previously this unpacked `details`,
        # dropping every machine already stored and adding the "id"/"info"
        # keys of `details` to the registry.
        data = {**data, machine: details["info"]}
        obj_to_s3(data, LOG_MACHINES)

    # Export overall timing
    report = {
        "machine": machine,
        "date": date_now,
        "t_sec": t_sec_1,
        "t_sec_retry": t_sec_2,
    }
    df_new = pd.DataFrame([report])
    df = obj_from_s3(LOG_GET_GLOBAL)
    df = df[df.date + df.machine != run_key]
    df = pd.concat([df, df_new])
    obj_to_s3(df, LOG_GET_GLOBAL)
def create_dataset(df, macro_variables):
    """Export the complete time series as CSV, XLSX and JSON.

    Args:
        df: Full dataset to export.
        macro_variables: Mapping whose keys are passed to `df_to_dict` when
            building the JSON payload.
    """
    # CSV: written under DATA_DIR, then uploaded.
    print("Writing to CSV…")
    csv_path = os.path.join(DATA_DIR, "owid-covid-data.csv")
    df.to_csv(csv_path, index=False)
    uploader = S3()
    uploader.upload_to_s3(
        csv_path,
        "s3://covid-19/public/owid-covid-data.csv",
        public=True,
    )

    # XLSX: passed straight to obj_to_s3, no local copy is written here.
    print("Writing to XLSX…")
    xlsx_target = "s3://covid-19/public/owid-covid-data.xlsx"
    obj_to_s3(df, s3_path=xlsx_target, public=True)

    # JSON: the frame is converted with df_to_dict before the upload.
    print("Writing to JSON…")
    macro_keys = macro_variables.keys()
    payload = df_to_dict(df, macro_keys, valid_json=True)
    obj_to_s3(payload, "s3://covid-19/public/owid-covid-data.json", public=True)
def main():
    """Read every entry in `countries` and store it as a CSV under PATH_ICE.

    Each entry is logged by its `location`, and the frame returned by its
    `read()` is passed to `obj_to_s3` under "<PATH_ICE>/<location>.csv".
    """
    for entry in countries:
        logger.info(f"VAX - ICE - {entry.location}")
        frame = entry.read()
        target = f"{PATH_ICE}/{entry.location}.csv"
        obj_to_s3(frame, target)