import pathlib

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

import dagmod

# where the data comes from and where it is stored
YY_URL: str = (
    "https://raw.githubusercontent.com/youyanggu/"
    "covid19_projections/master/projections/combined/latest_subregion.csv"
)
PATH0: pathlib.PosixPath = pathlib.PosixPath(
    "../../extern/data/epidemiological/us/forecasts/YYG/county"
)
WFORMAT: str = ".csv"
FILENAME: str = "YYG_county_us_casesdeathsprojR"

# destination file: PATH0 / (FILENAME + WFORMAT)
path: pathlib.PosixPath = PATH0 / (FILENAME + WFORMAT)

# create the dag and its two tasks
dag: DAG = dagmod.create_dag("YYG2datapull", "daily YYG2 data pull")
date_task: BashOperator = dagmod.get_date_op(dag)
pull_task: PythonOperator = dagmod.get_pull_op(
    "YYG2countypull",
    dagmod.rw_all,
    [WFORMAT, [path], [YY_URL]],
    dag,
)

# the date task runs upstream of the pull task
date_task >> pull_task
AGGS: List[str] = ["states", "counties"]
INTERVENTIONS: List[str] = [
    "NO_INTERVENTION",
    "WEAK_INTERVENTION",
    "STRONG_INTERVENTION",
    "OBSERVED_INTERVENTION",
]

# build filepaths and urls for all pairs of agg. and intervention
# NOTE: urls are kept as plain strings. Wrapping a url in Path collapses
# repeated slashes (a scheme separator such as "https://" becomes "https:/")
# and turns the address into a filesystem path object.
paths: List[Path] = []
urls: List[str] = []
for agg in AGGS:
    for intervention in INTERVENTIONS:
        # remote file for this aggregation level / intervention pair
        url: str = URL0 + agg + "." + intervention + URL1
        urls.append(url)

        # local file the pair is written to
        filename: str = "can_" + agg + "_" + intervention + WFORMAT
        path: Path = PATH0.joinpath(filename)
        paths.append(path)

# define operators and dag
dag: DAG = dagmod.create_dag("CANdatapull", "Covid act now data pull")
date_task: BashOperator = dagmod.get_date_op(dag)
pull_task: PythonOperator = dagmod.get_pull_op(
    "CANdatapull",
    dagmod.rw_all,
    [WFORMAT, paths, urls],
    dag,
)

# the date task runs upstream of the pull task
date_task >> pull_task
"""DAG for pulling the UW state policy data.""" import pathlib from airflow import DAG from airflow.operators.bash_operator import BashOperator from airflow.operators.python_operator import PythonOperator import dagmod # constants UWP_URL: str = "https://query.data.world/s/vfpfdftmwmk3qj7fbobnrpas5kxuj2" PATH0: pathlib.PosixPath = pathlib.PosixPath( "../../extern/data/epidemiological/us/policy") WFORMAT: str = ".csv" FILENAME: str = "UW_state_us_policy" # build path path: pathlib.PosixPath = PATH0.joinpath(FILENAME + WFORMAT) # define operators and dag dag: DAG = dagmod.create_dag("UWPdatapull", "daily UW policy data pull") date_task: BashOperator = dagmod.get_date_op(dag) pull_task: PythonOperator = dagmod.get_pull_op("UWpolicypull", dagmod.rw_all, [WFORMAT, [path], [UWP_URL]], dag=dag) date_task >> pull_task
"G0Zg3wlgJpB2Zvg-vEN1i_76n2I-djL0Dk/export?format=csv&id") PATH0: pathlib.PosixPath = pathlib.PosixPath( "../../extern/data/epidemiological/us/forecasts/Yu/county") FILENAME: str = "YU_county_us_deathsproj" def rw_all(format: str = ".csv"): """Grab, read, and write data.""" # grab csv from url and convert to dataframe df: pd.DataFrame = pd.DataFrame(index=[], columns=[]) try: df = pd.read_csv(YU_URL, header=2) except Exception as e: dagmod.bad_url(YU_URL, e) # set path and write dataframe in desired format path: pathlib.PosixPath = PATH0.joinpath(FILENAME + format) dagmod.rw(format, path, df) # define operators and dag dag: DAG = dagmod.create_dag("YUdatapull", "daily Yu Group data pull") date_task: BashOperator = dagmod.get_date_op(dag) pull_task: PythonOperator = dagmod.get_pull_op("YUcountypull", rw_all, [], dag=dag) date_task >> pull_task
"""DAG for pulling WHO Covid data.""" import pathlib from airflow import DAG from airflow.operators.bash_operator import BashOperator from airflow.operators.python_operator import PythonOperator import dagmod # constants WHO_URL: str = "https://covid19.who.int/WHO-COVID-19-global-data.csv" PATH0: pathlib.PosixPath = pathlib.PosixPath( "../../extern/data/epidemiological/global" ) WFORMAT: str = ".csv" FILENAME: str = "WHO-COVID-19-global-data" # build path path: pathlib.PosixPath = PATH0.joinpath(FILENAME + WFORMAT) # define operators and dag dag: DAG = dagmod.create_dag("WHOdatapull", "daily WHO data pull") date_task: BashOperator = dagmod.get_date_op(dag) pull_task: PythonOperator = dagmod.get_pull_op( "WHOdatapull", dagmod.rw_all, [WFORMAT, [path], [WHO_URL]], dag ) date_task >> pull_task
# build paths for writing paths: List[str] = build_paths("state", state) # write data paths = [path + format for path in paths] if format == ".csv": cases.to_csv(paths[0], mode="w") deaths.to_csv(paths[1], mode="w") hospital.to_csv(paths[2], mode="w") else: # only other format is .h5 cases.to_hdf(paths[0], key="df", mode="w") deaths.to_hdf(paths[1], key="df", mode="w") hospital.to_hdf(paths[2], key="df", mode="w") def pull_mit(): """Callable for python op.""" rw_cases_deaths_hosp(URL) # define operators and dag dag: DAG = dagmod.create_dag("MITdatapull", "daily MIT data pull (states)") pull_states_task: PythonOperator = dagmod.get_pull_op( "MITstatespull", pull_mit, [], dag ) date_task: BashOperator = dagmod.get_date_op(dag) date_task >> pull_states_task
"deaths_by_age_county", "datadict", ] PATH0: pathlib.Path = pathlib.Path("../../extern/data/epidemiological/WA") def rw_all(format: str = ".csv"): """Grab, read, and write data.""" for i in range(4): # grab csv from url and convert to dataframe df: pd.DataFrame = pd.DataFrame(index=[], columns=[]) try: df = pd.read_excel(URL, sheet_name=i) except Exception as e: dagmod.bad_url(URL, e) # set path and write dataframe in desired format path: pathlib.Path = PATH0.joinpath(NAMES[i] + format) dagmod.rw(format, path, df) # define operators and dag dag: DAG = dagmod.create_dag("WADHdatapull", "weekly WADH data pull") date_task: BashOperator = dagmod.get_date_op(dag) pull_task: PythonOperator = dagmod.get_pull_op("WADHdatapull", rw_all, [], dag) date_task >> pull_task
# write each csv to /IHME dir in the desired format for member in z.namelist(): # skip if member file is not a csv if ".csv" not in member: continue with z.open(member) as file: # read in data df = pd.DataFrame() try: df = pd.read_csv(file) except Exception as e: print("Exception" + str(e) + " with file " + str(member)) # build path and write file in desired format s: slice = slice(start=member.index("/") + 1, stop=-1) path: Path = PATH0.joinpath(member[s]) dagmod.rw(format, path, df) dag: DAG = dagmod.create_dag("IHMEdatapull", "daily IHME data pull") date_task: BashOperator = dagmod.get_date_op(dag) pull_task: PythonOperator = dagmod.get_pull_op("IHMEallpull", rw_zip, [], dag) date_task >> pull_task
def pull_us(): """Pull country data.""" # prepare data url and filepaths url: str = YY_URL + "/US.csv" paths: List[str] = build_paths(agg="country", loc="US") # grab data and write to files rw_cases_deaths_R(url, paths, format=".csv") def pull_states(): """Pull all states data.""" # states data for state in STATES: pull_state(state) # define operators and dag dag: DAG = dagmod.create_dag("YYGdatapull", "daily YYG data pull (states + us)") pull_states_task: PythonOperator = dagmod.get_pull_op("YYGstatespull", pull_states, [], dag) pull_us_task: PythonOperator = dagmod.get_pull_op("YYGuspull", pull_us, [], dag) date_task: BashOperator = dagmod.get_date_op(dag) date_task >> pull_us_task >> pull_states_task