'start_date': days_ago(0), } data_ingestion_dag = DAG( 'covid_tracking_project_ingestion_dag', default_args=default_args, schedule_interval='@daily', # Run once a day at midnight description='Ingestion configuration for Covid Tracking Project') # Ingest to GCS ctp_gcs_task_id = 'covid_tracking_project_to_gcs' ctp_gcs_payload = util.generate_gcs_payload(_CTP_WORKFLOW_ID, filename=_CTP_GCS_FILENAME, url=_CTP_DOWNLOAD_URL) ctp_gcs_operator = util.create_gcs_ingest_operator(ctp_gcs_task_id, ctp_gcs_payload, data_ingestion_dag) ctp_gcs_short_op = util.create_gcs_short_circuit_operator( 'did_ctp_files_download', ctp_gcs_task_id, data_ingestion_dag) # Standardize and write to BQ ctp_bq_payload = util.generate_bq_payload(_CTP_WORKFLOW_ID, _CTP_DATASET, filename=_CTP_GCS_FILENAME) ctp_bq_op = util.create_bq_ingest_operator('ctp_standardize', ctp_bq_payload, data_ingestion_dag) # Covid Tracking Project Ingestion DAG # TODO(jenniebrown): Add the rest of the steps (ctp_gcs_operator >> ctp_gcs_short_op >> ctp_bq_op)
# Top-level ingestion pipeline DAG. The daily schedule is intentionally
# disabled for now; see the linked issue before re-enabling it.
data_ingestion_dag = DAG(
    'data_ingestion_dag',
    default_args=default_args,
    # TODO(https://github.com/SatcherInstitute/health-equity-tracker/issues/30)
    # schedule_interval='@daily',  # Run once a day at midnight
    description='The data ingestion pipeline.')

# CDC Covid Deaths: download the source file to GCS.
cdc_covid_deaths_task_id = 'cdc_covid_deaths_to_gcs'
cdc_covid_deaths_gcs_payload = util.generate_gcs_payload(
    _CDC_WORKFLOW_ID,
    filename=_CDC_GCS_FILENAME,
    url=_CDC_COVID_DEATHS_DOWNLOAD_URL)
cdc_covid_deaths_gcs_operator = util.create_gcs_ingest_operator(
    cdc_covid_deaths_task_id,
    cdc_covid_deaths_gcs_payload,
    data_ingestion_dag)
# Stop the run early when no new file landed in GCS.
cdc_covid_deaths_gcs_short_op = util.create_gcs_short_circuit_operator(
    'did_cdc_covid_deaths_gcs_file_download',
    cdc_covid_deaths_task_id,
    data_ingestion_dag)

# Standardize the GCS file and load it into BigQuery.
cdc_covid_deaths_bq_payload = util.generate_bq_payload(
    _CDC_WORKFLOW_ID, _CDC_DATASET_NAME, filename=_CDC_GCS_FILENAME)
cdc_covid_deaths_bq_operator = util.create_bq_ingest_operator(
    'cdc_covid_deaths_to_bq',
    cdc_covid_deaths_bq_payload,
    data_ingestion_dag)

# Export the standardized dataset for downstream consumers.
cdc_covid_deaths_exporter_payload = {'dataset_name': _CDC_DATASET_NAME}
cdc_covid_deaths_exporter_operator = util.create_exporter_operator(
    'cdc_covid_deaths_exporter',
    cdc_covid_deaths_exporter_payload,
    data_ingestion_dag)

# Ingestion DAG
(cdc_covid_deaths_gcs_operator >>
 cdc_covid_deaths_gcs_short_op >>
 cdc_covid_deaths_bq_operator >>
 cdc_covid_deaths_exporter_operator)
'acs_hhi_ingestion_dag', default_args=default_args, schedule_interval='@yearly', description='Ingestion configuration for ACS Household Income') acs_hhi_gcs_task_id = 'acs_hhi_to_gcs' acs_hhi_gcs_payload = util.generate_gcs_payload(_ACS_WORKFLOW_ID, url=_ACS_BASE_URL) acs_hhi_gcs_operator = util.create_gcs_ingest_operator(acs_hhi_gcs_task_id, acs_hhi_gcs_payload, data_ingestion_dag) acs_hhi_bq_payload = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, url=_ACS_BASE_URL) acs_hhi_bq_operator = util.create_bq_ingest_operator('acs_hhi_to_bq', acs_hhi_bq_payload, data_ingestion_dag) acs_hhi_aggregator_payload = {'dataset_name': _ACS_DATASET_NAME} acs_hhi_aggregator_operator = util.create_aggregator_operator( 'acs_hhi_aggregator', acs_hhi_aggregator_payload, data_ingestion_dag) acs_hhi_exporter_payload = {'dataset_name': _ACS_DATASET_NAME} acs_hhi_exporter_operator = util.create_exporter_operator( 'acs_hhi_exporter', acs_hhi_exporter_payload, data_ingestion_dag) # Ingestion DAG (acs_hhi_gcs_operator >> acs_hhi_bq_operator >> acs_hhi_aggregator_operator >> acs_hhi_exporter_operator)
default_args=default_args, schedule_interval="@yearly", description="Ingestion configuration for ACS Health Insurance", ) acs_hi_gcs_task_id = "acs_health_insurance_to_gcs" acs_hi_gcs_payload = util.generate_gcs_payload(_ACS_WORKFLOW_ID, url=_ACS_BASE_URL) acs_hi_gcs_operator = util.create_gcs_ingest_operator(acs_hi_gcs_task_id, acs_hi_gcs_payload, data_ingestion_dag) acs_hi_bq_payload = util.generate_bq_payload(_ACS_WORKFLOW_ID, _ACS_DATASET_NAME, url=_ACS_BASE_URL) acs_hi_bq_operator = util.create_bq_ingest_operator( "acs_health_insurance_to_bq", acs_hi_bq_payload, data_ingestion_dag) acs_hi_aggregator_payload = {"dataset_name": _ACS_DATASET_NAME} acs_hi_aggregator_operator = util.create_aggregator_operator( "acs_health_insurance_aggregator", acs_hi_aggregator_payload, data_ingestion_dag) acs_hi_exporter_payload = {"dataset_name": _ACS_DATASET_NAME} acs_hi_exporter_operator = util.create_exporter_operator( "acs_health_insurance_exporter", acs_hi_exporter_payload, data_ingestion_dag) # Ingestion DAG (acs_hi_gcs_operator >> acs_hi_bq_operator >> acs_hi_aggregator_operator >> acs_hi_exporter_operator)
_CDC_RESTRICTED_DATASET = 'cdc_restricted_data'

default_args = {'start_date': days_ago(0)}

# NOTE(review): no schedule_interval is set here — presumably this DAG is
# meant to be triggered after manual uploads; confirm the intended schedule.
data_ingestion_dag = DAG(
    'cdc_restricted_data_dag',
    default_args=default_args,
    description='Ingestion configuration for CDC Restricted Data')

# Standardize the CDC restricted data. The source files live in the
# manual-uploads bucket (configured via an Airflow Variable), not a
# pipeline-managed bucket.
cdc_bq_payload = util.generate_bq_payload(
    _CDC_RESTRICTED_WORKFLOW_ID,
    _CDC_RESTRICTED_DATASET,
    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
    filename=_CDC_RESTRICTED_GCS_FILENAMES)
cdc_restricted_bq_op = util.create_bq_ingest_operator(
    'cdc_restricted_gcs_to_bq', cdc_bq_payload, data_ingestion_dag)

# Aggregate the BigQuery dataset.
cdc_restricted_aggregator_payload = {'dataset_name': _CDC_RESTRICTED_DATASET}
cdc_restricted_aggregator_operator = util.create_aggregator_operator(
    'cdc_restricted_aggregator',
    cdc_restricted_aggregator_payload,
    data_ingestion_dag)

# Export the aggregated dataset for downstream consumers.
cdc_restricted_exporter_payload = {'dataset_name': _CDC_RESTRICTED_DATASET}
cdc_restricted_exporter_operator = util.create_exporter_operator(
    'cdc_restricted_exporter',
    cdc_restricted_exporter_payload,
    data_ingestion_dag)

# CDC Restricted Data Ingestion DAG
cdc_restricted_bq_op >> cdc_restricted_aggregator_operator >> cdc_restricted_exporter_operator
'''Manual data ingestion DAG.'''
from util import create_bq_ingest_operator
# Ignore the Airflow module, it is installed in both our dev and prod environments
from airflow.models import Variable  # type: ignore
from airflow import DAG  # type: ignore
from airflow.utils.dates import days_ago  # type: ignore

default_args = {
    'start_date': days_ago(0),
}

# schedule_interval=None: this DAG only runs when triggered manually.
manual_ingestion_dag = DAG(
    'manual_ingestion_dag',
    default_args=default_args,
    schedule_interval=None,
    description='Triggering for manual uploads.')

# Manual Uploads
manual_uploads_payload = {
    'message': {
        'is_airflow_run': True,
        # Bucket holding hand-uploaded files, configured via an Airflow Variable.
        'gcs_bucket': Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
        'id': 'MANUAL_UPLOADS'
    }
}
manual_uploads_bq_operator = create_bq_ingest_operator(
    'manual_uploads_task', manual_uploads_payload, manual_ingestion_dag)
} data_ingestion_dag = DAG( 'acs_population_ingestion_dag', default_args=default_args, schedule_interval='@yearly', description='Ingestion configuration for ACS Population') acs_pop_gcs_task_id = 'acs_population_to_gcs' acs_pop_gcs_payload = util.generate_gcs_payload( _ACS_WORKFLOW_ID, url=_ACS_BASE_URL) acs_pop_gcs_operator = util.create_gcs_ingest_operator( acs_pop_gcs_task_id, acs_pop_gcs_payload, data_ingestion_dag) acs_pop_bq_payload = util.generate_bq_payload( _ACS_WORKFLOW_ID, _ACS_DATASET_NAME, url=_ACS_BASE_URL) acs_pop_bq_operator = util.create_bq_ingest_operator( 'acs_population_to_bq', acs_pop_bq_payload, data_ingestion_dag) acs_pop_aggregator_payload = {'dataset_name': _ACS_DATASET_NAME} acs_pop_aggregator_operator = util.create_aggregator_operator( 'acs_population_aggregator', acs_pop_aggregator_payload, data_ingestion_dag) acs_pop_exporter_payload = {'dataset_name': _ACS_DATASET_NAME} acs_pop_exporter_operator = util.create_exporter_operator( 'acs_population_exporter', acs_pop_exporter_payload, data_ingestion_dag) # Ingestion DAG (acs_pop_gcs_operator >> acs_pop_bq_operator >> acs_pop_aggregator_operator >> acs_pop_exporter_operator)
import util

_UHC_WORKFLOW_ID = 'UHC_DATA'
_UHC_DATASET_NAME = 'uhc_data'

default_args = {
    'start_date': days_ago(0),
}

# schedule_interval=None: this DAG only runs when triggered manually.
data_ingestion_dag = DAG(
    'uhc_ingestion_dag',
    default_args=default_args,
    schedule_interval=None,
    description='Ingestion configuration for UHC')

# Standardize the UHC source data and load it into BigQuery.
uhc_bq_payload = util.generate_bq_payload(_UHC_WORKFLOW_ID, _UHC_DATASET_NAME)
# Renamed from uhc_pop_bq_operator: the 'pop' infix was a copy-paste remnant
# from the ACS population DAG and misdescribed this task. The task_id
# ('uhc_to_bq') is unchanged, so scheduler state and callers are unaffected.
uhc_bq_operator = util.create_bq_ingest_operator(
    'uhc_to_bq', uhc_bq_payload, data_ingestion_dag)

# Aggregate the BigQuery dataset.
uhc_aggregator_payload = {'dataset_name': _UHC_DATASET_NAME}
uhc_aggregator_operator = util.create_aggregator_operator(
    'uhc_aggregator', uhc_aggregator_payload, data_ingestion_dag)

# Export the aggregated dataset for downstream consumers.
uhc_exporter_payload = {'dataset_name': _UHC_DATASET_NAME}
uhc_exporter_operator = util.create_exporter_operator(
    'uhc_exporter', uhc_exporter_payload, data_ingestion_dag)

# Ingestion DAG
uhc_bq_operator >> uhc_aggregator_operator >> uhc_exporter_operator