Ejemplo n.º 1
0
    'start_date': days_ago(0),
}

# DAG covering ingestion of Covid Tracking Project data.
data_ingestion_dag = DAG(
    'covid_tracking_project_ingestion_dag',
    default_args=default_args,
    schedule_interval='@daily',  # Run once a day at midnight
    description='Ingestion configuration for Covid Tracking Project')

# Step 1: download the raw CTP file into GCS.
ctp_gcs_task_id = 'covid_tracking_project_to_gcs'
ctp_gcs_payload = util.generate_gcs_payload(
    _CTP_WORKFLOW_ID, filename=_CTP_GCS_FILENAME, url=_CTP_DOWNLOAD_URL)
ctp_gcs_operator = util.create_gcs_ingest_operator(
    ctp_gcs_task_id, ctp_gcs_payload, data_ingestion_dag)
# Short-circuit the downstream tasks when the GCS download produced nothing.
ctp_gcs_short_op = util.create_gcs_short_circuit_operator(
    'did_ctp_files_download', ctp_gcs_task_id, data_ingestion_dag)

# Step 2: standardize the downloaded file and write it to BigQuery.
ctp_bq_payload = util.generate_bq_payload(
    _CTP_WORKFLOW_ID, _CTP_DATASET, filename=_CTP_GCS_FILENAME)
ctp_bq_op = util.create_bq_ingest_operator(
    'ctp_standardize', ctp_bq_payload, data_ingestion_dag)

# Task ordering: GCS download -> download check -> BQ standardize.
# TODO(jenniebrown): Add the rest of the steps
ctp_gcs_operator >> ctp_gcs_short_op >> ctp_bq_op
Ejemplo n.º 2
0
# Top-level ingestion DAG; unscheduled for now (see TODO below).
data_ingestion_dag = DAG(
    'data_ingestion_dag',
    default_args=default_args,
    # TODO(https://github.com/SatcherInstitute/health-equity-tracker/issues/30)
    # schedule_interval='@daily',  # Run once a day at midnight
    description='The data ingestion pipeline.')

# CDC Covid Deaths pipeline: download -> check -> load to BQ -> export.
cdc_covid_deaths_task_id = 'cdc_covid_deaths_to_gcs'
cdc_covid_deaths_gcs_payload = util.generate_gcs_payload(
    _CDC_WORKFLOW_ID, filename=_CDC_GCS_FILENAME,
    url=_CDC_COVID_DEATHS_DOWNLOAD_URL)
cdc_covid_deaths_gcs_operator = util.create_gcs_ingest_operator(
    cdc_covid_deaths_task_id,
    cdc_covid_deaths_gcs_payload,
    data_ingestion_dag)
# Skip the rest of the pipeline when no new file was downloaded to GCS.
cdc_covid_deaths_gcs_short_op = util.create_gcs_short_circuit_operator(
    'did_cdc_covid_deaths_gcs_file_download',
    cdc_covid_deaths_task_id,
    data_ingestion_dag)
cdc_covid_deaths_bq_payload = util.generate_bq_payload(
    _CDC_WORKFLOW_ID,
    _CDC_DATASET_NAME,
    filename=_CDC_GCS_FILENAME)
cdc_covid_deaths_bq_operator = util.create_bq_ingest_operator(
    'cdc_covid_deaths_to_bq',
    cdc_covid_deaths_bq_payload,
    data_ingestion_dag)
cdc_covid_deaths_exporter_payload = {'dataset_name': _CDC_DATASET_NAME}
cdc_covid_deaths_exporter_operator = util.create_exporter_operator(
    'cdc_covid_deaths_exporter',
    cdc_covid_deaths_exporter_payload,
    data_ingestion_dag)

# Task ordering for the ingestion DAG.
(cdc_covid_deaths_gcs_operator >> cdc_covid_deaths_gcs_short_op >>
 cdc_covid_deaths_bq_operator >> cdc_covid_deaths_exporter_operator)
Ejemplo n.º 3
0
    'acs_hhi_ingestion_dag',
    default_args=default_args,
    schedule_interval='@yearly',
    description='Ingestion configuration for ACS Household Income')

# Step 1: download the ACS household income source data into GCS.
acs_hhi_gcs_task_id = 'acs_hhi_to_gcs'
acs_hhi_gcs_payload = util.generate_gcs_payload(
    _ACS_WORKFLOW_ID, url=_ACS_BASE_URL)
acs_hhi_gcs_operator = util.create_gcs_ingest_operator(
    acs_hhi_gcs_task_id, acs_hhi_gcs_payload, data_ingestion_dag)

# Step 2: standardize the data and write it to BigQuery.
acs_hhi_bq_payload = util.generate_bq_payload(
    _ACS_WORKFLOW_ID, _ACS_DATASET_NAME, url=_ACS_BASE_URL)
acs_hhi_bq_operator = util.create_bq_ingest_operator(
    'acs_hhi_to_bq', acs_hhi_bq_payload, data_ingestion_dag)

# Step 3: run the aggregator over the BigQuery dataset.
acs_hhi_aggregator_payload = {'dataset_name': _ACS_DATASET_NAME}
acs_hhi_aggregator_operator = util.create_aggregator_operator(
    'acs_hhi_aggregator', acs_hhi_aggregator_payload, data_ingestion_dag)

# Step 4: export the resulting dataset.
acs_hhi_exporter_payload = {'dataset_name': _ACS_DATASET_NAME}
acs_hhi_exporter_operator = util.create_exporter_operator(
    'acs_hhi_exporter', acs_hhi_exporter_payload, data_ingestion_dag)

# Task ordering: GCS -> BQ -> aggregate -> export.
(acs_hhi_gcs_operator >> acs_hhi_bq_operator >> acs_hhi_aggregator_operator >>
 acs_hhi_exporter_operator)
    default_args=default_args,
    schedule_interval="@yearly",
    description="Ingestion configuration for ACS Health Insurance",
)

# Step 1: download the ACS health insurance source data into GCS.
acs_hi_gcs_task_id = "acs_health_insurance_to_gcs"
acs_hi_gcs_payload = util.generate_gcs_payload(
    _ACS_WORKFLOW_ID, url=_ACS_BASE_URL)
acs_hi_gcs_operator = util.create_gcs_ingest_operator(
    acs_hi_gcs_task_id, acs_hi_gcs_payload, data_ingestion_dag)

# Step 2: standardize the data and write it to BigQuery.
acs_hi_bq_payload = util.generate_bq_payload(
    _ACS_WORKFLOW_ID, _ACS_DATASET_NAME, url=_ACS_BASE_URL)
acs_hi_bq_operator = util.create_bq_ingest_operator(
    "acs_health_insurance_to_bq", acs_hi_bq_payload, data_ingestion_dag)

# Step 3: run the aggregator over the BigQuery dataset.
acs_hi_aggregator_payload = {"dataset_name": _ACS_DATASET_NAME}
acs_hi_aggregator_operator = util.create_aggregator_operator(
    "acs_health_insurance_aggregator",
    acs_hi_aggregator_payload,
    data_ingestion_dag)

# Step 4: export the resulting dataset.
acs_hi_exporter_payload = {"dataset_name": _ACS_DATASET_NAME}
acs_hi_exporter_operator = util.create_exporter_operator(
    "acs_health_insurance_exporter",
    acs_hi_exporter_payload,
    data_ingestion_dag)

# Task ordering: GCS -> BQ -> aggregate -> export.
(acs_hi_gcs_operator >> acs_hi_bq_operator >> acs_hi_aggregator_operator >>
 acs_hi_exporter_operator)
Ejemplo n.º 5
0
# BigQuery dataset that holds the CDC restricted data.
_CDC_RESTRICTED_DATASET = 'cdc_restricted_data'

default_args = {
    'start_date': days_ago(0),
}

# DAG for CDC restricted data; no schedule_interval is set, so it uses the
# Airflow default unless triggered explicitly.
data_ingestion_dag = DAG(
    'cdc_restricted_data_dag',
    default_args=default_args,
    description='Ingestion configuration for CDC Restricted Data')

# Step 1: standardize the manually uploaded CDC restricted files into BQ.
# The source bucket comes from the GCS_MANUAL_UPLOADS_BUCKET Airflow variable.
cdc_bq_payload = util.generate_bq_payload(
    _CDC_RESTRICTED_WORKFLOW_ID, _CDC_RESTRICTED_DATASET,
    gcs_bucket=Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
    filename=_CDC_RESTRICTED_GCS_FILENAMES)
cdc_restricted_bq_op = util.create_bq_ingest_operator(
    'cdc_restricted_gcs_to_bq', cdc_bq_payload, data_ingestion_dag)

# Step 2: aggregate the standardized dataset.
cdc_restricted_aggregator_payload = {'dataset_name': _CDC_RESTRICTED_DATASET}
cdc_restricted_aggregator_operator = util.create_aggregator_operator(
    'cdc_restricted_aggregator',
    cdc_restricted_aggregator_payload,
    data_ingestion_dag)

# Step 3: export the aggregated dataset.
cdc_restricted_exporter_payload = {'dataset_name': _CDC_RESTRICTED_DATASET}
cdc_restricted_exporter_operator = util.create_exporter_operator(
    'cdc_restricted_exporter',
    cdc_restricted_exporter_payload,
    data_ingestion_dag)

# Task ordering: BQ load -> aggregate -> export.
(cdc_restricted_bq_op >> cdc_restricted_aggregator_operator >>
 cdc_restricted_exporter_operator)
'''Airflow DAG that ingests manually uploaded files into BigQuery.'''
# Ignore the Airflow imports for type checking; the package is installed in
# both our dev and prod environments but not necessarily locally.
from airflow import DAG  # type: ignore
from airflow.models import Variable  # type: ignore
from airflow.utils.dates import days_ago  # type: ignore

from util import create_bq_ingest_operator

default_args = {
    'start_date': days_ago(0),
}

# schedule_interval=None: this DAG only runs when triggered by hand.
manual_ingestion_dag = DAG(
    'manual_ingestion_dag',
    default_args=default_args,
    schedule_interval=None,
    description='Triggering for manual uploads.')

# Manual uploads: the payload points the BQ ingest service at the bucket
# named by the GCS_MANUAL_UPLOADS_BUCKET Airflow variable.
manual_uploads_payload = {
    'message': {
        'is_airflow_run': True,
        'gcs_bucket': Variable.get('GCS_MANUAL_UPLOADS_BUCKET'),
        'id': 'MANUAL_UPLOADS',
    }
}
manual_uploads_bq_operator = create_bq_ingest_operator(
    'manual_uploads_task', manual_uploads_payload, manual_ingestion_dag)
}

# DAG covering ingestion of ACS population data, refreshed yearly.
data_ingestion_dag = DAG(
    'acs_population_ingestion_dag',
    default_args=default_args,
    schedule_interval='@yearly',
    description='Ingestion configuration for ACS Population')

# Step 1: download the ACS population source data into GCS.
acs_pop_gcs_task_id = 'acs_population_to_gcs'
acs_pop_gcs_payload = util.generate_gcs_payload(_ACS_WORKFLOW_ID,
                                                url=_ACS_BASE_URL)
acs_pop_gcs_operator = util.create_gcs_ingest_operator(acs_pop_gcs_task_id,
                                                       acs_pop_gcs_payload,
                                                       data_ingestion_dag)

# Step 2: standardize the data and write it to BigQuery.
acs_pop_bq_payload = util.generate_bq_payload(_ACS_WORKFLOW_ID,
                                              _ACS_DATASET_NAME,
                                              url=_ACS_BASE_URL)
acs_pop_bq_operator = util.create_bq_ingest_operator('acs_population_to_bq',
                                                     acs_pop_bq_payload,
                                                     data_ingestion_dag)

# Step 3: run the aggregator over the BigQuery dataset.
acs_pop_aggregator_payload = {'dataset_name': _ACS_DATASET_NAME}
acs_pop_aggregator_operator = util.create_aggregator_operator(
    'acs_population_aggregator', acs_pop_aggregator_payload,
    data_ingestion_dag)

# Step 4: export the resulting dataset.
acs_pop_exporter_payload = {'dataset_name': _ACS_DATASET_NAME}
acs_pop_exporter_operator = util.create_exporter_operator(
    'acs_population_exporter', acs_pop_exporter_payload, data_ingestion_dag)

# Task ordering: GCS -> BQ -> aggregate -> export.
(acs_pop_gcs_operator >> acs_pop_bq_operator >>
 acs_pop_aggregator_operator >> acs_pop_exporter_operator)
Ejemplo n.º 8
0
import util

# Workflow id and BigQuery dataset for United Health Foundation data.
_UHC_WORKFLOW_ID = 'UHC_DATA'
_UHC_DATASET_NAME = 'uhc_data'

default_args = {
    'start_date': days_ago(0),
}

# schedule_interval=None: this DAG is run on demand only.
data_ingestion_dag = DAG(
    'uhc_ingestion_dag',
    default_args=default_args,
    schedule_interval=None,
    description='Ingestion configuration for UHC')

# Step 1: standardize the UHC data and write it to BigQuery.
uhc_bq_payload = util.generate_bq_payload(_UHC_WORKFLOW_ID, _UHC_DATASET_NAME)
uhc_pop_bq_operator = util.create_bq_ingest_operator(
    'uhc_to_bq', uhc_bq_payload, data_ingestion_dag)

# Step 2: run the aggregator over the BigQuery dataset.
uhc_aggregator_payload = {'dataset_name': _UHC_DATASET_NAME}
uhc_aggregator_operator = util.create_aggregator_operator(
    'uhc_aggregator', uhc_aggregator_payload, data_ingestion_dag)

# Step 3: export the resulting dataset.
uhc_exporter_payload = {'dataset_name': _UHC_DATASET_NAME}
uhc_exporter_operator = util.create_exporter_operator(
    'uhc_exporter', uhc_exporter_payload, data_ingestion_dag)

# Task ordering: BQ load -> aggregate -> export.
(uhc_pop_bq_operator >> uhc_aggregator_operator >> uhc_exporter_operator)