def test_execute_unicode_logs(self, client_class_mock):
        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = mock.Mock()
        client_mock.images.return_value = []
        client_mock.attach.return_value = ['unicode container log 😁']
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 0}

        client_class_mock.return_value = client_mock

        originalRaiseExceptions = logging.raiseExceptions  # pylint: disable=invalid-name
        logging.raiseExceptions = True

        operator = DockerOperator(image='ubuntu',
                                  owner='unittest',
                                  task_id='unittest')

        with mock.patch('traceback.print_exception') as print_exception_mock:
            operator.execute(None)
            logging.raiseExceptions = originalRaiseExceptions
            print_exception_mock.assert_not_called()
    def test_execute_with_docker_conn_id_use_hook(self, operator_client_mock,
                                                  operator_docker_hook):
        # Mock out a Docker client, so operations don't raise errors
        client_mock = mock.Mock(name='DockerOperator.APIClient mock', spec=APIClient)
        client_mock.images.return_value = []
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.attach.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 0}
        operator_client_mock.return_value = client_mock

        # Create the DockerOperator
        operator = DockerOperator(
            image='publicregistry/someimage',
            owner='unittest',
            task_id='unittest',
            docker_conn_id='some_conn_id'
        )

        # Mock out the DockerHook
        hook_mock = mock.Mock(name='DockerHook mock', spec=DockerHook)
        hook_mock.get_conn.return_value = client_mock
        operator_docker_hook.return_value = hook_mock

        operator.execute(None)

        self.assertEqual(
            operator_client_mock.call_count, 0,
            'Client was called on the operator instead of the hook'
        )
        self.assertEqual(
            operator_docker_hook.call_count, 1,
            'Hook was not called although docker_conn_id configured'
        )
        self.assertEqual(
            client_mock.pull.call_count, 1,
            'Image was not pulled using operator client'
        )
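The two mock arguments above are injected by stacked mock.patch decorators that are not visible in this snippet. A minimal sketch of what those decorators might look like, assuming the pre-2.0 airflow.operators.docker_operator module path (the exact patch targets are an assumption, not taken from the source):

    # Hypothetical decorators for the test above; mock.patch applies bottom-up,
    # so the APIClient patch (the innermost decorator) becomes the first mock argument.
    @mock.patch('airflow.operators.docker_operator.DockerHook')
    @mock.patch('airflow.operators.docker_operator.APIClient')
    def test_execute_with_docker_conn_id_use_hook(self, operator_client_mock,
                                                  operator_docker_hook):
        ...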
Example #3
    def test_execute(self, client_class_mock, mkdtemp_mock):
        host_config = mock.Mock()
        mkdtemp_mock.return_value = '/mkdtemp'

        client_mock = mock.Mock(spec=Client)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = host_config
        client_mock.images.return_value = []
        client_mock.logs.return_value = ['container log']
        client_mock.pull.return_value = [b'{"status":"pull log"}']
        client_mock.wait.return_value = 0

        client_class_mock.return_value = client_mock

        operator = DockerOperator(api_version='1.19', command='env', environment={'UNIT': 'TEST'},
                                  image='ubuntu:latest', network_mode='bridge', owner='unittest',
                                  task_id='unittest', volumes=['/host/path:/container/path'])
        operator.execute(None)

        client_class_mock.assert_called_with(base_url='unix://var/run/docker.sock', tls=None,
                                             version='1.19')

        client_mock.create_container.assert_called_with(command='env', cpu_shares=1024,
                                                        environment={
                                                            'AIRFLOW_TMP_DIR': '/tmp/airflow',
                                                            'UNIT': 'TEST'
                                                        },
                                                        host_config=host_config,
                                                        image='ubuntu:latest',
                                                        mem_limit=None, user=None)
        client_mock.create_host_config.assert_called_with(binds=['/host/path:/container/path',
                                                                 '/mkdtemp:/tmp/airflow'],
                                                          network_mode='bridge')
        client_mock.images.assert_called_with(name='ubuntu:latest')
        client_mock.logs.assert_called_with(container='some_id', stream=True)
        client_mock.pull.assert_called_with('ubuntu:latest', stream=True)
        client_mock.wait.assert_called_with('some_id')
    def test_execute_tls(self, client_class_mock, tls_class_mock):
        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = mock.Mock()
        client_mock.images.return_value = []
        client_mock.attach.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 0}

        client_class_mock.return_value = client_mock
        tls_mock = mock.Mock()
        tls_class_mock.return_value = tls_mock

        operator = DockerOperator(docker_url='tcp://127.0.0.1:2376', image='ubuntu',
                                  owner='unittest', task_id='unittest', tls_client_cert='cert.pem',
                                  tls_ca_cert='ca.pem', tls_client_key='key.pem')
        operator.execute(None)

        tls_class_mock.assert_called_with(assert_hostname=None, ca_cert='ca.pem',
                                          client_cert=('cert.pem', 'key.pem'),
                                          ssl_version=None, verify=True)

        client_class_mock.assert_called_with(base_url='https://127.0.0.1:2376',
                                             tls=tls_mock, version=None)
Example #5
    dag_id="atd_knack_signal_work_orders",
    default_args=default_args,
    schedule_interval="50 8 * * *",
    dagrun_timeout=timedelta(minutes=60),
    tags=["production", "knack"],
    catchup=False,
) as dag:
    # completely replace data on the 15th day of every month
    # this is a failsafe to catch records that may have been missed via incremental loading
    date_filter = "{{ '1970-01-01' if ds.endswith('15') else prev_execution_date_success or '1970-01-01' }}"  # noqa:E501
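    # A rough sketch of how the template above renders (assuming Airflow's standard
    # Jinja context; the dates are illustrative only):
    #   ds == "2021-06-15"  ->  date_filter == "1970-01-01"            (full monthly refresh)
    #   ds == "2021-06-16"  ->  date_filter == prev_execution_date_success,
    #                           or "1970-01-01" if there is no prior successful run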
    t1 = DockerOperator(
        task_id="atd_knack_traffic_signal_work_orders_to_postgrest",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command=f'./atd-knack-services/services/{script_task_1}.py -a {app_name} -c {container} -d "{date_filter}"',  # noqa:E501
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=env_vars,
        tty=True,
    )

    t2 = DockerOperator(
        task_id="atd_knack_traffic_signal_work_orders_to_socrata",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command=f'./atd-knack-services/services/{script_task_2}.py -a {app_name} -c {container} -d "{date_filter}"',  # noqa
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=env_vars,
with DAG(
        f"atd_mds_process_unfinished_staging",
        default_args=default_args,
        schedule_interval="0 2 * * *",
        catchup=False,
        tags=["staging", "mds"],
) as dag:
    # Task: process_unfinished_lime
    # Description: Processes unfinished schedule blocks assigned to Lime
    lime = DockerOperator(
        task_id="process_unfinished_lime",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command=
        f"./provider_runtool.py --provider 'lime' --time-min '{time_min}' --time-max '{time_max}' --incomplete-only --no-logs",
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=environment_vars,
    )

    # Task: process_unfinished_jump
    # Description: Processes unfinished schedule blocks assigned to Jump
    jump = DockerOperator(
        task_id="process_unfinished_jump",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command=
        f"./provider_runtool.py --provider 'jump' --time-min '{time_min}' --time-max '{time_max}' --incomplete-only --no-logs",
Example #7
    'dataflow_default_options': {
        'project': os.environ['GCP_PROJECT']
    }
}

dag = DAG('firearm_seizures',
          default_args=default_args,
          schedule_interval='@monthly')

gcs_load = DockerOperator(
    task_id='firearms_gcs_docker',
    image='gcr.io/data-rivers/pgh-firearms',
    api_version='auto',
    auto_remove=True,
    environment={
        'APRS_UN': os.environ['APRS_UN'],
        'APRS_PW': os.environ['APRS_PW'],
        'GCS_AUTH_FILE':
        '/root/firearm-seizures-report/data-rivers-service-acct.json',
        'GCS_PREFIX': os.environ['GCS_PREFIX']
    },
    dag=dag)

# dataflow_task = DataFlowPythonOperator(
#     task_id='firearms_dataflow',
#     job_name='firearms-dataflow',
#     py_file=os.getcwd() + '/airflow_scripts/dags/dependencies/dataflow_scripts/firearms_dataflow.py'),
#     dag=dag
# )

dataflow_task = BashOperator(
* Moves WARCs from warcprox into the right place in the /heritrix/output folders.
* **TBA** 'Closes' WARCs that are .open, if they are older than a few days.

Configuration:

* The tasks are configured to scan `/mnt/gluster/fc`.
* The push gateway is configured to be `{c.push_gateway}`.

How to check it's working:

* Task Instance logs show how many WARCs were moved.
* Prometheus updated via Push Gateway with `ukwa_files_moved_total_count{{kind='warcprox-warcs'}}` counts.
    * Look for job results in [the push gateway configured for this task](http://{c.push_gateway}).
    * For example results from Prometheus in production, see [here](http://monitor-prometheus.api.wa.bl.uk/graph?g0.expr=ukwa_files_moved_total_count{{kind='warcprox-warc'}}&g0.tab=0&g0.stacked=0&g0.range_input=4w).

"""

    tidy = DockerOperator(
        task_id='move-warcprox-warcs',
        image=c.ukwa_task_image,
        command='store -v warctidy',
        user=0,  # Run as root due to file permissions
        volumes=['/mnt/gluster/fc:/mnt/gluster/fc'],
        environment={
            'PUSH_GATEWAY': c.push_gateway,
        },
        tty=True,  # <-- So we see logging
        do_xcom_push=False,
    )
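For context, the ukwa_files_moved_total_count metric described in the docstring above could be reported to the push gateway with prometheus_client roughly as follows. This is a sketch only: the gateway address, job name, metric type and sample values are assumptions, not the actual ukwa task implementation.

    from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

    # Hypothetical values; the real task derives these from its own run.
    files_moved = 42
    push_gateway = 'push-gateway.example:9091'

    registry = CollectorRegistry()
    moved = Gauge('ukwa_files_moved_total_count',
                  'WARCs moved into the /heritrix/output folders',
                  ['kind'], registry=registry)
    moved.labels(kind='warcprox-warcs').set(files_moved)
    push_to_gateway(push_gateway, job='move-warcprox-warcs', registry=registry)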
Example #9
default_args = {
    'owner': 'jonas.bieri',
    'description': 'Run the smarte_strasse_parking docker container',
    'depend_on_past': False,
    'start_date': datetime(2022, 1, 14),
    'email':
    ["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=15)
}

with DAG('smarte_strasse_parking',
         default_args=default_args,
         schedule_interval="*/5 * * * *",
         catchup=False) as dag:
    dag.doc_md = __doc__
    upload = DockerOperator(
        task_id='upload',
        image='smarte_strasse_parking:latest',
        api_version='auto',
        auto_remove=True,
        command=
        '/bin/bash /code/data-processing/smarte_strasse_parking/etl.sh ',
        container_name='smarte_strasse_parking',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        tty=True,
        volumes=['/data/dev/workspace/data-processing:/code/data-processing'])
Example #10
         default_args=default_args,
         schedule_interval='*/15 * * * *',
         catchup=False) as dag:

    t1 = BashOperator(
        task_id='print_start_time',
        bash_command='echo `date "+%Y-%m-%d %H:%M:%S"` "- Airflow Task Started"'
    )

    t2 = DockerOperator(task_id='docker_command',
                        image='entechlog/weather-alert-app:latest',
                        api_version='auto',
                        auto_remove=True,
                        docker_url="unix://var/run/docker.sock",
                        network_mode="weatheralertapp_default",
                        environment={
                            'bootstrap_servers': "broker:9092",
                            'schema_registry_url':
                            "http://schema-registry:8081",
                            'topic_name': "weather.alert.app.source",
                            'lat': "8.270272",
                            'lon': "77.177274",
                            'OPEN_WEATHER_API_KEY': ""
                        })

    t3 = BashOperator(
        task_id='print_end_time',
        bash_command='echo `date "+%Y-%m-%d %H:%M:%S"` "- Airflow Task Finished"'
    )

    t1 >> t2 >> t3
}

with DAG('fix_s3_recording_url_pipeline',
         default_args=default_args,
         schedule_interval='*/10 * * * *',
         catchup=False) as dag:

    t1 = BashOperator(
        task_id='login_aws',
        bash_command=
        '$(aws ecr get-login --region eu-west-1 --no-include-email)')

    t2 = DockerOperator(
        task_id='fix_s3_recording_url_pipeline',
        auto_remove=True,
        image=IMAGE_NAME,
        api_version='auto',
        command=COMMAND,
        docker_url='unix://var/run/docker.sock',
        network_mode='host',
        environment={
            'DATABASE_HOST': DATABASE_HOST,
            'ELASTICSEARCH_URL': ELASTICSEARCH_URL,
            'DYNAMODB_HOST': DYNAMODB_HOST,
        },
        volumes=[LOG_DIRECTORY, BOTO_CREDENTIAL],
        force_pull=True,
    )

    t2.set_upstream(t1)
DAG_ID = os.path.basename(__file__).replace(".pyc", "").replace(".py", "")

default_args = {
    "owner": "airflow",
    "start_date": datetime(2020, 10, 29),
    "end_date": datetime(2020, 11, 15),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=15)
}

refs = dict()
refs['execution_date'] = '{{ ds }}'
refs['update_status'] = 'false'

dag = DAG(dag_id=DAG_ID,
          default_args=default_args,
          schedule_interval="@daily",
          max_active_runs=1,
          concurrency=1)

task = DockerOperator(dag=dag,
                      task_id='fill_noise_tracks_gaps',
                      auto_remove=True,
                      docker_url='unix://var/run/docker.sock',
                      api_version='auto',
                      image='fill_max_slow:v1.0',
                      environment=refs)
    'start_date': datetime(2020, 8, 24),
    'email': ["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=3)
}

with DAG('tba_abfuhrtermine', default_args=default_args, schedule_interval="0 10 * * *", catchup=False) as dag:
    dag.doc_md = __doc__
    process_upload = DockerOperator(
        task_id='process-upload',
        image='tba_abfuhrtermine:latest',
        api_version='auto',
        auto_remove=True,
        command='/bin/bash /code/data-processing/tba_abfuhrtermine/etl.sh ',
        container_name='tba_abfuhrtermine',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        tty=True,
        volumes=['/mnt/OGD-GVA:/code/data-processing/tba_abfuhrtermine/data_orig','/data/dev/workspace/data-processing:/code/data-processing']
    )

    ods_publish = DockerOperator(
        task_id='ods-publish',
        image='ods-publish:latest',
        api_version='auto',
        auto_remove=True,
        command='python3 -m ods_publish.etl_id 100096',
        container_name='tba-abfuhrtermine--ods-publish',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
Example #14
    description='Pipeline for scraping daily "sold" data from hemnet and \
        ingesting to deltalake on S3',
    schedule_interval='23 21 * * *'  # 21:23
)

cmd = """
    dailyspider \
    -a target='sold' \
    -s KAFKA_PRODUCER_TOPIC={{ var.value.KAFKA_TOPIC_SOLD }} \
    -s KAFKA_PRODUCER_BROKERS={{ var.value.KAFKA_BROKERS }} \
    -s REDIS_HOST={{ var.value.REDIS_HOST }}
"""

scrape_pages_to_kafka = DockerOperator(task_id='hemnet_daily_sold_spider',
                                       image=HEMNET_SPIDER_DOCKER_IMAGE,
                                       command=cmd,
                                       docker_url='unix://var/run/docker.sock',
                                       network_mode='host',
                                       dag=dag)

spark_submit_cmd_kafka_bronze = """
cd {{ var.value.ETL_HOME }}
{{ var.value.SPARK_HOME }}/spark-submit \
    --packages io.delta:delta-core_2.12:0.7.0,org.apache.hadoop:hadoop-aws:2.7.7,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0  \
    --conf spark.delta.logStore.class=org.apache.spark.sql.delta.storage.S3SingleDriverLogStore  \
    --conf spark.hadoop.fs.s3a.endpoint={{ var.value.S3_ENDPOINT }}  \
    --conf spark.driver.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4=true  \
    --conf spark.executor.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4=true  \
    --conf spark.hadoop.fs.s3a.access.key={{ var.value.AWS_S3_ACCESS }}  \
    --conf spark.hadoop.fs.s3a.secret.key={{ var.value.AWS_S3_SECRET }} \
    --py-files=dist/jobs.zip,dist/libs.zip dist/main.py  \
    --job dailyKafkaToBronze  \
Example #15
        bash_command=
        'git clone https://github.com/natbusa/dlf-tutorial /usr/local/airflow/repos/tutorial'
    )

    t_git_pull = BashOperator(
        task_id='git_pull',
        bash_command='cd /usr/local/airflow/repos/tutorial && git pull',
        trigger_rule='one_success')

    t_check_repo = BranchPythonOperator(
        task_id='does_repo_exist', python_callable=checkIfRepoIsAlreadyCloned)

    t_dummy = DummyOperator(task_id='dummy')

    t_check_repo >> t_git_clone >> t_git_pull
    t_check_repo >> t_dummy >> t_git_pull

    t_docker = DockerOperator(
        task_id='docker_command',
        image='databox/pyspark-notebook:2.4.4-hadoop-3.2.1',
        api_version='auto',
        auto_remove=True,
        environment={},
        volumes=['airflow_repos:/home/jovyan/work/repos'],
        command=
        'spark-submit --master spark://spark-master:7077 /home/jovyan/work/repos/tutorial/minimal.py',
        docker_url='unix://var/run/docker.sock',
        network_mode='databox')

    t_git_pull >> t_docker
Example #16
env_vars = Variable.get("atd_knack_services_postgrest", deserialize_json=True)
# unpack knack auth
atd_knack_auth = Variable.get("atd_knack_auth", deserialize_json=True)
env_vars["KNACK_APP_ID"] = atd_knack_auth[app_name][env]["app_id"]

with DAG(
        dag_id="atd_knack_metadata_data_tracker_to_postgrest",
        default_args=default_args,
        schedule_interval="55 5 * * *",
        dagrun_timeout=timedelta(minutes=60),
        tags=["production", "knack"],
        catchup=False,
) as dag:

    t1 = DockerOperator(
        task_id="atd_knack_metadata_data_tracker_to_postgrest",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command=f"./atd-knack-services/services/{script}.py",  # noqa
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=env_vars,
        tty=True,
    )

    t1

if __name__ == "__main__":
    dag.cli()
Example #17
for i, assignment in enumerate(results):
    prepare = DockerOperator(
        container_name=f"greenbrier-{assignment['assignment_id']}-prepare-task",
        task_id=f"greenbrier-{assignment['assignment_id']}-prepare-task",
        image="wildflowerschools/wf-deep-docker:video-prepare-tooling-v30",
        command=[
            "python",
            "-m",
            "inference_helpers",
            "prepare-assignment-videos",
            "--environment_name",
            "greenbrier",
            "--start",
            timestamp_pattern,
            "--duration",
            DURATION,
            "--assignment",
            assignment['assignment_id'],
            "--device",
            assignment['device_id'],
        ],
        execution_timeout=timedelta(hours=2),
        force_pull=False,
        environment=prepare_env,
        volumes=["/data:/data"],
        dag=dag,
        docker_url='unix://var/run/docker.sock',
        network_mode='host',
        api_version='auto',
        auto_remove=True,
    )
    previous = prepare
Example #18
default_args = {
    'owner': 'jonas.bieri',
    'description': 'Run the aue_schall docker container',
    'depend_on_past': False,
    'start_date': datetime(2020, 6, 24),
    'email':
    ["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=15)
}

with DAG('aue_schall',
         default_args=default_args,
         schedule_interval="*/15 * * * *",
         catchup=False) as dag:
    dag.doc_md = __doc__
    upload = DockerOperator(
        task_id='upload',
        image='aue_schall:latest',
        api_version='auto',
        auto_remove=True,
        command='python3 -m aue_schall.etl',
        container_name='aue_schall',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        tty=True,
        volumes=['/data/dev/workspace/data-processing:/code/data-processing'])
Example #19
fda_linker_task = SubDagOperator(
    dag=dag,
    subdag=fda_dap(parent_dag_name='fda',
                   child_dag_name='linker',
                   start_date=dag.start_date,
                   schedule_interval=dag.schedule_interval),
    task_id='linker',
)

remove_unknown_documentcloud_docs_task = DockerOperator(
    task_id='remove_unknown_documentcloud_docs',
    dag=dag,
    image='opentrials/processors:latest',
    force_pull=True,
    api_version='1.23',
    environment={
        'WAREHOUSE_URL': helpers.get_postgres_uri('warehouse_db'),
        'DATABASE_URL': helpers.get_postgres_uri('api_db'),
        'EXPLORERDB_URL': helpers.get_postgres_uri('explorer_db'),
        'LOGGING_URL': Variable.get('LOGGING_URL'),
        'DOCUMENTCLOUD_USERNAME': Variable.get('DOCUMENTCLOUD_USERNAME'),
        'DOCUMENTCLOUD_PASSWORD': Variable.get('DOCUMENTCLOUD_PASSWORD'),
        'DOCUMENTCLOUD_PROJECT': Variable.get('DOCUMENTCLOUD_PROJECT'),
        'FERNET_KEY': os.environ['FERNET_KEY'],
    },
    command='make start remove_unknown_documentcloud_docs')

remove_unknown_documentcloud_docs_task.set_upstream(fda_linker_task)
fda_linker_task.set_upstream(fda_dap_task)
Example #20
docker_image = "atddocker/atd-metrobike:production"

# assemble env vars
env_vars = Variable.get("atd_metrobike", deserialize_json=True)

with DAG(
    dag_id="atd_metrobike_trips",
    default_args=default_args,
    schedule_interval="33 1 * * 1",  # runs weekly at 1:33am Monday
    dagrun_timeout=timedelta(minutes=60),
    tags=["production", "metrobike"],
    catchup=False,
) as dag:
    t1 = DockerOperator(
        task_id="atd_metrobike_trips_socrata",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command="python publish_trips.py",
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=env_vars,
        tty=True,
    )

    t1

if __name__ == "__main__":
    dag.cli()
env_vars["SOCRATA_API_KEY_ID"] = Variable.get("atd_service_bot_socrata_api_key_id")
env_vars["SOCRATA_API_KEY_SECRET"] = Variable.get(
    "atd_service_bot_socrata_api_key_secret"
)
env_vars["SOCRATA_APP_TOKEN"] = Variable.get("atd_service_bot_socrata_app_token")

with DAG(
    dag_id="atd_kits_sig_stat_pub",
    default_args=default_args,
    schedule_interval="*/5 * * * *",
    dagrun_timeout=timedelta(minutes=60),
    tags=["production", "socrata", "kits"],
    catchup=False,
) as dag:
    t1 = DockerOperator(
        task_id="atd_kits_sig_status_to_socrata",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command="./atd-kits/atd-kits/signal_status_publisher.py",
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=env_vars,
        tty=True,
    )

    t1

if __name__ == "__main__":
    dag.cli()
default_args = {
    'owner': 'airflow',
    'description': 'Use of the DockerOperator',
    'depend_on_past': False,
    'start_date': datetime(2018, 1, 3),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('DockerTutorial',
         default_args=default_args,
         schedule_interval="5 * * * *",
         catchup=False) as dag:
    t1 = BashOperator(task_id='print_current_date', bash_command='date')

    t2 = DockerOperator(
        task_id='docker_command',
        image='centos:latest',
        api_version='auto',
        auto_remove=True,
        command="/bin/sleep 30",
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        host_tmp_dir="/tmp/" if sys.platform == "darwin" else None)

    t3 = BashOperator(task_id='print_hello', bash_command='echo "hello world"')

    t1 >> t2 >> t3
Example #23
    'on_failure_callback': send_alert_task_failure_to_slack
}

with DAG('sync_country_from_zendesk_pipeline',
         default_args=default_args,
         schedule_interval="0 0 * * *",
         catchup=False) as dag:

    t1 = BashOperator(
        task_id='login_aws',
        bash_command=
        '$(aws ecr get-login --region eu-west-1 --no-include-email)')

    t2 = DockerOperator(
        task_id='sync_country_from_zendesk_pipeline',
        auto_remove=True,
        image=IMAGE_NAME,
        api_version='auto',
        command=COMMAND,
        docker_url='unix://var/run/docker.sock',
        network_mode='host',
        environment={
            'DATABASE_HOST': DATABASE_HOST,
            'ELASTICSEARCH_URL': ELASTICSEARCH_URL,
            'DYNAMODB_HOST': DYNAMODB_HOST
        },
        volumes=[LOG_DIRECTORY],
        force_pull=True,
    )

    t2.set_upstream(t1)
Example #24
    'start_date': datetime(2020, 1, 3),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

currentDirectory = os.getcwd()
with DAG('spark_pipeline_file',
         default_args=default_args,
         schedule_interval=None,
         catchup=False) as dag:
    t1 = BashOperator(task_id='Start_of_Dag', bash_command='date')

    t2 = DockerOperator(
        task_id='wordcount_file',
        image='jupyter/all-spark-notebook',
        api_version='auto',
        auto_remove=True,
        network_mode="bridge",
        docker_url="unix://private/var/run/docker.sock",
        host_tmp_dir='/tmp',
        tmp_dir='/tmp',
        volumes=[f'{currentDirectory}/pyspark:/home/jovyan'],
        command='spark-submit --master local[*] script/hellospark.py')

    t3 = BashOperator(
        task_id='End_of_Dag',
        bash_command='echo "Bye Bye. check output folder for the results"')

    t1 >> t2 >> t3
Example #25
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'ByName',
    default_args=default_args,
    description='Filter by name dag',
    schedule_interval='@daily',
)

t1 = DockerOperator(task_id='DockerOperator',
                    image='faizan-k_devchallenge',
                    api_version='auto',
                    auto_remove=True,
                    command='byname -n "{}"'.format(beer_name),
                    docker_url="unix://var/run/docker.sock",
                    network_mode="bridge",
                    xcom_push=True,
                    dag=dag)


def perform_calculation(**context):
    output = json.loads(context['ti'].xcom_pull(task_ids='DockerOperator'))
    avg_ibu_ibv = json.dumps({
        "avg_ibu":
        sum([i['ibu'] for i in output]) / (len(output) or 1),
        "avg_abv":
        sum([i['abv'] for i in output]) / (len(output) or 1)
    })
    context['ti'].xcom_push(key="AVG_IBU_ABV", value=avg_ibu_ibv)
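A minimal sketch of how perform_calculation could be attached to the same DAG (the original snippet is truncated at this point); the task id, provide_context usage and import path are assumptions in the pre-2.0 Airflow style used above:

from airflow.operators.python_operator import PythonOperator

calculate = PythonOperator(task_id='perform_calculation',
                           python_callable=perform_calculation,
                           provide_context=True,  # exposes `ti` via **context on Airflow 1.x
                           dag=dag)

# With xcom_push=True the DockerOperator pushes its container output, which the
# callable above pulls via xcom_pull(task_ids='DockerOperator').
t1 >> calculate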
    'retry_delay': timedelta(minutes=15)
}

with DAG('bag_coronavirus',
         default_args=default_args,
         schedule_interval="15 * * * *",
         catchup=False) as dag:
    dag.doc_md = __doc__
    upload_bag_datasets = DockerOperator(
        task_id='upload_bag_datasets',
        image='bag_coronavirus:latest',
        api_version='auto',
        auto_remove=True,
        command=
        '/bin/bash /code/data-processing/bag_coronavirus/etl_bag_datasets.sh ',
        container_name='bag_coronavirus--upload_bag_datasets',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        tty=True,
        volumes=[
            '/data/dev/workspace/data-processing:/code/data-processing',
            '/mnt/OGD-DataExch/StatA/BAG_Coronavirus_Tests:/code/data-processing/bag_coronavirus/data'
        ])

    upload_vmdl = DockerOperator(
        task_id='upload_vmdl',
        image='bag_coronavirus:latest',
        api_version='auto',
        auto_remove=True,
        command='/bin/bash /code/data-processing/bag_coronavirus/etl_vmdl.sh ',
        container_name='bag_coronavirus--upload_vmdl',
        f"atd_mds_{mds_provider}_staging",
        default_args=default_args,
        schedule_interval="15 * * * *",
        catchup=False,
        tags=["staging", "mds"],
) as dag:
    #
    # Task: provider_extract
    # Description: Given a schedule block, the script extracts data from the MDS provider within the schedule's time window,
    # then uploads the data into S3 for further processing.
    #
    t1 = DockerOperator(
        task_id='provider_extract',
        image=docker_image,
        api_version='auto',
        auto_remove=True,
        command=
        f"./provider_extract.py --provider '{mds_provider}' --time-max '{time_max}' --interval 1",
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=environment_vars)

    #
    # Task: provider_sync_db
    # Description: Downloads the extracted MDS data from S3, and inserts each trip into a postgres database.
    #
    t2 = DockerOperator(
        task_id='provider_sync_db',
        image=docker_image,
        api_version='auto',
        auto_remove=True,
        command=
Example #28
from airflow.operators.docker_operator import DockerOperator

default_args = {
    'owner': 'airflowMAT',
    'description': 'Use of the DockerOperator',
    'depend_on_past': False,
    'start_date': datetime(2018, 1, 3),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('docker_dag',
         default_args=default_args,
         schedule_interval="5 * * * *",
         catchup=False) as dag:
    t1 = BashOperator(task_id='print_current_date', bash_command='date')

    t2 = DockerOperator(task_id='docker_command',
                        image='openjdk:latest',
                        api_version='auto',
                        auto_remove=True,
                        command="/bin/sleep 30",
                        docker_url="unix://var/run/docker.sock",
                        network_mode="bridge")

    t3 = BashOperator(task_id='print_hello', bash_command='echo "hello world"')

    t1 >> t2 >> t3
Example #29
        default_args=default_args,
        schedule_interval="0 8 * * *",
        catchup=False,
        tags=["production", "visionzero"],
) as dag:
        #
        # Task: docker_command_crashes
        # Description: Imports a raw CSV file with crash records into our database via GraphQL/Hasura.
        #
        crash = DockerOperator(
                task_id='docker_command_crashes',
                image='atddocker/atd-vz-etl:production',
                api_version='auto',
                auto_remove=True,
                command="/app/process_hasura_import.py crash",
                docker_url="tcp://localhost:2376",
                network_mode="bridge",
                environment=atd_visionzero_cris_envvars,
                volumes=[
                        atd_visionzero_cris_volumes["ATD_VOLUME_DATA"],
                        atd_visionzero_cris_volumes["ATD_VOLUME_TEMP"],
                ],
        )

        #
        # Task: docker_command_unit
        # Description: Imports a raw CSV file with unit records into our database via GraphQL/Hasura.
        #
        unit = DockerOperator(
                task_id='docker_command_unit',
                image='atddocker/atd-vz-etl:production',
                api_version='auto',
Example #30
    'depends_on_past': False,
    'start_date': datetime(2019, 5, 28),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('google_timeline_ingestion', description='daily ingestion for google timeline data',
          default_args=default_args, schedule_interval='0 3 * * *', catchup=False)

t_1 = DockerOperator(
    task_id='initialize_raw_staging',
    image='tracker-task1',
    api_version='auto',
    auto_remove=True,
    command="python /usr/timeline_tracker/tasks/task_initialize_raw_staging.py",
    docker_url="unix://var/run/docker.sock",
    network_mode="host",
    dag=dag
)

t_2 = DockerOperator(
    task_id='extract_raw_to_staging',
    image='tracker-task1',
    api_version='auto',
    auto_remove=True,
    command="python /usr/timeline_tracker/tasks/task_extract_raw_to_staging.py",
    docker_url="unix://var/run/docker.sock",
    network_mode="host",
    dag=dag
)