def test_execute_unicode_logs(self, client_class_mock):
    client_mock = mock.Mock(spec=APIClient)
    client_mock.create_container.return_value = {'Id': 'some_id'}
    client_mock.create_host_config.return_value = mock.Mock()
    client_mock.images.return_value = []
    client_mock.attach.return_value = ['unicode container log 😁']
    client_mock.pull.return_value = []
    client_mock.wait.return_value = {"StatusCode": 0}

    client_class_mock.return_value = client_mock

    originalRaiseExceptions = logging.raiseExceptions  # pylint: disable=invalid-name
    logging.raiseExceptions = True

    operator = DockerOperator(image='ubuntu', owner='unittest', task_id='unittest')

    with mock.patch('traceback.print_exception') as print_exception_mock:
        operator.execute(None)
        logging.raiseExceptions = originalRaiseExceptions
        print_exception_mock.assert_not_called()

def test_execute_with_docker_conn_id_use_hook(self, operator_client_mock, operator_docker_hook):
    # Mock out a Docker client, so operations don't raise errors
    client_mock = mock.Mock(name='DockerOperator.APIClient mock', spec=APIClient)
    client_mock.images.return_value = []
    client_mock.create_container.return_value = {'Id': 'some_id'}
    client_mock.attach.return_value = []
    client_mock.pull.return_value = []
    client_mock.wait.return_value = {"StatusCode": 0}
    operator_client_mock.return_value = client_mock

    # Create the DockerOperator
    operator = DockerOperator(
        image='publicregistry/someimage',
        owner='unittest',
        task_id='unittest',
        docker_conn_id='some_conn_id'
    )

    # Mock out the DockerHook
    hook_mock = mock.Mock(name='DockerHook mock', spec=DockerHook)
    hook_mock.get_conn.return_value = client_mock
    operator_docker_hook.return_value = hook_mock

    operator.execute(None)

    self.assertEqual(
        operator_client_mock.call_count, 0,
        'Client was called on the operator instead of the hook'
    )
    self.assertEqual(
        operator_docker_hook.call_count, 1,
        'Hook was not called although docker_conn_id configured'
    )
    self.assertEqual(
        client_mock.pull.call_count, 1,
        'Image was not pulled using operator client'
    )

def test_execute(self, client_class_mock, mkdtemp_mock):
    host_config = mock.Mock()
    mkdtemp_mock.return_value = '/mkdtemp'

    client_mock = mock.Mock(spec=Client)
    client_mock.create_container.return_value = {'Id': 'some_id'}
    client_mock.create_host_config.return_value = host_config
    client_mock.images.return_value = []
    client_mock.logs.return_value = ['container log']
    client_mock.pull.return_value = [b'{"status":"pull log"}']
    client_mock.wait.return_value = 0

    client_class_mock.return_value = client_mock

    operator = DockerOperator(api_version='1.19', command='env', environment={'UNIT': 'TEST'},
                              image='ubuntu:latest', network_mode='bridge', owner='unittest',
                              task_id='unittest', volumes=['/host/path:/container/path'])
    operator.execute(None)

    client_class_mock.assert_called_with(base_url='unix://var/run/docker.sock', tls=None,
                                         version='1.19')

    client_mock.create_container.assert_called_with(command='env',
                                                    cpu_shares=1024,
                                                    environment={
                                                        'AIRFLOW_TMP_DIR': '/tmp/airflow',
                                                        'UNIT': 'TEST'
                                                    },
                                                    host_config=host_config,
                                                    image='ubuntu:latest',
                                                    mem_limit=None,
                                                    user=None)

    client_mock.create_host_config.assert_called_with(binds=['/host/path:/container/path',
                                                             '/mkdtemp:/tmp/airflow'],
                                                      network_mode='bridge')
    client_mock.images.assert_called_with(name='ubuntu:latest')
    client_mock.logs.assert_called_with(container='some_id', stream=True)
    client_mock.pull.assert_called_with('ubuntu:latest', stream=True)
    client_mock.wait.assert_called_with('some_id')

def test_execute_tls(self, client_class_mock, tls_class_mock):
    client_mock = mock.Mock(spec=APIClient)
    client_mock.create_container.return_value = {'Id': 'some_id'}
    client_mock.create_host_config.return_value = mock.Mock()
    client_mock.images.return_value = []
    client_mock.attach.return_value = []
    client_mock.pull.return_value = []
    client_mock.wait.return_value = {"StatusCode": 0}

    client_class_mock.return_value = client_mock
    tls_mock = mock.Mock()
    tls_class_mock.return_value = tls_mock

    operator = DockerOperator(docker_url='tcp://127.0.0.1:2376', image='ubuntu',
                              owner='unittest', task_id='unittest',
                              tls_client_cert='cert.pem', tls_ca_cert='ca.pem',
                              tls_client_key='key.pem')
    operator.execute(None)

    tls_class_mock.assert_called_with(assert_hostname=None, ca_cert='ca.pem',
                                      client_cert=('cert.pem', 'key.pem'),
                                      ssl_version=None, verify=True)

    client_class_mock.assert_called_with(base_url='https://127.0.0.1:2376', tls=tls_mock,
                                         version=None)

dag_id="atd_knack_signal_work_orders", default_args=default_args, schedule_interval="50 8 * * *", dagrun_timeout=timedelta(minutes=60), tags=["production", "knack"], catchup=False, ) as dag: # completely replace data on 15th day of every month # this is a failsafe catch records that may have been missed via incremental loading date_filter = "{{ '1970-01-01' if ds.endswith('15') else prev_execution_date_success or '1970-01-01' }}" # noqa:E501 t1 = DockerOperator( task_id="atd_knack_traffic_signal_work_orders_to_postgrest", image=docker_image, api_version="auto", auto_remove=True, command=f'./atd-knack-services/services/{script_task_1}.py -a {app_name} -c {container} -d "{date_filter}"', # noqa:E501 docker_url="tcp://localhost:2376", network_mode="bridge", environment=env_vars, tty=True, ) t2 = DockerOperator( task_id="atd_knack_traffic_signal_work_orders_to_socrata", image=docker_image, api_version="auto", auto_remove=True, command=f'./atd-knack-services/services/{script_task_2}.py -a {app_name} -c {container} -d "{date_filter}"', # noqa docker_url="tcp://localhost:2376", network_mode="bridge", environment=env_vars,
with DAG( f"atd_mds_process_unfinished_staging", default_args=default_args, schedule_interval="0 2 * * *", catchup=False, tags=["staging", "mds"], ) as dag: # Task: process_unfinished_lime # Description: Processes unfinished schedule blocks assigned to Lime lime = DockerOperator( task_id="process_unfinished_lime", image=docker_image, api_version="auto", auto_remove=True, command= f"./provider_runtool.py --provider 'lime' --time-min '{time_min}' --time-max '{time_max}' --incomplete-only --no-logs", docker_url="tcp://localhost:2376", network_mode="bridge", environment=environment_vars, ) # Task: process_unfinished_jump # Description: Processes unfinished schedule blocks assigned to Jump jump = DockerOperator( task_id="process_unfinished_jump", image=docker_image, api_version="auto", auto_remove=True, command= f"./provider_runtool.py --provider 'jump' --time-min '{time_min}' --time-max '{time_max}' --incomplete-only --no-logs",
    'dataflow_default_options': {
        'project': os.environ['GCP_PROJECT']
    }
}

dag = DAG('firearm_seizures', default_args=default_args, schedule_interval='@monthly')

gcs_load = DockerOperator(
    task_id='firearms_gcs_docker',
    image='gcr.io/data-rivers/pgh-firearms',
    api_version='auto',
    auto_remove=True,
    environment={
        'APRS_UN': os.environ['APRS_UN'],
        'APRS_PW': os.environ['APRS_PW'],
        'GCS_AUTH_FILE': '/root/firearm-seizures-report/data-rivers-service-acct.json',
        'GCS_PREFIX': os.environ['GCS_PREFIX']
    },
    dag=dag)

# dataflow_task = DataFlowPythonOperator(
#     task_id='firearms_dataflow',
#     job_name='firearms-dataflow',
#     py_file=os.getcwd() + '/airflow_scripts/dags/dependencies/dataflow_scripts/firearms_dataflow.py'),
#     dag=dag
# )

dataflow_task = BashOperator(

* Moves WARCs from warcprox into the right place in the /heritrix/output folders.
* **TBA** 'Closes' WARCs that are .open, if they are older than a few days.

Configuration:

* The tasks are configured to scan `/mnt/gluster/fc`.
* The push gateway is configured to be `{c.push_gateway}`.

How to check it's working:

* Task Instance logs show how many WARCs were moved.
* Prometheus updated via Push Gateway with `ukwa_files_moved_total_count{{kind='warcprox-warcs'}}` counts.
* Look for job results in [the push gateway configured for this task](http://{c.push_gateway}).
* For example results from Prometheus in production, see [here](http://monitor-prometheus.api.wa.bl.uk/graph?g0.expr=ukwa_files_moved_total_count{{kind='warcprox-warc'}}&g0.tab=0&g0.stacked=0&g0.range_input=4w).
"""

    tidy = DockerOperator(
        task_id='move-warcprox-warcs',
        image=c.ukwa_task_image,
        command='store -v warctidy',
        user=0,  # Run as root due to file permissions
        volumes=['/mnt/gluster/fc:/mnt/gluster/fc'],
        environment={
            'PUSH_GATEWAY': c.push_gateway,
        },
        tty=True,  # <-- So we see logging
        do_xcom_push=False,
    )

default_args = {
    'owner': 'jonas.bieri',
    'description': 'Run the smarte_strasse_parking docker container',
    'depend_on_past': False,
    'start_date': datetime(2022, 1, 14),
    'email': ["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=15)
}

with DAG('smarte_strasse_parking', default_args=default_args,
         schedule_interval="*/5 * * * *", catchup=False) as dag:
    dag.doc_md = __doc__
    upload = DockerOperator(
        task_id='upload',
        image='smarte_strasse_parking:latest',
        api_version='auto',
        auto_remove=True,
        command='/bin/bash /code/data-processing/smarte_strasse_parking/etl.sh ',
        container_name='smarte_strasse_parking',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        tty=True,
        volumes=['/data/dev/workspace/data-processing:/code/data-processing'])

         default_args=default_args,
         schedule_interval='*/15 * * * *',
         catchup=False) as dag:

    t1 = BashOperator(
        task_id='print_start_time',
        bash_command='echo `date "+%Y-%m-%d %H:%M:%S"` "- Airflow Task Started"'
    )

    t2 = DockerOperator(
        task_id='docker_command',
        image='entechlog/weather-alert-app:latest',
        api_version='auto',
        auto_remove=True,
        docker_url="unix://var/run/docker.sock",
        network_mode="weatheralertapp_default",
        environment={
            'bootstrap_servers': "broker:9092",
            'schema_registry_url': "http://schema-registry:8081",
            'topic_name': "weather.alert.app.source",
            'lat': "8.270272",
            'lon': "77.177274",
            'OPEN_WEATHER_API_KEY': ""
        })

    t3 = BashOperator(
        task_id='print_end_time',
        bash_command='echo `date "+%Y-%m-%d %H:%M:%S"` "- Airflow Task Finished"'
    )

    t1 >> t2 >> t3

}

with DAG('fix_s3_recording_url_pipeline', default_args=default_args,
         schedule_interval='*/10 * * * *', catchup=False) as dag:

    t1 = BashOperator(
        task_id='login_aws',
        bash_command='$(aws ecr get-login --region eu-west-1 --no-include-email)')

    t2 = DockerOperator(
        task_id='fix_s3_recording_url_pipeline',
        auto_remove=True,
        image=IMAGE_NAME,
        api_version='auto',
        command=COMMAND,
        docker_url='unix://var/run/docker.sock',
        network_mode='host',
        environment={
            'DATABASE_HOST': DATABASE_HOST,
            'ELASTICSEARCH_URL': ELASTICSEARCH_URL,
            'DYNAMODB_HOST': DYNAMODB_HOST,
        },
        volumes=[LOG_DIRECTORY, BOTO_CREDENTIAL],
        force_pull=True,
    )

    t2.set_upstream(t1)

DAG_ID = os.path.basename(__file__).replace(".pyc", "").replace(".py", "")

default_args = {
    "owner": "airflow",
    "start_date": datetime(2020, 10, 29),
    "end_date": datetime(2020, 11, 15),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=15)
}

refs = dict()
refs['execution_date'] = '{{ ds }}'
refs['update_status'] = 'false'

dag = DAG(dag_id=DAG_ID,
          default_args=default_args,
          schedule_interval="@daily",
          max_active_runs=1,
          concurrency=1)

task = DockerOperator(dag=dag,
                      task_id='fill_noise_tracks_gaps',
                      auto_remove=True,
                      docker_url='unix://var/run/docker.sock',
                      api_version='auto',
                      image='fill_max_slow:v1.0',
                      environment=refs)

    'start_date': datetime(2020, 8, 24),
    'email': ["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=3)
}

with DAG('tba_abfuhrtermine', default_args=default_args,
         schedule_interval="0 10 * * *", catchup=False) as dag:
    dag.doc_md = __doc__
    process_upload = DockerOperator(
        task_id='process-upload',
        image='tba_abfuhrtermine:latest',
        api_version='auto',
        auto_remove=True,
        command='/bin/bash /code/data-processing/tba_abfuhrtermine/etl.sh ',
        container_name='tba_abfuhrtermine',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        tty=True,
        volumes=['/mnt/OGD-GVA:/code/data-processing/tba_abfuhrtermine/data_orig',
                 '/data/dev/workspace/data-processing:/code/data-processing']
    )

    ods_publish = DockerOperator(
        task_id='ods-publish',
        image='ods-publish:latest',
        api_version='auto',
        auto_remove=True,
        command='python3 -m ods_publish.etl_id 100096',
        container_name='tba-abfuhrtermine--ods-publish',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",

    description='Pipeline for scraping daily "sold" data from hemnet and \
ingesting to deltalake on S3',
    schedule_interval='23 21 * * *'  # 21:23
)

cmd = """
dailyspider \
-a target='sold' \
-s KAFKA_PRODUCER_TOPIC={{ var.value.KAFKA_TOPIC_SOLD }} \
-s KAFKA_PRODUCER_BROKERS={{ var.value.KAFKA_BROKERS }} \
-s REDIS_HOST={{ var.value.REDIS_HOST }}
"""

scrape_pages_to_kafka = DockerOperator(task_id='hemnet_daily_sold_spider',
                                       image=HEMNET_SPIDER_DOCKER_IMAGE,
                                       command=cmd,
                                       docker_url='unix://var/run/docker.sock',
                                       network_mode='host',
                                       dag=dag)

spark_submit_cmd_kafka_bronze = """
cd {{ var.value.ETL_HOME }}
{{ var.value.SPARK_HOME }}/spark-submit \
--packages io.delta:delta-core_2.12:0.7.0,org.apache.hadoop:hadoop-aws:2.7.7,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 \
--conf spark.delta.logStore.class=org.apache.spark.sql.delta.storage.S3SingleDriverLogStore \
--conf spark.hadoop.fs.s3a.endpoint={{ var.value.S3_ENDPOINT }} \
--conf spark.driver.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4=true \
--conf spark.executor.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4=true \
--conf spark.hadoop.fs.s3a.access.key={{ var.value.AWS_S3_ACCESS }} \
--conf spark.hadoop.fs.s3a.secret.key={{ var.value.AWS_S3_SECRET }} \
--py-files=dist/jobs.zip,dist/libs.zip dist/main.py \
--job dailyKafkaToBronze \

    bash_command='git clone https://github.com/natbusa/dlf-tutorial /usr/local/airflow/repos/tutorial'
)

t_git_pull = BashOperator(
    task_id='git_pull',
    bash_command='cd /usr/local/airflow/repos/tutorial && git pull',
    trigger_rule='one_success')

t_check_repo = BranchPythonOperator(
    task_id='does_repo_exist',
    python_callable=checkIfRepoIsAlreadyCloned)

t_dummy = DummyOperator(task_id='dummy')

t_check_repo >> t_git_clone >> t_git_pull
t_check_repo >> t_dummy >> t_git_pull

t_docker = DockerOperator(
    task_id='docker_command',
    image='databox/pyspark-notebook:2.4.4-hadoop-3.2.1',
    api_version='auto',
    auto_remove=True,
    environment={},
    volumes=['airflow_repos:/home/jovyan/work/repos'],
    command='spark-submit --master spark://spark-master:7077 /home/jovyan/work/repos/tutorial/minimal.py',
    docker_url='unix://var/run/docker.sock',
    network_mode='databox')

t_git_pull >> t_docker

env_vars = Variable.get("atd_knack_services_postgrest", deserialize_json=True) # unpack knack auth atd_knack_auth = Variable.get("atd_knack_auth", deserialize_json=True) env_vars["KNACK_APP_ID"] = atd_knack_auth[app_name][env]["app_id"] with DAG( dag_id="atd_knack_metadata_data_tracker_to_postgrest", default_args=default_args, schedule_interval="55 5 * * *", dagrun_timeout=timedelta(minutes=60), tags=["production", "knack"], catchup=False, ) as dag: t1 = DockerOperator( task_id="atd_knack_metadata_data_tracker_to_postgrest", image=docker_image, api_version="auto", auto_remove=True, command=f"./atd-knack-services/services/{script}.py", # noqa docker_url="tcp://localhost:2376", network_mode="bridge", environment=env_vars, tty=True, ) t1 if __name__ == "__main__": dag.cli()
for i, assignment in enumerate(results):
    prepare = DockerOperator(
        container_name=f"greenbrier-{assignment['assignment_id']}-prepare-task",
        task_id=f"greenbrier-{assignment['assignment_id']}-prepare-task",
        image="wildflowerschools/wf-deep-docker:video-prepare-tooling-v30",
        command=[
            "python", "-m", "inference_helpers", "prepare-assignment-videos",
            "--environment_name", "greenbrier",
            "--start", timestamp_pattern,
            "--duration", DURATION,
            "--assignment", assignment['assignment_id'],
            "--device", assignment['device_id'],
        ],
        execution_timeout=timedelta(hours=2),
        force_pull=False,
        environment=prepare_env,
        volumes=["/data:/data"],
        dag=dag,
        docker_url='unix://var/run/docker.sock',
        network_mode='host',
        api_version='auto',
        auto_remove=True,
    )
    previous = prepare

default_args = {
    'owner': 'jonas.bieri',
    'description': 'Run the aue_schall docker container',
    'depend_on_past': False,
    'start_date': datetime(2020, 6, 24),
    'email': ["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=15)
}

with DAG('aue_schall', default_args=default_args,
         schedule_interval="*/15 * * * *", catchup=False) as dag:
    dag.doc_md = __doc__
    upload = DockerOperator(
        task_id='upload',
        image='aue_schall:latest',
        api_version='auto',
        auto_remove=True,
        command='python3 -m aue_schall.etl',
        container_name='aue_schall',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        tty=True,
        volumes=['/data/dev/workspace/data-processing:/code/data-processing'])

fda_linker_task = SubDagOperator(
    dag=dag,
    subdag=fda_dap(parent_dag_name='fda',
                   child_dag_name='linker',
                   start_date=dag.start_date,
                   schedule_interval=dag.schedule_interval),
    task_id='linker',
)

remove_unknown_documentcloud_docs_task = DockerOperator(
    task_id='remove_unknown_documentcloud_docs',
    dag=dag,
    image='opentrials/processors:latest',
    force_pull=True,
    api_version='1.23',
    environment={
        'WAREHOUSE_URL': helpers.get_postgres_uri('warehouse_db'),
        'DATABASE_URL': helpers.get_postgres_uri('api_db'),
        'EXPLORERDB_URL': helpers.get_postgres_uri('explorer_db'),
        'LOGGING_URL': Variable.get('LOGGING_URL'),
        'DOCUMENTCLOUD_USERNAME': Variable.get('DOCUMENTCLOUD_USERNAME'),
        'DOCUMENTCLOUD_PASSWORD': Variable.get('DOCUMENTCLOUD_PASSWORD'),
        'DOCUMENTCLOUD_PROJECT': Variable.get('DOCUMENTCLOUD_PROJECT'),
        'FERNET_KEY': os.environ['FERNET_KEY'],
    },
    command='make start remove_unknown_documentcloud_docs')

remove_unknown_documentcloud_docs_task.set_upstream(fda_linker_task)
fda_linker_task.set_upstream(fda_dap_task)

docker_image = "atddocker/atd-metrobike:production" # assemble env vars env_vars = Variable.get("atd_metrobike", deserialize_json=True) with DAG( dag_id="atd_metrobike_trips", default_args=default_args, schedule_interval="33 1 * * 1", # runs weekly at 1:33am Monday dagrun_timeout=timedelta(minutes=60), tags=["production", "metrobike"], catchup=False, ) as dag: t1 = DockerOperator( task_id="atd_metrobike_trips_socrata", image=docker_image, api_version="auto", auto_remove=True, command="python publish_trips.py", docker_url="tcp://localhost:2376", network_mode="bridge", environment=env_vars, tty=True, ) t1 if __name__ == "__main__": dag.cli()
env_vars["SOCRATA_API_KEY_ID"] = Variable.get("atd_service_bot_socrata_api_key_id") env_vars["SOCRATA_API_KEY_SECRET"] = Variable.get( "atd_service_bot_socrata_api_key_secret" ) env_vars["SOCRATA_APP_TOKEN"] = Variable.get("atd_service_bot_socrata_app_token") with DAG( dag_id="atd_kits_sig_stat_pub", default_args=default_args, schedule_interval="*/5 * * * *", dagrun_timeout=timedelta(minutes=60), tags=["production", "socrata", "kits"], catchup=False, ) as dag: t1 = DockerOperator( task_id="atd_kits_sig_status_to_socrata", image=docker_image, api_version="auto", auto_remove=True, command="./atd-kits/atd-kits/signal_status_publisher.py", docker_url="tcp://localhost:2376", network_mode="bridge", environment=env_vars, tty=True, ) t1 if __name__ == "__main__": dag.cli()
default_args = {
    'owner': 'airflow',
    'description': 'Use of the DockerOperator',
    'depend_on_past': False,
    'start_date': datetime(2018, 1, 3),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('DockerTutorial', default_args=default_args,
         schedule_interval="5 * * * *", catchup=False) as dag:
    t1 = BashOperator(task_id='print_current_date', bash_command='date')

    t2 = DockerOperator(
        task_id='docker_command',
        image='centos:latest',
        api_version='auto',
        auto_remove=True,
        command="/bin/sleep 30",
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        host_tmp_dir="/tmp/" if sys.platform == "darwin" else None)

    t3 = BashOperator(task_id='print_hello', bash_command='echo "hello world"')

    t1 >> t2 >> t3

    'on_failure_callback': send_alert_task_failure_to_slack
}

with DAG('sync_country_from_zendesk_pipeline', default_args=default_args,
         schedule_interval="0 0 * * *", catchup=False) as dag:

    t1 = BashOperator(
        task_id='login_aws',
        bash_command='$(aws ecr get-login --region eu-west-1 --no-include-email)')

    t2 = DockerOperator(
        task_id='sync_country_from_zendesk_pipeline',
        auto_remove=True,
        image=IMAGE_NAME,
        api_version='auto',
        command=COMMAND,
        docker_url='unix://var/run/docker.sock',
        network_mode='host',
        environment={
            'DATABASE_HOST': DATABASE_HOST,
            'ELASTICSEARCH_URL': ELASTICSEARCH_URL,
            'DYNAMODB_HOST': DYNAMODB_HOST
        },
        volumes=[LOG_DIRECTORY],
        force_pull=True,
    )

    t2.set_upstream(t1)

    'start_date': datetime(2020, 1, 3),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

currentDirectory = os.getcwd()

with DAG('spark_pipeline_file', default_args=default_args,
         schedule_interval=None, catchup=False) as dag:
    t1 = BashOperator(task_id='Start_of_Dag', bash_command='date')

    t2 = DockerOperator(
        task_id='wordcount_file',
        image='jupyter/all-spark-notebook',
        api_version='auto',
        auto_remove=True,
        network_mode="bridge",
        docker_url="unix://private/var/run/docker.sock",
        host_tmp_dir='/tmp',
        tmp_dir='/tmp',
        volumes=[f'{currentDirectory}/pyspark:/home/jovyan'],
        command='spark-submit --master local[*] script/hellospark.py')

    t3 = BashOperator(
        task_id='End_of_Dag',
        bash_command='echo "Bye Bye. check output folder for the results"')

    t1 >> t2 >> t3

    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'ByName',
    default_args=default_args,
    description='Filter by name dag',
    schedule_interval='@daily',
)

t1 = DockerOperator(task_id='DockerOperator',
                    image='faizan-k_devchallenge',
                    api_version='auto',
                    auto_remove=True,
                    command='byname -n "{}"'.format(beer_name),
                    docker_url="unix://var/run/docker.sock",
                    network_mode="bridge",
                    xcom_push=True,
                    dag=dag)


def perform_calculation(**context):
    output = json.loads(context['ti'].xcom_pull(task_ids='DockerOperator'))
    avg_ibu_ibv = json.dumps({
        "avg_ibu": sum([i['ibu'] for i in output]) / (len(output) or 1),
        "avg_abv": sum([i['abv'] for i in output]) / (len(output) or 1)
    })
    context['ti'].xcom_push(key="AVG_IBU_ABV", value=avg_ibu_ibv)

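# A minimal sketch (not part of the original snippet) of how perform_calculation could be
# wired into this DAG. The import path, task_id, and provide_context flag are assumptions
# based on the Airflow 1.x style implied by xcom_push=True above.
from airflow.operators.python_operator import PythonOperator

calculate_averages = PythonOperator(
    task_id='perform_calculation',      # hypothetical task_id
    python_callable=perform_calculation,
    provide_context=True,               # Airflow 1.x: pass **context to the callable
    dag=dag)

# With xcom_push=True, the DockerOperator pushes its last log line to XCom, so the
# calculation task must run downstream of it for xcom_pull to find the output.
t1 >> calculate_averages
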
    'retry_delay': timedelta(minutes=15)
}

with DAG('bag_coronavirus', default_args=default_args,
         schedule_interval="15 * * * *", catchup=False) as dag:
    dag.doc_md = __doc__
    upload_bag_datasets = DockerOperator(
        task_id='upload_bag_datasets',
        image='bag_coronavirus:latest',
        api_version='auto',
        auto_remove=True,
        command='/bin/bash /code/data-processing/bag_coronavirus/etl_bag_datasets.sh ',
        container_name='bag_coronavirus--upload_bag_datasets',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        tty=True,
        volumes=[
            '/data/dev/workspace/data-processing:/code/data-processing',
            '/mnt/OGD-DataExch/StatA/BAG_Coronavirus_Tests:/code/data-processing/bag_coronavirus/data'
        ])

    upload_vmdl = DockerOperator(
        task_id='upload_vmdl',
        image='bag_coronavirus:latest',
        api_version='auto',
        auto_remove=True,
        command='/bin/bash /code/data-processing/bag_coronavirus/etl_vmdl.sh ',
        container_name='bag_coronavirus--upload_vmdl',

f"atd_mds_{mds_provider}_staging", default_args=default_args, schedule_interval="15 * * * *", catchup=False, tags=["staging", "mds"], ) as dag: # # Task: provider_extract # Description: Given a schedule block, the script extracts data from the MDS provider within the schedule's time window # then it uploads the data into S3 for further processing. # t1 = DockerOperator( task_id='provider_extract', image=docker_image, api_version='auto', auto_remove=True, command= f"./provider_extract.py --provider '{mds_provider}' --time-max '{time_max}' --interval 1", docker_url="tcp://localhost:2376", network_mode="bridge", environment=environment_vars) # # Task: provider_sync_db # Description: Downloads the extracted MDS data from S3, and inserts each trip into a postgres database. # t2 = DockerOperator( task_id='provider_sync_db', image=docker_image, api_version='auto', auto_remove=True, command=
from airflow.operators.docker_operator import DockerOperator

default_args = {
    'owner': 'airflowMAT',
    'description': 'Use of the DockerOperator',
    'depend_on_past': False,
    'start_date': datetime(2018, 1, 3),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('docker_dag', default_args=default_args,
         schedule_interval="5 * * * *", catchup=False) as dag:
    t1 = BashOperator(task_id='print_current_date', bash_command='date')

    t2 = DockerOperator(task_id='docker_command',
                        image='openjdk:latest',
                        api_version='auto',
                        auto_remove=True,
                        command="/bin/sleep 30",
                        docker_url="unix://var/run/docker.sock",
                        network_mode="bridge")

    t3 = BashOperator(task_id='print_hello', bash_command='echo "hello world"')

    t1 >> t2 >> t3

    default_args=default_args,
    schedule_interval="0 8 * * *",
    catchup=False,
    tags=["production", "visionzero"],
) as dag:
    #
    # Task: docker_command_crashes
    # Description: Imports a raw CSV file with crash records into our database via GraphQL/Hasura.
    #
    crash = DockerOperator(
        task_id='docker_command_crashes',
        image='atddocker/atd-vz-etl:production',
        api_version='auto',
        auto_remove=True,
        command="/app/process_hasura_import.py crash",
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=atd_visionzero_cris_envvars,
        volumes=[
            atd_visionzero_cris_volumes["ATD_VOLUME_DATA"],
            atd_visionzero_cris_volumes["ATD_VOLUME_TEMP"],
        ],
    )

    #
    # Task: docker_command_unit
    # Description: Imports a raw CSV file with unit records into our database via GraphQL/Hasura.
    #
    unit = DockerOperator(
        task_id='docker_command_unit',
        image='atddocker/atd-vz-etl:production',
        api_version='auto',

    'depends_on_past': False,
    'start_date': datetime(2019, 5, 28),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('google_timeline_ingestion',
          description='daily ingestion for google timeline data',
          default_args=default_args,
          schedule_interval='0 3 * * *',
          catchup=False)

t_1 = DockerOperator(
    task_id='initialize_raw_staging',
    image='tracker-task1',
    api_version='auto',
    auto_remove=True,
    command="python /usr/timeline_tracker/tasks/task_initialize_raw_staging.py",
    docker_url="unix://var/run/docker.sock",
    network_mode="host",
    dag=dag
)

t_2 = DockerOperator(
    task_id='extract_raw_to_staging',
    image='tracker-task1',
    api_version='auto',
    auto_remove=True,
    command="python /usr/timeline_tracker/tasks/task_extract_raw_to_staging.py",
    docker_url="unix://var/run/docker.sock",
    network_mode="host",
    dag=dag
)