Example #1
def make_prediction(model_id, pool_id, default_args,
                    localhost_dir, build_dir, start=None):

    with DAG("update-data-task",
             default_args=default_args,
             schedule_interval="@daily", catchup=True) as dag:

        data_op = DockerOperator(
            task_id="update-data-pool-id",
            image="eugenepy/akira-data:latest",
            api_version="auto",
            command="variable-task update-pool -i {{ params.pool_id }} -s " +
            "{{ (execution_date - macros.timedeltas(days=1)).strftime('%Y%m%d') }" +
            " -e {{ execution_date.strftime('%Y%d%m') }} -l investingdotcom save " +
            "--filename /build/akira_data.test.csv",
            params={"pool_id": pool_id},
            volumes=[f"{localhost_dir}:{build_dir}"],
            network_mode="akira-project_default",
            docker_url="tcp://socat:2375")

        predict_op = DockerOperator(
            task_id="train-model",
            image="eugenepy/basket:latest",
            api_version="auto",
            command="python3 -m baksets predict -m /build/bmk.pkl " +
            "-i /build/akira_data.test.csv -o /build/akira_data.predict.csv",
            volumes=[f"{localhost_dir}:{build_dir}"],
            network_mode="akira-project_default",
            docker_url="tcp://socat:2375")
        data_op >> predict_op
        return dag
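
A minimal usage sketch (all argument values below are hypothetical): Airflow only registers DAGs that are bound to a module-level name, so the factory's return value must be assigned.

from datetime import datetime, timedelta

default_args = {"owner": "airflow", "start_date": datetime(2021, 1, 1),
                "retries": 1, "retry_delay": timedelta(minutes=5)}

dag = make_prediction(model_id="bmk", pool_id="pool-1",
                      default_args=default_args,
                      localhost_dir="/opt/airflow/build", build_dir="/build")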
Example #2
    def test_execute_tls(self, client_class_mock, tls_class_mock):
        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = mock.Mock()
        client_mock.images.return_value = []
        client_mock.logs.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = 0

        client_class_mock.return_value = client_mock
        tls_mock = mock.Mock()
        tls_class_mock.return_value = tls_mock

        operator = DockerOperator(docker_url='tcp://127.0.0.1:2376',
                                  image='ubuntu',
                                  owner='unittest',
                                  task_id='unittest',
                                  tls_client_cert='cert.pem',
                                  tls_ca_cert='ca.pem',
                                  tls_client_key='key.pem')
        operator.execute(None)

        tls_class_mock.assert_called_with(assert_hostname=None,
                                          ca_cert='ca.pem',
                                          client_cert=('cert.pem', 'key.pem'),
                                          ssl_version=None,
                                          verify=True)

        client_class_mock.assert_called_with(base_url='https://127.0.0.1:2376',
                                             tls=tls_mock,
                                             version=None)
    def test_execute_no_docker_conn_id_no_hook(self, operator_client_mock):
        # Mock out a Docker client, so operations don't raise errors
        client_mock = mock.Mock(name='DockerOperator.APIClient mock', spec=APIClient)
        client_mock.images.return_value = []
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.logs.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 0}
        operator_client_mock.return_value = client_mock

        # Create the DockerOperator
        operator = DockerOperator(
            image='publicregistry/someimage',
            owner='unittest',
            task_id='unittest'
        )

        # Mock out the DockerHook
        hook_mock = mock.Mock(name='DockerHook mock', spec=DockerHook)
        hook_mock.get_conn.return_value = client_mock
        operator.get_hook = mock.Mock(
            name='DockerOperator.get_hook mock',
            spec=DockerOperator.get_hook,
            return_value=hook_mock
        )

        operator.execute(None)
        self.assertEqual(
            operator.get_hook.call_count, 0,
            'Hook called though no docker_conn_id configured'
        )
Example #4
def train_model(model_image, model_id, pool_id, start, default_args,
                localhost_dir, build_dir):

    with DAG("update-data-task",
             default_args=default_args,
             schedule_interval="@daily", catchup=True) as dag:

        data_op = DockerOperator(
            task_id="update-data-pool-id",
            image="eugenepy/akira-data:latest",
            api_version="auto",
            command="variable-task update-pool -i {{ params.pool_id }} -s " +
            "{{ params.start }} -e {{ ds_nodash }} -l investingdotcom save " +
            "--filename /build/akira_data.csv",
            params={"start": start, "pool_id": pool_id},
            volumes=[f"{localhost_dir}:{build_dir}"],
            network_mode="akira-project_default",
            docker_url="tcp://socat:2375")

        train_op = DockerOperator(
            task_id="train-model",
            image=model_image,
            api_version="auto",
            command="make -f Makefile.model train_bmk",
            volumes=[f"{localhost_dir}:{build_dir}"],
            network_mode="akira-project_default",
            docker_url="tcp://socat:2375")
        data_op >> train_op
        return dag
Example #5
    def test_execute_with_docker_conn_id_use_hook(self, operator_client_mock,
                                                  operator_docker_hook):
        # Mock out a Docker client, so operations don't raise errors
        client_mock = mock.Mock(name='DockerOperator.APIClient mock',
                                spec=APIClient)
        client_mock.images.return_value = []
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.logs.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = 0
        operator_client_mock.return_value = client_mock

        # Create the DockerOperator
        operator = DockerOperator(image='publicregistry/someimage',
                                  owner='unittest',
                                  task_id='unittest',
                                  docker_conn_id='some_conn_id')

        # Mock out the DockerHook
        hook_mock = mock.Mock(name='DockerHook mock', spec=DockerHook)
        hook_mock.get_conn.return_value = client_mock
        operator_docker_hook.return_value = hook_mock

        operator.execute(None)

        self.assertEqual(
            operator_client_mock.call_count, 0,
            'APIClient was called on the operator instead of the hook')
        self.assertEqual(
            operator_docker_hook.call_count, 1,
            'Hook was not called although docker_conn_id configured')
        self.assertEqual(client_mock.pull.call_count, 1,
                         'Image was not pulled using operator client')
    def test_execute_no_docker_conn_id_no_hook(self, operator_client_mock):
        # Mock out a Docker client, so operations don't raise errors
        client_mock = mock.Mock(name='DockerOperator.APIClient mock', spec=APIClient)
        client_mock.images.return_value = []
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.attach.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 0}
        operator_client_mock.return_value = client_mock

        # Create the DockerOperator
        operator = DockerOperator(
            image='publicregistry/someimage',
            owner='unittest',
            task_id='unittest'
        )

        # Mock out the DockerHook
        hook_mock = mock.Mock(name='DockerHook mock', spec=DockerHook)
        hook_mock.get_conn.return_value = client_mock
        operator.get_hook = mock.Mock(
            name='DockerOperator.get_hook mock',
            spec=DockerOperator.get_hook,
            return_value=hook_mock
        )

        operator.execute(None)
        self.assertEqual(
            operator.get_hook.call_count, 0,
            'Hook called though no docker_conn_id configured'
        )
    def test_execute(self, client_class_mock, mkdtemp_mock):
        host_config = mock.Mock()
        mkdtemp_mock.return_value = '/mkdtemp'

        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = host_config
        client_mock.images.return_value = []
        client_mock.logs.return_value = ['container log']
        client_mock.pull.return_value = [b'{"status":"pull log"}']
        client_mock.wait.return_value = {"StatusCode": 0}

        client_class_mock.return_value = client_mock

        operator = DockerOperator(api_version='1.19',
                                  command='env',
                                  environment={'UNIT': 'TEST'},
                                  image='ubuntu:latest',
                                  network_mode='bridge',
                                  owner='unittest',
                                  task_id='unittest',
                                  volumes=['/host/path:/container/path'],
                                  working_dir='/container/path',
                                  shm_size=1000,
                                  host_tmp_dir='/host/airflow')
        operator.execute(None)

        client_class_mock.assert_called_with(
            base_url='unix://var/run/docker.sock', tls=None, version='1.19')

        client_mock.create_container.assert_called_with(
            command='env',
            environment={
                'AIRFLOW_TMP_DIR': '/tmp/airflow',
                'UNIT': 'TEST'
            },
            host_config=host_config,
            image='ubuntu:latest',
            user=None,
            working_dir='/container/path')
        client_mock.create_host_config.assert_called_with(
            binds=['/host/path:/container/path', '/mkdtemp:/tmp/airflow'],
            network_mode='bridge',
            shm_size=1000,
            cpu_shares=1024,
            mem_limit=None,
            auto_remove=False,
            dns=None,
            dns_search=None)
        mkdtemp_mock.assert_called_with(dir='/host/airflow',
                                        prefix='airflowtmp',
                                        suffix='')
        client_mock.images.assert_called_with(name='ubuntu:latest')
        client_mock.logs.assert_called_with(container='some_id', stream=True)
        client_mock.pull.assert_called_with('ubuntu:latest', stream=True)
        client_mock.wait.assert_called_with('some_id')
    def test_on_kill(self):
        client_mock = mock.Mock(spec=APIClient)

        operator = DockerOperator(image='ubuntu', owner='unittest', task_id='unittest')
        operator.cli = client_mock
        operator.container = {'Id': 'some_id'}

        operator.on_kill()

        client_mock.stop.assert_called_with('some_id')
    def test_execute_container_fails(self, client_class_mock):
        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = mock.Mock()
        client_mock.images.return_value = []
        client_mock.logs.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 1}

        client_class_mock.return_value = client_mock

        operator = DockerOperator(image='ubuntu', owner='unittest', task_id='unittest')

        with self.assertRaises(AirflowException):
            operator.execute(None)
    def test_execute_container_fails(self, client_class_mock):
        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = mock.Mock()
        client_mock.images.return_value = []
        client_mock.attach.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 1}

        client_class_mock.return_value = client_mock

        operator = DockerOperator(image='ubuntu', owner='unittest', task_id='unittest')

        with self.assertRaises(AirflowException):
            operator.execute(None)
Example #11
def make_etl_operator(task_id: str, operation: str):
    cmd = f"'etl --redis-url redis:6379 {operation}'"
    return DockerOperator(command=cmd,
                          environment={"PYTHONUNBUFFERED": 1},
                          task_id=task_id,
                          image="etl-dummy:latest",
                          auto_remove=True,
                          network_mode="airflow-tutorial_default")
Example #12
def get_task(activity, city):
    return DockerOperator(task_id=f'mine_{activity}_{city}',
                          image='tahasadiki/telecontact-scraper:latest',
                          api_version='auto',
                          auto_remove=True,
                          command=f"{activity} {city}",
                          docker_url="unix://var/run/docker.sock",
                          network_mode="bridge")
    def test_execute(self, client_class_mock, mkdtemp_mock):
        host_config = mock.Mock()
        mkdtemp_mock.return_value = '/mkdtemp'

        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = host_config
        client_mock.images.return_value = []
        client_mock.logs.return_value = ['container log']
        client_mock.pull.return_value = [b'{"status":"pull log"}']
        client_mock.wait.return_value = {"StatusCode": 0}

        client_class_mock.return_value = client_mock

        operator = DockerOperator(api_version='1.19', command='env', environment={'UNIT': 'TEST'},
                                  image='ubuntu:latest', network_mode='bridge', owner='unittest',
                                  task_id='unittest', volumes=['/host/path:/container/path'],
                                  working_dir='/container/path', shm_size=1000)
        operator.execute(None)

        client_class_mock.assert_called_with(base_url='unix://var/run/docker.sock', tls=None,
                                             version='1.19')

        client_mock.create_container.assert_called_with(command='env',
                                                        environment={
                                                            'AIRFLOW_TMP_DIR': '/tmp/airflow',
                                                            'UNIT': 'TEST'
                                                        },
                                                        host_config=host_config,
                                                        image='ubuntu:latest',
                                                        user=None,
                                                        working_dir='/container/path'
                                                        )
        client_mock.create_host_config.assert_called_with(binds=['/host/path:/container/path',
                                                                 '/mkdtemp:/tmp/airflow'],
                                                          network_mode='bridge',
                                                          shm_size=1000,
                                                          cpu_shares=1024,
                                                          mem_limit=None,
                                                          auto_remove=False,
                                                          dns=None,
                                                          dns_search=None)
        client_mock.images.assert_called_with(name='ubuntu:latest')
        client_mock.logs.assert_called_with(container='some_id', stream=True)
        client_mock.pull.assert_called_with('ubuntu:latest', stream=True)
        client_mock.wait.assert_called_with('some_id')
def mapping(dict, dag1):

    # Build a DockerOperator from a task-spec dict when no TTL is set.
    if not dict['ttl']:
        t1 = DockerOperator(
            task_id=dict['task_id'],
            image=dict['image'],
            command=eval(dict['command']),
            xcom_push=bool(dict['xcom_push']),
            dag=dag1)
        return [t1]
    def test_execute_unicode_logs(self, client_class_mock):
        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = mock.Mock()
        client_mock.images.return_value = []
        client_mock.attach.return_value = ['unicode container log 😁']
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 0}

        client_class_mock.return_value = client_mock

        originalRaiseExceptions = logging.raiseExceptions  # pylint: disable=invalid-name
        logging.raiseExceptions = True

        operator = DockerOperator(image='ubuntu', owner='unittest', task_id='unittest')

        with mock.patch('traceback.print_exception') as print_exception_mock:
            operator.execute(None)
            logging.raiseExceptions = originalRaiseExceptions
            print_exception_mock.assert_not_called()
    def test_execute_unicode_logs(self, client_class_mock):
        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = mock.Mock()
        client_mock.images.return_value = []
        client_mock.logs.return_value = ['unicode container log 😁']
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 0}

        client_class_mock.return_value = client_mock

        originalRaiseExceptions = logging.raiseExceptions
        logging.raiseExceptions = True

        operator = DockerOperator(image='ubuntu', owner='unittest', task_id='unittest')

        with mock.patch('traceback.print_exception') as print_exception_mock:
            operator.execute(None)
            logging.raiseExceptions = originalRaiseExceptions
            print_exception_mock.assert_not_called()
Example #17
    def test_execute_with_docker_conn_id_use_hook(self, operator_client_mock):
        # Mock out a Docker client, so operations don't raise errors
        client_mock = mock.Mock(name='DockerOperator.Client mock', spec=Client)
        client_mock.images.return_value = []
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.logs.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = 0
        operator_client_mock.return_value = client_mock

        # Create the DockerOperator
        operator = DockerOperator(
            image='publicregistry/someimage',
            owner='unittest',
            task_id='unittest',
            docker_conn_id='some_conn_id'
        )

        # Mock out the DockerHook
        hook_mock = mock.Mock(name='DockerHook mock', spec=DockerHook)
        hook_mock.get_conn.return_value = client_mock
        operator.get_hook = mock.Mock(
            name='DockerOperator.get_hook mock',
            spec=DockerOperator.get_hook,
            return_value=hook_mock
        )

        operator.execute(None)
        self.assertEqual(
            operator_client_mock.call_count, 0,
            'Client was called on the operator instead of the hook'
        )
        self.assertEqual(
            operator.get_hook.call_count, 1,
            'Hook was not called although docker_conn_id configured'
        )
        self.assertEqual(
            client_mock.pull.call_count, 1,
            'Image was not pulled using operator client'
        )
Example #18
def create_docker_operator(params):
    """Create DockerOperator with default kwargs."""
    # Create defaults.
    defaults = {
        'remove': True,
        'xcom_push': True,
        'volumes': ['/var/log/filebeat:/usr/local/src/log']
    }

    # Merge params.
    docker_params = defaults.copy()
    docker_params.update(params)

    # Return a new DockerOperator.
    return DockerOperator(**docker_params)
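
Since update() runs after copy(), caller-supplied params override the defaults; a hypothetical call (task and image names are made up):

task = create_docker_operator({
    'task_id': 'scrape_logs',        # required DockerOperator kwargs
    'image': 'my-scraper:latest',    # placeholder image name
    'xcom_push': False,              # overrides the default of True
})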
    def test_execute_tls(self, client_class_mock, tls_class_mock):
        client_mock = mock.Mock(spec=APIClient)
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.create_host_config.return_value = mock.Mock()
        client_mock.images.return_value = []
        client_mock.logs.return_value = []
        client_mock.pull.return_value = []
        client_mock.wait.return_value = {"StatusCode": 0}

        client_class_mock.return_value = client_mock
        tls_mock = mock.Mock()
        tls_class_mock.return_value = tls_mock

        operator = DockerOperator(docker_url='tcp://127.0.0.1:2376', image='ubuntu',
                                  owner='unittest', task_id='unittest', tls_client_cert='cert.pem',
                                  tls_ca_cert='ca.pem', tls_client_key='key.pem')
        operator.execute(None)

        tls_class_mock.assert_called_with(assert_hostname=None, ca_cert='ca.pem',
                                          client_cert=('cert.pem', 'key.pem'),
                                          ssl_version=None, verify=True)

        client_class_mock.assert_called_with(base_url='https://127.0.0.1:2376',
                                             tls=tls_mock, version=None)
    def test_on_kill(self):
        client_mock = mock.Mock(spec=APIClient)

        operator = DockerOperator(image='ubuntu', owner='unittest', task_id='unittest')
        operator.cli = client_mock
        operator.container = {'Id': 'some_id'}

        operator.on_kill()

        client_mock.stop.assert_called_with('some_id')
Example #21
def dump_pool_file_to_arctic_subdag(name, pool_id, index_col, symbol_header,
                                    field_header, start, localhost_dir, network):
    # data is updated every morning at 4 AM (UTC+8)
    build_dir = "/build"
    data_op = DockerOperator(
        task_id=name,
        image="eugenepy/akira-data:latest",
        api_version="auto",
        command="python -m akira_data variable-task update-pool -i {{ params.pool_id }} -s " +
                "{{ params.start }} -e {{ ds_nodash }} -l investingdotcom save " +
                "--filename {{ params.build_dir }}/{{ params.pool_id }}.{{ ds_nodash }}.csv",
        params={"start": start, "pool_id": pool_id,
                "build_dir": build_dir},
        volumes=[f"{localhost_dir}:{build_dir}"],  # mounted from the Airflow host
        network_mode=network,  # network that can reach MongoDB
        docker_url="tcp://socat:2375",
        auto_remove=True)
    return data_op
Example #22
def caom_commands(artifact, **kwargs):
    uri_list = "{{ task_instance.xcom_pull(task_ids='get_observations') }}"
    # return PythonOperator(python_callable=do_that, provide_context=True,
    #                       task_id='meta_{}'.format(artifact),
    #                       dag=poc_dag, op_kwargs={'artifact': artifact})

    # file not found error
    # x = DockerOperator(docker_url='unix:///var/run/docker.sock',
    # connection refused
    # x = DockerOperator(docker_url='tcp://localhost:2375',
    # connection refused
    x = DockerOperator(docker_url='tcp://localhost:2376',
                       command='omm_run {}'.format(artifact),
                       image='opencadc/omm2caom2',
                       network_mode='bridge',
                       task_id='meta_{}'.format(artifact),
                       docker_conn_id='my_docker',
                       dag=poc_dag)
    return x
Example #23
def _create_task(task_id, dag, image, command, environment):
    env = {
        'WAREHOUSE_URL': get_postgres_uri('warehouse_db'),
        'DATABASE_URL': get_postgres_uri('api_db'),
        'EXPLORERDB_URL': get_postgres_uri('explorer_db'),
        'PYTHON_ENV': airflow.models.Variable.get('ENV'),
        'LOGGING_URL': airflow.models.Variable.get('LOGGING_URL'),
        'DOWNLOAD_DELAY': airflow.models.Variable.get('DOWNLOAD_DELAY'),
    }
    env.update(environment)
    docker_api_version = os.environ.get('DOCKER_API_VERSION', '1.23')

    return DockerOperator(
        task_id=task_id,
        dag=dag,
        image=image,
        command=command,
        environment=env,
        api_version=docker_api_version,
        force_pull=True,
    )
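
A hedged example call (task, image, and command values are placeholders; environment keys are merged over the shared env built above):

collect_task = _create_task(
    task_id='collect_fda',
    dag=dag,
    image='opentrials/collectors:latest',  # placeholder image
    command='make start fda',              # placeholder command
    environment={'BATCH_SIZE': '100'},     # overlays the shared env vars
)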
Example #24
    def test_execute_xcom_behavior(self, client_class_mock, tempdir_mock):
        tempdir_mock.return_value.__enter__.return_value = '/mkdtemp'

        client_mock = mock.Mock(spec=APIClient)
        client_mock.images.return_value = []
        client_mock.create_container.return_value = {'Id': 'some_id'}
        client_mock.attach.return_value = ['container log']
        client_mock.pull.return_value = [b'{"status":"pull log"}']
        client_mock.wait.return_value = {"StatusCode": 0}

        client_class_mock.return_value = client_mock

        kwargs = {
            'api_version': '1.19',
            'command': 'env',
            'environment': {
                'UNIT': 'TEST'
            },
            'image': 'ubuntu:latest',
            'network_mode': 'bridge',
            'owner': 'unittest',
            'task_id': 'unittest',
            'volumes': ['/host/path:/container/path'],
            'working_dir': '/container/path',
            'shm_size': 1000,
            'host_tmp_dir': '/host/airflow',
            'container_name': 'test_container',
            'tty': True,
        }

        xcom_push_operator = DockerOperator(**kwargs, do_xcom_push=True)
        no_xcom_push_operator = DockerOperator(**kwargs, do_xcom_push=False)

        xcom_push_result = xcom_push_operator.execute(None)
        no_xcom_push_result = no_xcom_push_operator.execute(None)

        self.assertEqual(xcom_push_result, b'container log')
        self.assertIs(no_xcom_push_result, None)
Example #25
    dag_id="atd_knack_signal_work_orders",
    default_args=default_args,
    schedule_interval="50 8 * * *",
    dagrun_timeout=timedelta(minutes=60),
    tags=["production", "knack"],
    catchup=False,
) as dag:
    # completely replace data on 15th day of every month
    # this is a failsafe catch records that may have been missed via incremental loading
    date_filter = "{{ '1970-01-01' if ds.endswith('15') else prev_execution_date_success or '1970-01-01' }}"  # noqa:E501
    t1 = DockerOperator(
        task_id="atd_knack_traffic_signal_work_orders_to_postgrest",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command=f'./atd-knack-services/services/{script_task_1}.py -a {app_name} -c {container} -d "{date_filter}"',  # noqa:E501
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=env_vars,
        tty=True,
    )

    t2 = DockerOperator(
        task_id="atd_knack_traffic_signal_work_orders_to_socrata",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command=f'./atd-knack-services/services/{script_task_2}.py -a {app_name} -c {container} -d "{date_filter}"',  # noqa
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=env_vars,
    )

with DAG(
        f"atd_mds_{mds_provider}_staging",
        default_args=default_args,
        schedule_interval="15 * * * *",
        catchup=False,
        tags=["staging", "mds"],
) as dag:
    #
    # Task: provider_extract
    # Description: Given a schedule block, the script extracts data from the MDS provider within the schedule's time window
    # then it uploads the data into S3 for further processing.
    #
    t1 = DockerOperator(
        task_id='provider_extract',
        image=docker_image,
        api_version='auto',
        auto_remove=True,
        command=
        f"./provider_extract.py --provider '{mds_provider}' --time-max '{time_max}' --interval 1",
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=environment_vars)

    #
    # Task: provider_sync_db
    # Description: Downloads the extracted MDS data from S3, and inserts each trip into a postgres database.
    #
    t2 = DockerOperator(
        task_id='provider_sync_db',
        image=docker_image,
        api_version='auto',
        auto_remove=True,
        command=
Example #27
* Moves WARCs from warcprox into the right place in the /heritrix/output folders.
* **TBA** 'Closes' WARCs that are .open, if they are older than a few days.

Configuration:

* The tasks are configured to scan `/mnt/gluster/fc`.
* The push gateway is configured to be `{c.push_gateway}`.

How to check it's working:

* Task Instance logs show how many WARCs were moved.
* Prometheus updated via Push Gateway with `ukwa_files_moved_total_count{{kind='warcprox-warcs'}}` counts.
    * Look for job results in [the push gateway configured for this task](http://{c.push_gateway}).
    * For example results from Prometheus in production, see [here](http://monitor-prometheus.api.wa.bl.uk/graph?g0.expr=ukwa_files_moved_total_count{{kind='warcprox-warc'}}&g0.tab=0&g0.stacked=0&g0.range_input=4w).

"""

    tidy = DockerOperator(
        task_id='move-warcprox-warcs',
        image=c.ukwa_task_image,
        command='store -v warctidy',
        user=0,  # Run as root due to file permissions
        volumes=['/mnt/gluster/fc:/mnt/gluster/fc'],
        environment={
            'PUSH_GATEWAY': c.push_gateway,
        },
        tty=True,  # <-- So we see logging
        do_xcom_push=False,
    )
Example #28
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'ByName',
    default_args=default_args,
    description='Filter by name dag',
    schedule_interval='@daily',
)

t1 = DockerOperator(task_id='DockerOperator',
                    image='faizan-k_devchallenge',
                    api_version='auto',
                    auto_remove=True,
                    command='byname -n "{}"'.format(beer_name),
                    docker_url="unix://var/run/docker.sock",
                    network_mode="bridge",
                    xcom_push=True,
                    dag=dag)


def perform_calculation(**context):
    output = json.loads(context['ti'].xcom_pull(task_ids='DockerOperator'))
    avg_ibu_ibv = json.dumps({
        "avg_ibu":
        sum([i['ibu'] for i in output]) / (len(output) or 1),
        "avg_abv":
        sum([i['abv'] for i in output]) / (len(output) or 1)
    })
    context['ti'].xcom_push(key="AVG_IBU_ABV", value=avg_ibu_ibv)
}

with DAG('fix_s3_recording_url_pipeline',
         default_args=default_args,
         schedule_interval='*/10 * * * *',
         catchup=False) as dag:

    t1 = BashOperator(
        task_id='login_aws',
        bash_command=
        '$(aws ecr get-login --region eu-west-1 --no-include-email)')

    t2 = DockerOperator(
        task_id='fix_s3_recording_url_pipeline',
        auto_remove=True,
        image=IMAGE_NAME,
        api_version='auto',
        command=COMMAND,
        docker_url='unix://var/run/docker.sock',
        network_mode='host',
        environment={
            'DATABASE_HOST': DATABASE_HOST,
            'ELASTICSEARCH_URL': ELASTICSEARCH_URL,
            'DYNAMODB_HOST': DYNAMODB_HOST,
        },
        volumes=[LOG_DIRECTORY, BOTO_CREDENTIAL],
        force_pull=True,
    )

    t2.set_upstream(t1)
Example #30
    'on_failure_callback': send_alert_task_failure_to_slack
}

with DAG('sync_country_from_zendesk_pipeline',
         default_args=default_args,
         schedule_interval="0 0 * * *",
         catchup=False) as dag:

    t1 = BashOperator(
        task_id='login_aws',
        bash_command=
        '$(aws ecr get-login --region eu-west-1 --no-include-email)')

    t2 = DockerOperator(
        task_id='sync_country_from_zendesk_pipeline',
        auto_remove=True,
        image=IMAGE_NAME,
        api_version='auto',
        command=COMMAND,
        docker_url='unix://var/run/docker.sock',
        network_mode='host',
        environment={
            'DATABASE_HOST': DATABASE_HOST,
            'ELASTICSEARCH_URL': ELASTICSEARCH_URL,
            'DYNAMODB_HOST': DYNAMODB_HOST
        },
        volumes=[LOG_DIRECTORY],
        force_pull=True,
    )

    t2.set_upstream(t1)
Example #31
         default_args=default_args,
         schedule_interval='*/15 * * * *',
         catchup=False) as dag:

    t1 = BashOperator(
        task_id='print_start_time',
        bash_command='echo `date "+%Y-%m-%d %H:%M:%S"` "- Airflow Task Started"'
    )

    t2 = DockerOperator(task_id='docker_command',
                        image='entechlog/weather-alert-app:latest',
                        api_version='auto',
                        auto_remove=True,
                        docker_url="unix://var/run/docker.sock",
                        network_mode="weatheralertapp_default",
                        environment={
                            'bootstrap_servers': "broker:9092",
                            'schema_registry_url':
                            "http://schema-registry:8081",
                            'topic_name': "weather.alert.app.source",
                            'lat': "8.270272",
                            'lon': "77.177274",
                            'OPEN_WEATHER_API_KEY': ""
                        })

    t3 = BashOperator(
        task_id='print_end_time',
        bash_command='echo `date "+%Y-%m-%d %H:%M:%S"` "- Airflow Task Finished"'
    )

    t1 >> t2 >> t3
env_vars["SOCRATA_API_KEY_ID"] = Variable.get("atd_service_bot_socrata_api_key_id")
env_vars["SOCRATA_API_KEY_SECRET"] = Variable.get(
    "atd_service_bot_socrata_api_key_secret"
)
env_vars["SOCRATA_APP_TOKEN"] = Variable.get("atd_service_bot_socrata_app_token")

with DAG(
    dag_id="atd_kits_sig_stat_pub",
    default_args=default_args,
    schedule_interval="*/5 * * * *",
    dagrun_timeout=timedelta(minutes=60),
    tags=["production", "socrata", "kits"],
    catchup=False,
) as dag:
    t1 = DockerOperator(
        task_id="atd_kits_sig_status_to_socrata",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command="./atd-kits/atd-kits/signal_status_publisher.py",
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=env_vars,
        tty=True,
    )

    t1

if __name__ == "__main__":
    dag.cli()
Example #33
        default_args=default_args,
        schedule_interval="0 8 * * *",
        catchup=False,
        tags=["production", "visionzero"],
) as dag:
        #
        # Task: docker_command_crashes
        # Description: Imports a raw CSV file with crash records into our database via GraphQL/Hasura.
        #
        crash = DockerOperator(
                task_id='docker_command_crashes',
                image='atddocker/atd-vz-etl:production',
                api_version='auto',
                auto_remove=True,
                command="/app/process_hasura_import.py crash",
                docker_url="tcp://localhost:2376",
                network_mode="bridge",
                environment=atd_visionzero_cris_envvars,
                volumes=[
                        atd_visionzero_cris_volumes["ATD_VOLUME_DATA"],
                        atd_visionzero_cris_volumes["ATD_VOLUME_TEMP"],
                ],
        )

        #
        # Task: docker_command_unit
        # Description: Imports a raw CSV file with unit records into our database via GraphQL/Hasura.
        #
        unit = DockerOperator(
                task_id='docker_command_unit',
                image='atddocker/atd-vz-etl:production',
                api_version='auto',
Example #34
    'dataflow_default_options': {
        'project': os.environ['GCP_PROJECT']
    }
}

dag = DAG('firearm_seizures',
          default_args=default_args,
          schedule_interval='@monthly')

gcs_load = DockerOperator(
    task_id='firearms_gcs_docker',
    image='gcr.io/data-rivers/pgh-firearms',
    api_version='auto',
    auto_remove=True,
    environment={
        'APRS_UN': os.environ['APRS_UN'],
        'APRS_PW': os.environ['APRS_PW'],
        'GCS_AUTH_FILE':
        '/root/firearm-seizures-report/data-rivers-service-acct.json',
        'GCS_PREFIX': os.environ['GCS_PREFIX']
    },
    dag=dag)

# dataflow_task = DataFlowPythonOperator(
#     task_id='firearms_dataflow',
#     job_name='firearms-dataflow',
#     py_file=os.getcwd() + '/airflow_scripts/dags/dependencies/dataflow_scripts/firearms_dataflow.py'),
#     dag=dag
# )

dataflow_task = BashOperator(
Example #35
from airflow.operators.docker_operator import DockerOperator

default_args = {
    'owner': 'airflowMAT',
    'description': 'Use of the DockerOperator',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 3),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('docker_dag',
         default_args=default_args,
         schedule_interval="5 * * * *",
         catchup=False) as dag:
    t1 = BashOperator(task_id='print_current_date', bash_command='date')

    t2 = DockerOperator(task_id='docker_command',
                        image='openjdk:latest',
                        api_version='auto',
                        auto_remove=True,
                        command="/bin/sleep 30",
                        docker_url="unix://var/run/docker.sock",
                        network_mode="bridge")

    t3 = BashOperator(task_id='print_hello', bash_command='echo "hello world"')

    t1 >> t2 >> t3
Example #36
    'retry_delay': timedelta(minutes=15)
}

with DAG('bag_coronavirus',
         default_args=default_args,
         schedule_interval="15 * * * *",
         catchup=False) as dag:
    dag.doc_md = __doc__
    upload_bag_datasets = DockerOperator(
        task_id='upload_bag_datasets',
        image='bag_coronavirus:latest',
        api_version='auto',
        auto_remove=True,
        command=
        '/bin/bash /code/data-processing/bag_coronavirus/etl_bag_datasets.sh ',
        container_name='bag_coronavirus--upload_bag_datasets',
        docker_url="unix://var/run/docker.sock",
        network_mode="bridge",
        tty=True,
        volumes=[
            '/data/dev/workspace/data-processing:/code/data-processing',
            '/mnt/OGD-DataExch/StatA/BAG_Coronavirus_Tests:/code/data-processing/bag_coronavirus/data'
        ])

    upload_vmdl = DockerOperator(
        task_id='upload_vmdl',
        image='bag_coronavirus:latest',
        api_version='auto',
        auto_remove=True,
        command='/bin/bash /code/data-processing/bag_coronavirus/etl_vmdl.sh ',
        container_name='bag_coronavirus--upload_vmdl',
Example #37
fda_linker_task = SubDagOperator(
    dag=dag,
    subdag=fda_dap(parent_dag_name='fda',
                   child_dag_name='linker',
                   start_date=dag.start_date,
                   schedule_interval=dag.schedule_interval),
    task_id='linker',
)

remove_unknown_documentcloud_docs_task = DockerOperator(
    task_id='remove_unknown_documentcloud_docs',
    dag=dag,
    image='opentrials/processors:latest',
    force_pull=True,
    api_version='1.23',
    environment={
        'WAREHOUSE_URL': helpers.get_postgres_uri('warehouse_db'),
        'DATABASE_URL': helpers.get_postgres_uri('api_db'),
        'EXPLORERDB_URL': helpers.get_postgres_uri('explorer_db'),
        'LOGGING_URL': Variable.get('LOGGING_URL'),
        'DOCUMENTCLOUD_USERNAME': Variable.get('DOCUMENTCLOUD_USERNAME'),
        'DOCUMENTCLOUD_PASSWORD': Variable.get('DOCUMENTCLOUD_PASSWORD'),
        'DOCUMENTCLOUD_PROJECT': Variable.get('DOCUMENTCLOUD_PROJECT'),
        'FERNET_KEY': os.environ['FERNET_KEY'],
    },
    command='make start remove_unknown_documentcloud_docs')

remove_unknown_documentcloud_docs_task.set_upstream(fda_linker_task)
fda_linker_task.set_upstream(fda_dap_task)
Example #38
dag = DAG(
    'docker_sample', default_args=default_args, schedule_interval=timedelta(minutes=10))

t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)

t3 = DockerOperator(api_version='1.19',
                    docker_url='tcp://localhost:2375',  # set your Docker URL
                    command='/bin/sleep 30',
                    image='centos:latest',
                    network_mode='bridge',
                    task_id='docker_op_tester',
                    dag=dag)


t4 = BashOperator(
    task_id='print_hello',
    bash_command='echo "hello world!!!"',
    dag=dag)


t1.set_downstream(t2)
t1.set_downstream(t3)
t3.set_downstream(t4)