Example #1
def create_dag(data_marker_config, default_args):
    dag = DAG(
        dag_id="data_marker_pipeline",
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_args,
        start_date=YESTERDAY,
    )

    with dag:
        before_start = PythonOperator(
            task_id="data_marking_start",
            python_callable=data_marking_start,
            op_kwargs={},
        )

        before_start

        for source in data_marker_config.keys():
            filter_by_config = data_marker_config.get(source)
            language = filter_by_config.get("language").lower()
            print(f"Language for source is {language}")
            data_marker_task = kubernetes_pod_operator.KubernetesPodOperator(
                task_id=f"data-marker-{source}",
                name="data-marker",
                cmds=[
                    "python",
                    "invocation_script.py",
                    "-b",
                    bucket_name,
                    "-a",
                    "data_marking",
                    "-rc",
                    f"data/audiotospeech/config/config.yaml",
                    "-as",
                    source,
                    "-fb",
                    json.dumps(filter_by_config),
                    "-l",
                    language,
                ],
                namespace=composer_namespace,
                startup_timeout_seconds=300,
                secrets=[secret_file],
                image=f"us.gcr.io/ekstepspeechrecognition/ekstep_data_pipelines:{env_name}_1.0.0",
                image_pull_policy="Always",
            )

            before_start >> data_marker_task

    return dag
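
# Usage sketch (an assumption, not part of the original example): dynamic DAG
# factories such as create_dag() above are normally called at module import
# time and the result registered in globals() so the scheduler can discover
# it. The Variable name "data_marker_config" and the default_args values are
# hypothetical.
import json

from airflow.models import Variable

data_marker_config = json.loads(Variable.get("data_marker_config"))
default_args = {"owner": "airflow", "retries": 1}
globals()["data_marker_pipeline"] = create_dag(data_marker_config, default_args)
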
def gen_kubernets_operator_python(input_path, output_path, exec_file_name,
                                  requirement_file_path, task_name):
    return kubernetes_pod_operator.KubernetesPodOperator(
        task_id=task_name,
        name=task_name,
        namespace='airflowsys',
        in_cluster=True,
        env_vars={
            'INPUT_PATH': str(input_path),
            'OUTPUT_PATH': output_path,
            'EXEC_FILE_NAME': str(exec_file_name),
            'REQUIREMENT_FILE_PATH': str(requirement_file_path)
        },
        is_delete_operator_pod=True,
        image="reg.sfai.ml/library/airflow/python:3.7-slim-buster")
def create_dag(dag_id, dag_number, default_args, args, batch_count):
    dag = DAG(
        dag_id,
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_args,
        start_date=YESTERDAY,
    )

    with dag:

        audio_format = args.get("audio_format")
        language = args.get("language")
        print(args)
        print(f"Language for source is {language}")
        source_path_for_snr_set = interpolate_language_paths(language)

        get_file_path_from_gcp_bucket = PythonOperator(
            task_id=dag_id + "_get_file_path",
            python_callable=get_file_path_from_bucket,
            op_kwargs={
                "source": dag_id,
                "source_landing_path": source_path_for_snr_set,
                "batch_count": batch_count,
                "audio_format": audio_format,
                "meta_file_extension": meta_file_extension,
                "bucket_name": bucket_name,
            },
            dag_number=dag_number,
        )

        get_file_path_from_gcp_bucket

        parallelism = args.get("parallelism")

        file_path_list = json.loads(Variable.get("audiofilelist"))[dag_id]

        if len(file_path_list) > 0:

            chunk_size = math.ceil(len(file_path_list) / parallelism)
            batches = [
                file_path_list[i:i + chunk_size]
                for i in range(0, len(file_path_list), chunk_size)
            ]
            data_prep_cataloguer = kubernetes_pod_operator.KubernetesPodOperator(
                task_id="data-catalogure",
                name="data-catalogure",
                cmds=[
                    "python",
                    "invocation_script.py",
                    "-b",
                    bucket_name,
                    "-a",
                    "audio_cataloguer",
                    "-rc",
                    f"data/audiotospeech/config/config.yaml",
                ],
                namespace=composer_namespace,
                startup_timeout_seconds=300,
                secrets=[secret_file],
                image=f"us.gcr.io/ekstepspeechrecognition/ekstep_data_pipelines:{env_name}_1.0.0",
                image_pull_policy="Always",
                resources=resource_limits,
            )

        else:
            batches = []

        for batch_file_path_list in batches:
            data_prep_task = kubernetes_pod_operator.KubernetesPodOperator(
                task_id=dag_id + "_data_snr_" + batch_file_path_list[0],
                name="data-prep-snr",
                cmds=[
                    "python",
                    "invocation_script.py",
                    "-b",
                    bucket_name,
                    "-a",
                    "audio_processing",
                    "-rc",
                    f"data/audiotospeech/config/config.yaml",
                    "-fl",
                    ",".join(batch_file_path_list),
                    "-af",
                    args.get("audio_format"),
                    "-as",
                    dag_id,
                    "-l",
                    language,
                ],
                namespace=composer_namespace,
                startup_timeout_seconds=300,
                secrets=[secret_file],
                image=f"us.gcr.io/ekstepspeechrecognition/ekstep_data_pipelines:{env_name}_1.0.0",
                image_pull_policy="Always",
                resources=resource_limits,
            )

            get_file_path_from_gcp_bucket >> data_prep_task >> data_prep_cataloguer

    return dag
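
# Registration sketch (an assumption about how this factory is wired up): one
# DAG per audio source can be generated by looping over a config mapping and
# publishing each result in globals(). The Variable name "source_config", its
# structure, and the batch_count value are hypothetical.
from airflow.models import Variable

source_config = json.loads(Variable.get("source_config"))
for number, (source, source_args) in enumerate(source_config.items()):
    globals()[source] = create_dag(source, number, default_args,
                                   source_args, batch_count=5)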

    # [START composer_kubernetespodoperator_minconfig]
    kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
        # The ID specified for the task.
        task_id='pod-ex-minimum',
        # Name of task you want to run, used to generate Pod ID.
        name='pod-ex-minimum',
        # Entrypoint of the container, if not specified the Docker container's
        # entrypoint is used. The cmds parameter is templated.
        cmds=['echo'],
        # The namespace to run within Kubernetes, default namespace is
        # `default`. There is the potential for the resource starvation of
        # Airflow workers and scheduler within the Cloud Composer environment,
        # the recommended solution is to increase the amount of nodes in order
        # to satisfy the computing requirements. Alternatively, launching pods
        # into a custom namespace will stop fighting over resources.
        namespace='default',
        # Docker image specified. Defaults to hub.docker.com, but any fully
        # qualified URLs will point to a custom repository. Supports private
        # gcr.io images if the Composer Environment is under the same
        # project-id as the gcr.io images and the service account that Composer
        # uses has permission to access the Google Container Registry
        # (the default service account has permission)
        image='gcr.io/gcp-runtimes/ubuntu_18_0_4')
    # [END composer_kubernetespodoperator_minconfig]
    # [START composer_kubernetespodoperator_templateconfig]
    kubenetes_template_ex = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='ex-kube-templates',
        name='ex-kube-templates',
    dag_id="kube-pod-operator-cluster",
    start_date=airflow.utils.dates.days_ago(2),
    schedule_interval="@daily",
)

start_kube_process = BashOperator(
    task_id="start_kube_process",
    bash_command="echo upload to s3",
    dag=dag,
)

in_cluster = True
kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='pod-ex-minimum',
    name='pod-ex-minimum',
    cmds=['echo'],
    namespace='default',
    image='ubuntu:latest',
    in_cluster=in_cluster,
    executor_config={"LocalExecutor": {}})

run_another_pod = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='run-another-pod',
    name='run-another-pod',
    cmds=['echo'],
    namespace='default',
    image='ubuntu:latest',
    in_cluster=in_cluster,
    executor_config={"LocalExecutor": {}})

start_kube_process >> kubernetes_min_pod >> run_another_pod
# If a Pod fails to launch, or has an error occur in the container, Airflow
# will show the task as failed, as well as contain all of the task logs
# required to debug.
with models.DAG(
        dag_id="data_prep_cataloguer_pipeline",
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_args,
        start_date=YESTERDAY,
) as dag:
    kubernetes_list_bucket_pod = kubernetes_pod_operator.KubernetesPodOperator(
        task_id="data-normalizer",
        name="data-normalizer",
        cmds=[
            "python",
            "invocation_script.py",
            "-b",
            bucket_name,
            "-a",
            "audio_cataloguer",
            "-rc",
            f"data/audiotospeech/config/config.yaml",
        ],
        namespace=composer_namespace,
        startup_timeout_seconds=300,
        secrets=[secret_file],
        image=f"us.gcr.io/ekstepspeechrecognition/ekstep_data_pipelines:{env_name}_1.0.0",
        image_pull_policy="Always",
    )

    # [START composer_kubernetespodoperator_minconfig]
    kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
        # The ID specified for the task.
        task_id='pod-ex-minimum',
        # Name of task you want to run, used to generate Pod ID.
        name='pod-ex-minimum',
        # Entrypoint of the container, if not specified the Docker container's
        # entrypoint is used. The cmds parameter is templated.
        cmds=["bash", "-cx"],
        arguments=["echo", "10"],
        # The namespace to run within Kubernetes, default namespace is
        # `default`. There is the potential for the resource starvation of
        # Airflow workers and scheduler within the Cloud Composer environment,
        # the recommended solution is to increase the amount of nodes in order
        # to satisfy the computing requirements. Alternatively, launching pods
        # into a custom namespace will stop fighting over resources.
        namespace='default',
        # Docker image specified. Defaults to hub.docker.com, but any fully
        # qualified URLs will point to a custom repository. Supports private
        # gcr.io images if the Composer Environment is under the same
        # project-id as the gcr.io images and the service account that Composer
        # uses has permission to access the Google Container Registry
        # (the default service account has permission)
        is_delete_operator_pod=True,
        image_pull_policy="Always",
        get_logs=True,
        image='ubuntu:16.04')
    # [END composer_kubernetespodoperator_minconfig]

# [END composer_kubernetespodoperator]
    bash_command="echo upload to s3",
    dag=dag,
)

from kubernetes.client import models as k8s

kube_files_volume = k8s.V1Volume(
    name='kube-files-volume',
    config_map=k8s.V1ConfigMapVolumeSource(name='minikube'))
kube_files_volume_mount = k8s.V1VolumeMount(name='kube-files-volume',
                                            mount_path='/tmp/k8s')
in_cluster = False
kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='pod-ex-minimum',
    name='pod-ex-minimum',
    cmds=['echo'],
    namespace='default',
    image='ubuntu:latest',
    in_cluster=in_cluster,
    config_file='/tmp/k8s/minikube/config',
    volumes=[kube_files_volume],
    volume_mounts=[kube_files_volume_mount])

run_another_pod = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='run-another-pod',
    name='run-another-pod',
    cmds=['echo'],
    namespace='default',
    image='ubuntu:latest',
    in_cluster=in_cluster,
    config_file='/tmp/k8s/minikube/config')

start_kube_process >> kubernetes_min_pod >> run_another_pod
    "name": "EOD_DATE",
    "value": "{{ ds }}"
}]

k8s_spark_launcher = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='k8s-spark-launcher',
    name='k8s-spark-launcher',
    cmds=['/opt/spark/bin/spark-submit'],
    arguments=[
        '--master', 'k8s://https://kubernetes.default.svc.cluster.local:443',
        '--deploy-mode', 'cluster', '--name', 'hello', '--class', 'demo.Count',
        '--conf', 'spark.executor.instances=2', '--conf',
        'spark.kubernetes.executor.request.cores=0.4', '--conf',
        'spark.kubernetes.driver.request.cores=0.2', '--conf',
        'spark.kubernetes.container.image=skhatri/spark-k8s-hello:1.0.7',
        '--conf', 'spark.jars.ivy=/tmp/.ivy', '--conf', 'spark.app.name=hello',
        '--conf',
        'spark.kubernetes.authenticate.driver.serviceAccountName=job-trigger-sa',
        '--conf', 'spark.authenticate=false',
        'local:///tmp/jars/spark-k8s-hello.jar'
    ],
    namespace='default',
    image='skhatri/spark:v3.0.1',
    in_cluster=in_cluster,
    env_vars=spark_env_vars,
    service_account_name='job-trigger-sa',
    executor_config={"LocalExecutor": {}})

k8s_pod = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='k8s-pod',
    name='k8s-pod',
    "retry_delay": timedelta(minutes=5)
}

slack_token = BaseHook.get_connection("slack_conn").password

with DAG(dag_id="not_so_dangerous_dag",
         schedule_interval="@daily",
         start_date=datetime(2020, 11, 1),
         default_args=default_args,
         catchup=True) as dag:
    tasks = [
        kubernetes_pod_operator.KubernetesPodOperator(
            # The ID specified for the task.
            task_id=f'kubernetes_task_{i}',
            # Name of task you want to run, used to generate Pod ID.
            name=f'kubernetes_task_{i}',
            image='busybox',
            cmds=['sh', '-c', 'echo "Hello, Kubernetes!" && sleep 30'],
            in_cluster=True,
            is_delete_operator_pod=True,
            namespace='default') for i in range(10)
    ]

    sending_slack_notification = SlackWebhookOperator(
        task_id='sending_slack',
        http_conn_id='slack_conn',
        webhook_token=slack_token,
        message="Esta todo bien desde Kubernetes! \n Ahora toma un gatito! "
        "https://www.youtube.com/watch?v=J---aiyznGQ",
        username='******',
        icon_url='https://raw.githubusercontent.com/apache/'
        'airflow/master/airflow/www/static/pin_100.png',
from airflow import models
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators import kubernetes_pod_operator
from airflow.utils.dates import days_ago

with models.DAG(
    "example_pod_operator_xcom",
    schedule_interval=None,  # Override to match your needs
    start_date=days_ago(1),
    tags=['example'],
) as dag:

    # [START howto_operator_gke_start_pod_xcom]
    pod_task_xcom = kubernetes_pod_operator.KubernetesPodOperator(
        task_id="pod_task_xcom",
        do_xcom_push=True,
        namespace="default",
        image="alpine",
        cmds=["sh", "-c", 'mkdir -p /airflow/xcom/;echo \'[1,2,3,4]\' > /airflow/xcom/return.json'],
        name="test-pod-xcom",
    )
    # [END howto_operator_gke_start_pod_xcom]

    # [START howto_operator_gke_xcom_result]
    pod_task_xcom_result = BashOperator(
        bash_command="echo \"{{ task_instance.xcom_pull('pod_task_xcom')[0] }}\"",
        task_id="pod_task_xcom_result",
    )
    # [END howto_operator_gke_xcom_result]


    pod_task_xcom >> pod_task_xcom_result
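
    # A hedged follow-up sketch (not in the original example): the same XCom
    # value can also be consumed from Python. In contrib-era Airflow this
    # needs provide_context=True so the task instance is available in kwargs.
    from airflow.operators.python_operator import PythonOperator

    def print_xcom_list(**kwargs):
        # Pulls the full [1, 2, 3, 4] list written by the pod's XCom sidecar.
        print(kwargs["ti"].xcom_pull(task_ids="pod_task_xcom"))

    pod_task_xcom_python_result = PythonOperator(
        task_id="pod_task_xcom_python_result",
        python_callable=print_xcom_list,
        provide_context=True,
    )

    pod_task_xcom >> pod_task_xcom_python_result
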
Example #12
with DAG(dag_name,
         default_args=default_args,
         schedule_interval=datetime.timedelta(days=1)) as dag:

    start = BashOperator(
        task_id='start',
        bash_command='date',
    )

    alphavantage_path = dag_name + '/alphavantage.json'
    alphavantage = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='alphavantage',
        name='alphavantage',
        namespace=namespace,
        image=image,
        arguments=[
            "from pipeline_abstractions import *; AlphaVantageData(BUCKET_PATH_PUSH='{}')"
            .format(alphavantage_path)
        ],
        startup_timeout_seconds=900,
        is_delete_operator_pod=True)

    fred_path = dag_name + '/fred.json'
    fred = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='fred',
        name='fred',
        namespace=namespace,
        image=image,
        arguments=[
            "from pipeline_abstractions import *; FRED(BUCKET_PATH_PUSH='{}')".
            format(fred_path)
Example #13
# -*- coding: utf-8 -*-
import datetime

from airflow import models
from airflow.contrib.kubernetes import secret
from airflow.contrib.operators import kubernetes_pod_operator
from utils.kubernetes import Tolerations, Affinity

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)

with models.DAG(dag_id='composer_kubernetes_pod_simple',
                schedule_interval=None,
                start_date=YESTERDAY) as dag:
    kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='pod-workshop-simple',
        name='pod-workshop-simple',
        cmds=['echo', '"Hello world"'],
        namespace='default',
        resources={
            'request_memory': '128Mi',
            'request_cpu': '500m',
            'limit_memory': '500Mi',
            'limit_cpu': 1
        },
        image='gcr.io/gcp-runtimes/ubuntu_18_0_4',
        tolerations=Tolerations.default,
        affinity=Affinity.memory_heavy)
Example #14
from airflow import DAG
from datetime import datetime, timedelta
from airflow.contrib.operators import kubernetes_pod_operator

default_args = {
    'owner': 'Damavis',
    'start_date': datetime(2020, 5, 5),
    'retries': 1,
    'retry_delay': timedelta(seconds=5)
}

with DAG('etl_dag',
         default_args=default_args,
         schedule_interval=None) as dag:

    extract_tranform = kubernetes_pod_operator.KubernetesPodOperator(
        namespace='airflow',
        image="python:3.7-slim",
        cmds=["echo"],
        arguments=["This can be the extract part of an ETL"],
        labels={"foo": "bar"},
        name="extract-tranform",
        task_id="extract-tranform",
        get_logs=True
    )

    extract_tranform
Example #15
configuring_init_container = k8s.V1Container(
    name='secret-resolver',
    image='ubuntu:latest',
    command=['sh', '-c', 'echo {"user": "******", "password": "******"}|tee /tmp/out/hello'],
    volume_mounts=[
            data_write_secrets_mount
    ],
)

data_read_for_secrets_mount=k8s.V1VolumeMount(name='data-shared-dir', mount_path='/tmp/secrets')

k8s_pod_with_init = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='k8s-pod-with-init',
    name='k8s-pod',
    cmds=['sh', '-c', 'cat /tmp/secrets/hello'],
    namespace='default',
    image='ubuntu:latest',
    in_cluster=in_cluster,
    config_file='/tmp/k8s/minikube/config',
    init_containers=[configuring_init_container],
    volumes=[data_share_volume],
    volume_mounts=[
        data_read_for_secrets_mount
    ]    
)



k8s_init_pod_dag_start >> k8s_pod_with_init >> k8s_init_pod_dag_finish

Example #16
kubernetes_full_pod = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='ex-all-configs',
    name='pi',
    namespace='airflow',
    image='perl',
    # Entrypoint of the container, if not specified the Docker container's
    # entrypoint is used. The cmds parameter is templated.
    cmds=['perl'],
    # Arguments to the entrypoint. The docker image's CMD is used if this
    # is not provided. The arguments parameter is templated.
    arguments=['-Mbignum=bpi', '-wle', 'print bpi(2000)'],
    # The secrets to pass to Pod, the Pod will fail to create if the
    # secrets you specify in a Secret object do not exist in Kubernetes.
    secrets=[],
    # Labels to apply to the Pod.
    labels={'pod-label': 'label-name'},
    # Timeout to start up the Pod, default is 120.
    startup_timeout_seconds=120,
    # The environment variables to be initialized in the container
    # env_vars are templated.
    env_vars={'EXAMPLE_VAR': '/example/value'},
    # If true, logs stdout output of container. Defaults to True.
    get_logs=True,
    # Determines when to pull a fresh image, if 'IfNotPresent' will cause
    # the Kubelet to skip pulling an image if it already exists. If you
    # want to always pull a new image, set it to 'Always'.
    image_pull_policy='Always',
    # Annotations are non-identifying metadata you can attach to the Pod.
    # Can be a large range of data, and can include characters that are not
    # permitted by labels.
    annotations={'key1': 'value1'},
    # Resource specifications for Pod, this will allow you to set both cpu
    # and memory limits and requirements.
    #resources=pod.Resources(),
    # Specifies path to kubernetes config. If no config is specified will
    # default to '~/.kube/config'. The config_file is templated.
    #config_file='/home/airflow/composer_kube_config',
    # If true, the content of /airflow/xcom/return.json from container will
    # also be pushed to an XCom when the container ends.
    xcom_push=False,
    in_cluster=True,
    # List of Volume objects to pass to the Pod.
    # List of VolumeMount objects to pass to the Pod.
    # Affinity determines which nodes the Pod can run on based on the
    # config. For more information see:
    # https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
    dag=dag)
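
# Sketch of the commented-out `resources` argument above, assuming the
# contrib-era API (airflow.contrib.kubernetes.pod.Resources, which takes
# request/limit keyword arguments); the concrete values are illustrative.
from airflow.contrib.kubernetes.pod import Resources

pod_res = Resources(request_memory='256Mi',
                    request_cpu='250m',
                    limit_memory='512Mi',
                    limit_cpu='500m')
# Passing resources=pod_res to the operator above would apply these
# requests/limits to the launched Pod.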
Example #17
import datetime

from airflow import models
from airflow.contrib.operators import kubernetes_pod_operator
from airflow.operators import dummy_operator
from airflow.contrib.kubernetes.volume import Volume
from airflow.contrib.kubernetes.volume_mount import VolumeMount

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)

volume_mount = VolumeMount('test-volume',
                           mount_path='/root/mount_file',
                           sub_path=None,
                           read_only=True)
volume_config = {'persistentVolumeClaim': {'claimName': 'test-volume'}}
volume = Volume(name='test-volume', configs=volume_config)

with models.DAG(dag_id='airflow-demo',
                schedule_interval=datetime.timedelta(days=1),
                start_date=YESTERDAY) as dag:

    task1 = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='t1',
        name='task1',
        namespace='airflow',
        image='eu.gcr.io/taiyo-239217/dag:fae4887',
        arguments=["AlphaVantage()"],
        volumes=[],
        volume_mounts=[],
        in_cluster=True,
        xcom_push=True,
        is_delete_operator_pod=True)
Example #18
)

t6 = BashOperator(
    task_id='t6',
    depends_on_past=False,
    bash_command='echo start t6',
    dag=dag,
)

t7 = BashOperator(
    task_id='t7',
    depends_on_past=False,
    bash_command='echo start t7',
    dag=dag,
)

t8 = kubernetes_pod_operator.KubernetesPodOperator(
    task_id="t8",
    name="podtest",
    cmds=['echo','hello'],
    namespace='default',
    in_cluster=True,
    image='gcr.io/gcp-runtimes/ubuntu_18_0_4',
    dag=dag,
)

t1 >> [t2, t3]
t2 >> t4
t3 >> [t5, t6]
[t6, t4] >> t7
[t5, t7] >> t8
Example #19
import datetime

from airflow import models
from airflow.contrib.kubernetes import secret
from airflow.contrib.operators import kubernetes_pod_operator
from airflow.contrib.kubernetes.pod import Resources

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)

with models.DAG(dag_id='demo_3',
                schedule_interval=datetime.timedelta(days=1),
                start_date=YESTERDAY) as dag:

    meetup_munge = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='meetupmunge',
        name='meetupmuge',
        cmds=['python', 'munge.py'],
        namespace='default',
        image='brandonwatts/rvade-meetupmunge:latest',
        image_pull_policy='Always',
        xcom_push=True,
        get_logs=True)

    pscreatewebsite = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='pscreatewebsite',
        name='pscreatewebsite',
        cmds=[
            'pwsh', './create_site.ps1',
            "{{ task_instance.xcom_pull(task_ids='meetupmunge', key='return_value')}}"
        ],
        namespace='default',
        image='brandonwatts/rvade-pscreatewebsite:latest',
        image_pull_policy='Always',
YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)

default_args = {
    'owner': 'Trigger',
    'depends_on_past': False,
    'email': [''],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
}

# If a Pod fails to launch, or has an error occur in the container, Airflow
# will show the task as failed, as well as contain all of the task logs
# required to debug.
with models.DAG(
        dag_id='k8s_pod_operator_grid',
        default_args=default_args,
        #schedule_interval=datetime.timedelta(days=1),
        start_date=YESTERDAY,
        schedule_interval=None,
) as dag:
    kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='pod-grid',
        name='pod-grid',
        namespace='default',
        image='rodriguesflavio/poc-pubsub-ok4')
    
    
    
        
Example #21
import datetime
from airflow import models
from airflow.contrib.operators import kubernetes_pod_operator
from airflow.operators import dummy_operator

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)

with models.DAG(dag_id='demo',
                schedule_interval=datetime.timedelta(days=1),
                start_date=YESTERDAY) as dag:

    task1 = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='t1',
        name='task1',
        namespace='airflow',
        image='eu.gcr.io/taiyo-239217/dag:fae4886',
        arguments=["AlphaVantage()"],
        in_cluster=True,
        xcom_push=True,
        is_delete_operator_pod=True)

    task2 = kubernetes_pod_operator.KubernetesPodOperator(
        task_id='t2',
        name='task2',
        namespace='airflow',
        image='eu.gcr.io/taiyo-239217/dag:fae4886',
        arguments=["FRED()"],
        in_cluster=True,
        xcom_push=True,
        is_delete_operator_pod=True)
Example #22
import datetime
from airflow import models
from airflow.contrib.operators import kubernetes_pod_operator
from airflow.contrib.kubernetes.volume import Volume
from airflow.contrib.kubernetes.volume_mount import VolumeMount

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)
dag = models.DAG(dag_id='k8s-pod-ex-minimum',
                 schedule_interval=datetime.timedelta(days=1),
                 start_date=YESTERDAY)

kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
    # The ID specified for the task.
    task_id='pod-ex-minimum',
    # Name of task you want to run, used to generate Pod ID.
    dag=dag,
    name='pod-ex-minimum',
    cmds=['echo'],
    config_file="/usr/local/airflow/etc/kube.config",
    namespace='airflow',
    image='ubuntu:16.04')
Example #23
with models.DAG(
        dag_id='composer_sample_kubernetes_pod',
        schedule_interval=datetime.timedelta(days=1),
        start_date=YESTERDAY) as dag:
    # Only name, namespace, image, and task_id are required to create a
    # KubernetesPodOperator. In Cloud Composer, currently the operator defaults
    # to using the config file found at `/home/airflow/composer_kube_config` if
    # no `config_file` parameter is specified. By default it will contain the
    # credentials for Cloud Composer's Google Kubernetes Engine cluster that is
    # created upon environment creation.

    kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
        # The ID specified for the task.
        task_id='pod-ex-minimum',
        # Name of task you want to run, used to generate Pod ID.
        name='pod-ex-minimum',
        # Entrypoint of the container, if not specified the Docker container's
        # entrypoint is used. The cmds parameter is templated.
        cmds=['echo'],
        # The namespace to run within Kubernetes, default namespace is
        # `default`. There is the potential for the resource starvation of
        # Airflow workers and scheduler within the Cloud Composer environment,
        # the recommended solution is to increase the amount of nodes in order
        # to satisfy the computing requirements. Alternatively, launching pods
        # into a custom namespace will stop fighting over resources.
        namespace='default',
        # Docker image specified. Defaults to hub.docker.com, but any fully
        # qualified URLs will point to a custom repository. Supports private
        # gcr.io images if the Composer Environment is under the same
        # project-id as the gcr.io images.
        image='gcr.io/gcp-runtimes/ubuntu_16_0_4')
    source_2_task = BashOperator(
        bash_command="echo \"Pretending to be source 2\"; sleep 20",
        task_id="source_2",
    )
    processing_tasks = []
    for i in range(1, 11):
        processing_tasks.append(
            kubernetes_pod_operator.KubernetesPodOperator(
                task_id='3Gi-task_{}'.format(i),
                name='3Gi-task-{}'.format(i),
                cmds=[
                    "sh", "-c",
                    'echo \'Sleeping..\'; sleep 120; echo \'Done!\''
                ],
                namespace='default',
                resources={
                    'request_memory': '3Gi',
                    'request_cpu': '200m',
                    'limit_memory': '3Gi',
                    'limit_cpu': 1
                },
                image='gcr.io/gcp-runtimes/ubuntu_18_0_4',
                tolerations=Tolerations.default,
                affinity=Affinity.memory_heavy,
                startup_timeout_seconds=300))

    start_task >> source_1_task >> start_processing
    start_task >> source_2_task >> start_processing
    for task in processing_tasks:
        start_processing >> task >> end_task
Example #25
def create_dag(dag_name, config_path):
    global default_args
    dag = DAG(dag_name, default_args=default_args, schedule_interval=None)
    with dag:

        start = BashOperator(
            task_id='start',
            bash_command='date',
        )

        alphavantage_path = dag_name + '/alphavantage.json'
        alphavantage = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='alphavantage',
            name='alphavantage',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; AlphaVantageData(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}')"
                .format(config_path, alphavantage_path)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        fred_path = dag_name + '/fred.json'
        fred = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='fred',
            name='fred',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; FRED(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}')"
                .format(config_path, fred_path)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        techfeatures_path = dag_name + '/techfeatures.json'
        techfeatures = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='techfeatures',
            name='techfeatures',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; TechFeatures(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}', BUCKET_PATH_PULL='{}')"
                .format(config_path, techfeatures_path, alphavantage_path)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        dataaggregation_path = dag_name + '/dataaggregation.json'
        dataaggregation = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='dataaggregation',
            name='dataaggregation',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; DataAggregation(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}', BUCKET_PATH_PULL_1='{}', BUCKET_PATH_PULL_2='{}')"
                .format(config_path, dataaggregation_path, techfeatures_path,
                        fred_path)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        inference_path = dag_name + '/inference.json'
        inference = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='inference',
            name='inference',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; Inference(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}', BUCKET_PATH_PULL='{}')"
                .format(config_path, inference_path, dataaggregation_path)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        uncertainitybounds_path = dag_name + '/uncertainitybounds.json'
        uncertainitybounds = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='uncertainitybounds',
            name='uncertainitybounds',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; UncertainityBounds(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}', BUCKET_PATH_PULL='{}')"
                .format(config_path, uncertainitybounds_path, inference_path)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        resultsgen_path = dag_name + '/resultsgen.json'
        resultsgen = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='resultsgen',
            name='resultsgen',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; ResultsGen(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}', BUCKET_PATH_PULL='{}')"
                .format(config_path, resultsgen_path, inference_path)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        tradecards_path = dag_name + '/tradecards.json'
        tradecards = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='tradecards',
            name='tradecards',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; TradeCards(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}', BUCKET_PATH_PULL_1='{}', BUCKET_PATH_PULL_2='{}', RunTime='{}')"
                .format(config_path, tradecards_path, uncertainitybounds_path,
                        resultsgen_path, run_time)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        mrm_path = dag_name + '/mrm.json'
        mrm = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='mrm',
            name='mrm',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; MRM(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}', BUCKET_PATH_PULL='{}', RunTime='{}')"
                .format(config_path, mrm_path, resultsgen_path, run_time)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        simtables_path = dag_name + '/simtables.json'
        dirtables_path = dag_name + '/dirtables.json'
        clsmatrix_path = dag_name + '/clsmatrix.json'
        simtables = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='simtables',
            name='simtables',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; SimTables(CONFIG_PATH='{}', BUCKET_PATH_PUSH_1='{}', BUCKET_PATH_PUSH_2='{}', BUCKET_PATH_PUSH_3='{}', BUCKET_PATH_PULL='{}', RunTime='{}')"
                .format(config_path, simtables_path, dirtables_path,
                        clsmatrix_path, resultsgen_path, run_time)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        mts_path = dag_name + '/mts.json'
        mts = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='mts',
            name='mts',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; MTS(CONFIG_PATH='{}', BUCKET_PATH_PUSH='{}', BUCKET_PATH_PULL='{}', RunTime='{}')"
                .format(config_path, mts_path, resultsgen_path, run_time)
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        publishpostgress = kubernetes_pod_operator.KubernetesPodOperator(
            task_id='publishpostgress',
            name='publishpostgress',
            namespace=namespace,
            image=image,
            arguments=[
                "from pipeline_abstractions import *; PublishPostgress()"
            ],
            startup_timeout_seconds=900,
            is_delete_operator_pod=True)

        end = BashOperator(
            task_id='end',
            bash_command='date',
        )

        start >> [fred, alphavantage]
        techfeatures << [fred, alphavantage]
        dataaggregation << techfeatures
        inference << dataaggregation
        [uncertainitybounds, resultsgen] << inference
        tradecards << [uncertainitybounds, resultsgen]
        mrm << resultsgen
        simtables << resultsgen
        mts << resultsgen
        publishpostgress << [tradecards, mrm, simtables, mts]
        end << publishpostgress

    return dag
Example #26
import datetime
from airflow import models
from airflow.contrib.operators import kubernetes_pod_operator

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)
dag = models.DAG(dag_id='a_pod-ex-minimum',
                 schedule_interval=datetime.timedelta(days=1),
                 start_date=YESTERDAY)

kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='pod-ex-minimum',
    dag=dag,
    name='pod-ex-minimum',
    cmds=['echo', 'hello'],
    #config_file="/var/airflow/secrets/kubeconfig/kube.config",
    namespace='airflow',
    image='ubuntu:16.04')
Example #27
    # setattr(pod_res, 'limit_cpu', None)
    # An instance of an operator is called a task. In this case, the
    # hello_python task calls the "greeting" Python function.
    scale_down = kubernetes_pod_operator.KubernetesPodOperator(
        # The ID specified for the task.
        task_id='node-scale_down',
        # Name of task you want to run, used to generate Pod ID.
        name='scale-down',
        # resources=pod_res,
        # Entrypoint of the container, if not specified the Docker container's
        # entrypoint is used. The cmds parameter is templated.
        cmds=["echo", "I am here to scale down"],
        resources=pod_res,
        # The namespace to run within Kubernetes, default namespace is
        # `default`. There is the potential for the resource starvation of
        # Airflow workers and scheduler within the Cloud Composer environment,
        # the recommended solution is to increase the amount of nodes in order
        # to satisfy the computing requirements. Alternatively, launching pods
        # into a custom namespace will stop fighting over resources.
        namespace='bs4-app',
        is_delete_operator_pod=True,
        affinity=affinity_values,
        config_file='/home/airflow/composer_kube_config',
        # Docker image specified. Defaults to hub.docker.com, but any fully
        # qualified URLs will point to a custom repository. Supports private
        # gcr.io images if the Composer Environment is under the same
        # project-id as the gcr.io images.
        image='alpine:latest')

    scale_down
Example #28
    "email_on_retry": False,
    "email": "*****@*****.**",
    "retries": 1,
    "retry_delay": timedelta(minutes=5)
}

slack_token = BaseHook.get_connection("slack_conn").password

with DAG(dag_id="kubernetes_dag", schedule_interval="*/5 * * * *",
         default_args=default_args, catchup=False) as dag:
    say_hello_to_my_little_friend = kubernetes_pod_operator.KubernetesPodOperator(
        # The ID specified for the task.
        task_id='kubernetes_task',
        # Name of task you want to run, used to generate Pod ID.
        name='kubernetes_task',
        image='busybox',
        is_delete_operator_pod=True,
        cmds=['sh', '-c', 'echo "Hello, Kubernetes!" && sleep 30'],
        in_cluster=True,
        namespace='default'
    )

    sending_slack_notification = SlackWebhookOperator(
        task_id='sending_slack',
        http_conn_id='slack_conn',
        webhook_token=slack_token,
        message="Esta todo bien desde Kubernetes! \n Ahora toma un gatito! "
                "https://www.youtube.com/watch?v=J---aiyznGQ",
        username='******',
        icon_url='https://raw.githubusercontent.com/apache/'
                 'airflow/master/airflow/www/static/pin_100.png',
def build_kubernetes_pod_operator(operator_ref, dag_ref):
    """
    Builds a DAG operator of type: KubernetesPodOperator.
    Args:
        operator_ref (dict): the definition of the operator
        dag_ref (DAG): the DAG to associate this operator with
    """
    op = kubernetes_pod_operator.KubernetesPodOperator(
        task_id=operator_ref['task_id'],
        name=operator_ref['name'],
        image=operator_ref['image'],
        namespace=operator_ref['namespace']
        if 'namespace' in operator_ref else 'default',
        dag=dag_ref)

    # populate non-default operator values
    if 'cmds' in operator_ref:
        op.cmds = operator_ref['cmds']

    if 'arguments' in operator_ref:
        op.arguments = operator_ref['arguments']

    if 'env_vars' in operator_ref:
        op.env_vars = operator_ref['env_vars']

    if 'labels' in operator_ref:
        op.labels = operator_ref['labels']

    if 'startup_timeout_seconds' in operator_ref:
        op.startup_timeout_seconds = operator_ref['startup_timeout_seconds']

    if 'ports' in operator_ref:
        op.ports = operator_ref['ports']

    if 'params' in operator_ref:
        op.params = operator_ref['params']

    if 'node_selectors' in operator_ref:
        op.node_selectors = operator_ref['node_selectors']

    if 'resources' in operator_ref:
        op.resources = operator_ref['resources']

    if 'config_file' in operator_ref:
        op.config_file = operator_ref['config_file']

    if 'annotations' in operator_ref:
        op.annotations = operator_ref['annotations']

    if 'volumes' in operator_ref:
        op.volumes = operator_ref['volumes']

    if 'volume_mounts' in operator_ref:
        op.volume_mounts = operator_ref['volume_mounts']

    if 'affinity' in operator_ref:
        op.affinity = operator_ref['affinity']

    if 'configmaps' in operator_ref:
        op.configmaps = operator_ref['configmaps']

    # define pod secrets
    pod_secrets = []
    if 'pod_secret_refs' in operator_ref:
        for pod_secret in operator_ref['pod_secret_refs']:
            if not list(find_key_in_dict('kubernetes_secrets', payload)):
                raise ValueError(
                    f"Pod {operator_ref['name']} declares 'pod_secret_refs' but 'kubernetes_secrets' has not been defined."
                )

            secret_entry_ref = payload['kubernetes_secrets'][pod_secret]
            secret_entry = secret.Secret(
                # Deploy type: 'env' for environment  variable or 'volume'
                deploy_type=secret_entry_ref['deploy_type'],
                # The name of the environment variable or the path of the volume
                deploy_target=secret_entry_ref['deploy_target'],
                # Name of the Kubernetes Secret
                secret=secret_entry_ref['secret'],
                # Key of a secret stored in this Secret object or key in the form of service account file name
                key=secret_entry_ref['key'])
            pod_secrets.append(secret_entry)

        op.secrets = pod_secrets

    if 'image_pull_policy' in operator_ref:
        op.image_pull_policy = operator_ref['image_pull_policy']

    # define pull secrets
    image_pull_secrets = []
    if 'image_pull_secret_refs' in operator_ref:
        for image_pull_secret in operator_ref['image_pull_secret_refs']:
            if not list(find_key_in_dict('kubernetes_secrets', payload)):
                raise ValueError(
                    f"Pod {operator_ref['name']} declares 'image_pull_secret_refs' but 'kubernetes_secrets' has not been defined."
                )

            secret_entry_ref = payload['kubernetes_secrets'][image_pull_secret]
            secret_entry = secret.Secret(
                # Deploy type: 'env' for environment variable or 'volume'
                deploy_type=secret_entry_ref['deploy_type'],
                # The name of the environment variable or the path of the volume
                deploy_target=secret_entry_ref['deploy_target'],
                # Name of the Kubernetes Secret
                secret=secret_entry_ref['secret'],
                # Key of a secret stored in this Secret object or key in the form of service account file name
                key=secret_entry_ref['key'])
            image_pull_secrets.append(secret_entry)

        op.image_pull_secrets = image_pull_secrets

    return op
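
# Usage sketch (hypothetical input, not taken from the source): operator_ref
# is a plain dict describing the pod task, and the optional keys are applied
# only when present, as in the conditional blocks above. `some_dag` stands in
# for an existing DAG object.
example_operator_ref = {
    'task_id': 'hello-pod',
    'name': 'hello-pod',
    'image': 'busybox',
    'namespace': 'default',
    'cmds': ['sh', '-c'],
    'arguments': ['echo "Hello, Kubernetes!"'],
    'startup_timeout_seconds': 300,
}
hello_pod = build_kubernetes_pod_operator(example_operator_ref, some_dag)
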
default_args = {'start_date': YESTERDAY}

dag = airflow.DAG('simple_workflow_dag',
                  default_args=default_args,
                  schedule_interval=None)

bash_operator_task = bash_operator.BashOperator(
    task_id='bash_operator_example_task',
    bash_command='echo "Hello from Airflow Bash Operator"',
    dag=dag)


def python_operator_func():
    print("Hello from Airflow Python Operator")


python_operator_task = python_operator.PythonOperator(
    task_id='python_operator_example_task',
    python_callable=python_operator_func,
    dag=dag)

kubernetes_pod_operator_task = kubernetes_pod_operator.KubernetesPodOperator(
    task_id='k8s_pod_operator_example_task',
    name='k8s_pod_example',
    namespace='default',
    image='bash',
    cmds=['echo'],
    arguments=['"Hello from Airflow Kubernetes Pod Operator"'],
    dag=dag)