Exemple #1
0
    'start_date': datetime(2019, 5, 24),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=15),
}

cluster_name = 'gke-prio-a'

gcp_conn_id = "google_cloud_prio_a"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

cluster_def = create_gke_config(
    name=cluster_name,
    service_account=
    "*****@*****.**",
    owner_label="hwoo",
    team_label="dataops")

gke_dag = DAG('gke_prio_a',
              default_args=DEFAULT_ARGS,
              dagrun_timeout=timedelta(hours=6),
              schedule_interval='@weekly')

create_gke_cluster = GKEClusterCreateOperator(task_id='create_gke_cluster',
                                              project_id=connection.project_id,
                                              location='us-west1-b',
                                              gcp_conn_id=gcp_conn_id,
                                              body=cluster_def,
                                              dag=gke_dag)
def container_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    server_id,
    env_vars={},
    arguments=[],
    machine_type="n1-standard-1",
    image="mozilla/prio-processor:latest",
    location="us-west1-b",
    owner_label="amiyaguchi",
    team_label="dataeng",
):
    """Run a command on an ephemeral container running the
    `mozilla/prio-processor:latest` image.

    :param str parent_dag_name:         Name of the parent DAG.
    :param str child_dag_name:          Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id:             Name of the connection string.
    :param str service_account:         The address of the service account.
    :param str server_id:               The identifier for the Prio processor
    :param Dict[str, str] env_vars:     Environment variables for configuring
                                        the processor.
    :param List[str] arguments:         The command to run after loading the
                                        image.
    :param str machine_type:            The machine type for running the image.
    :param str image:                   Dockerhub image
    :param str location:                The region of the GKE cluster.
    :param str owner_label:             Label for associating the owner
    :param str team_label:              Label for associating the team
    :return: DAG
    """
    assert server_id in ["a", "b", "admin"]

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    cluster_name = "gke-prio-{}".format(server_id)

    shared_config = {
        "project_id": connection.project_id,
        "gcp_conn_id": gcp_conn_id,
        "location": location,
    }

    with DAG("{}.{}".format(parent_dag_name, child_dag_name),
             default_args=default_args) as dag:
        create_gke_cluster = GKEClusterCreateOperator(
            task_id="create_gke_cluster",
            body=create_gke_config(
                name=cluster_name,
                service_account=service_account,
                owner_label=owner_label,
                team_label=team_label,
                machine_type=machine_type,
                # DataProc clusters require VPC with auto-created subnets
                subnetwork="default" if server_id == "admin" else "gke-subnet",
                is_dev=environ.get("DEPLOY_ENVIRONMENT") == "dev",
            ),
            dag=dag,
            **shared_config)

        # Running the pod without any time in-between will cause the scope-based
        # authentication in Google Cloud Platform to fail. For example:
        #
        # `ServiceException: 401 Anonymous caller does not have
        # storage.objects.get access to moz-fx-prio-dev-a-private/processed/`
        #
        # Sleeping by a small amount solves this problem. This issue was first
        # noticed intermittently on 2019-09-09.
        sleep = BashOperator(task_id="sleep", bash_command="sleep 60", dag=dag)

        run_prio = GKEPodOperator(task_id="processor_{}".format(server_id),
                                  name="run-prio-project-{}".format(server_id),
                                  cluster_name=cluster_name,
                                  namespace="default",
                                  image=image,
                                  arguments=arguments,
                                  env_vars=env_vars,
                                  dag=dag,
                                  **shared_config)

        delete_gke_cluster = GKEClusterDeleteOperator(
            task_id="delete_gke_cluster",
            name=cluster_name,
            trigger_rule="all_done",
            dag=dag,
            **shared_config)

        create_gke_cluster >> sleep >> run_prio >> delete_gke_cluster
        return dag
Exemple #3
0
def container_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    server_id,
    env_vars={},
    arguments=[],
    machine_type="n1-standard-1",
    image="mozilla/prio-processor:v3.0.1",
    location="us-west1-a",
    owner_label="amiyaguchi",
    team_label="dataeng",
    **kwargs,
):
    """Run a command on an ephemeral container running the
    `mozilla/prio-processor:latest` image.

    :param str parent_dag_name:         Name of the parent DAG.
    :param str child_dag_name:          Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id:             Name of the connection string.
    :param str service_account:         The address of the service account.
    :param str server_id:               The identifier for the Prio processor
    :param Dict[str, str] env_vars:     Environment variables for configuring
                                        the processor.
    :param List[str] arguments:         The command to run after loading the
                                        image.
    :param str machine_type:            The machine type for running the image.
    :param str image:                   Dockerhub image
    :param str location:                The region of the GKE cluster.
    :param str owner_label:             Label for associating the owner
    :param str team_label:              Label for associating the team
    :return: DAG
    """
    assert server_id in ["a", "b", "admin"]

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    cluster_name = f"gke-prio-{server_id}"

    shared_config = {
        "project_id": connection.project_id,
        "gcp_conn_id": gcp_conn_id,
        "location": location,
    }

    with DAG(f"{parent_dag_name}.{child_dag_name}", default_args=default_args) as dag:
        # https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator#kubernetespodoperator_configuration
        # https://medium.com/google-cloud/scale-your-kubernetes-cluster-to-almost-zero-with-gke-autoscaler-9c78051cbf40
        # https://docs.openshift.com/container-platform/3.6/admin_guide/scheduling/pod_affinity.html
        # https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/
        # https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator
        # https://airflow.apache.org/docs/stable/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html
        create_gke_cluster = GKEClusterCreateOperator(
            task_id="create_gke_cluster",
            body=create_gke_config(
                name=cluster_name,
                service_account=service_account,
                owner_label=owner_label,
                team_label=team_label,
                machine_type=machine_type,
                location=location,
                # DataProc clusters require VPC with auto-created subnets
                subnetwork="default" if server_id == "admin" else "gke-subnet",
                is_dev=environ.get("DEPLOY_ENVIRONMENT") == "dev",
            ),
            dag=dag,
            **shared_config,
        )

        # Running the pod without any time in-between will cause the scope-based
        # authentication in Google Cloud Platform to fail. For example:
        #
        # `ServiceException: 401 Anonymous caller does not have
        # storage.objects.get access to moz-fx-prio-dev-a-private/processed/`
        #
        # Sleeping by a small amount solves this problem. This issue was first
        # noticed intermittently on 2019-09-09.
        sleep = BashOperator(task_id="sleep", bash_command="sleep 60", dag=dag)

        run_prio = GKEPodOperator(
            task_id=f"processor_{server_id}",
            name=f"processor_{server_id}",
            cluster_name=cluster_name,
            namespace="default",
            image=image,
            arguments=arguments,
            env_vars=env_vars,
            dag=dag,
            # choose the autoscaling node-pool for any jobs
            node_selectors={"node-label": "burstable"},
            labels={"pod-label": "burstable-pod"},
            affinity={
                "podAntiAffinity": {
                    "requiredDuringSchedulingIgnoredDuringExecution": [
                        {
                            "labelSelector": {
                                "matchExpressions": [
                                    {
                                        "key": "pod-label",
                                        "operator": "In",
                                        "values": ["burstable-pod"],
                                    }
                                ]
                            },
                            "topologyKey": "kubernetes.io/hostname",
                        }
                    ]
                }
            },
            # tolerate the tainted node
            tolerations=[
                {
                    "key": "reserved-pool",
                    "operator": "Equal",
                    "value": "true",
                    "effect": "NoSchedule",
                }
            ],
            # A new VM instance may take more than 120 seconds to boot
            startup_timeout_seconds=240,
            # delete the pod after running
            is_delete_operator_pod=True,
            **shared_config,
            **kwargs,
        )

        delete_gke_cluster = GKEClusterDeleteOperator(
            task_id="delete_gke_cluster",
            name=cluster_name,
            trigger_rule="all_done",
            dag=dag,
            **shared_config,
        )

        create_gke_cluster >> sleep >> run_prio >> delete_gke_cluster
        return dag