Exemple #1
0
    def execute(self, context):
        """Execute the plan through ``execute_raw`` and return the Dagster events.

        The run id comes from ``self.params["run_id"]`` when that key exists, otherwise
        from ``context["dag_run"].run_id`` when a non-None ``dag_run`` is present;
        ``self._run_id`` is reset to ``None`` on the way out regardless of outcome.
        When ``self.instance`` is truthy a managed run is registered up front and every
        event record parsed from the output is forwarded to the instance.

        Args:
            context: mapping supplied by the caller (presumably the Airflow task
                context -- confirm). The ``dag_run`` and ``ts`` keys are read here and
                the whole mapping is passed on to ``execute_raw``.

        Returns:
            list: the ``dagster_event`` attribute of each record returned by
            ``handle_execute_plan_result_raw``.

        Raises:
            DagsterGraphQLClientError: re-raised from ``handle_execution_errors``; when
                an instance is configured the error is first reported to it as an
                engine event.
        """
        if "run_id" in self.params:
            self._run_id = self.params["run_id"]
        else:
            dag_run = context["dag_run"] if "dag_run" in context else None
            if dag_run is not None:
                self._run_id = dag_run.run_id

        try:
            if self.instance:
                # Tag the run with the execution timestamp only when the context has one.
                run_tags = {}
                if "ts" in context:
                    run_tags[AIRFLOW_EXECUTION_DATE_STR] = context.get("ts")

                run = self.instance.register_managed_run(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    run_config=self.run_config,
                    mode=self.mode,
                    solids_to_execute=None,
                    step_keys_to_execute=None,
                    tags=run_tags,
                    root_run_id=None,
                    parent_run_id=None,
                    pipeline_snapshot=self.pipeline_snapshot,
                    execution_plan_snapshot=self.execution_plan_snapshot,
                    parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                )

            container_output = self.execute_raw(context)
            self.log.info("Finished executing container.")

            result = parse_raw_log_lines(container_output)

            try:
                handle_execution_errors(result, "executePlan")
            except DagsterGraphQLClientError as client_error:
                # `run` is only bound when an instance exists, hence the same guard here.
                if self.instance:
                    error_data = EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    )
                    self.instance.report_engine_event(
                        str(client_error), run, error_data, self.__class__
                    )
                raise

            event_records = handle_execute_plan_result_raw(result)

            if self.instance:
                for record in event_records:
                    self.instance.handle_new_event(record)

            dagster_events = [record.dagster_event for record in event_records]
            check_events_for_failures(dagster_events)
            check_events_for_skips(dagster_events)

            return dagster_events

        finally:
            # Never leak the per-execution run id into a later call.
            self._run_id = None
Exemple #2
0
    def execute(self, context):
        '''Run the container through the parent operator and return the Dagster events.

        The run id is read from ``self.params['run_id']`` when present, otherwise from a
        non-None ``context['dag_run']``; ``self._run_id`` is cleared again before this
        method exits. When ``self.instance`` is truthy, a ``MANAGED`` pipeline run is
        created (or fetched) first and every resulting event record is forwarded to it.

        Args:
            context: mapping supplied by the caller (presumably the Airflow task
                context -- confirm); only the ``dag_run`` key is read here, the whole
                mapping is handed to the parent ``execute``.

        Returns:
            list: the ``dagster_event`` attribute of each record returned by
            ``handle_execute_plan_result_raw``.

        Raises:
            AirflowException: if the ``dagster_graphql`` imports fail.
            DagsterGraphQLClientError: re-raised from ``handle_execution_errors`` after a
                synthetic error record has been built (and stored if an instance is set).
        '''
        try:
            from dagster_graphql.implementation.pipeline_execution_manager import (
                build_synthetic_pipeline_error_record,
            )
            from dagster_graphql.client.mutations import (
                DagsterGraphQLClientError,
                handle_execution_errors,
                handle_execute_plan_result_raw,
            )

        except ImportError:
            raise AirflowException(
                'To use the DagsterDockerOperator, dagster and dagster_graphql must be installed '
                'in your Airflow environment.'
            )

        if 'run_id' in self.params:
            self._run_id = self.params['run_id']
        else:
            dag_run = context['dag_run'] if 'dag_run' in context else None
            if dag_run is not None:
                self._run_id = dag_run.run_id

        try:
            if self.instance:
                managed_run = PipelineRun(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    environment_dict=self.environment_dict,
                    mode=self.mode,
                    selector=ExecutionSelector(self.pipeline_name),
                    reexecution_config=None,
                    step_keys_to_execute=None,
                    tags=None,
                    status=PipelineRunStatus.MANAGED,
                )
                self.instance.get_or_create_run(managed_run)

            container_output = super(DagsterDockerOperator, self).execute(context)
            self.log.info('Finished executing container.')

            result = parse_raw_res(container_output)

            try:
                handle_execution_errors(result, 'executePlan')
            except DagsterGraphQLClientError:
                # The synthetic record is built unconditionally; it is only persisted
                # when an instance is available.
                error_record = build_synthetic_pipeline_error_record(
                    self.run_id,
                    serializable_error_info_from_exc_info(sys.exc_info()),
                    self.pipeline_name,
                )
                if self.instance:
                    self.instance.handle_new_event(error_record)
                raise

            event_records = handle_execute_plan_result_raw(result)

            if self.instance:
                for record in event_records:
                    self.instance.handle_new_event(record)

            dagster_events = [record.dagster_event for record in event_records]
            check_events_for_failures(dagster_events)
            check_events_for_skips(dagster_events)

            return dagster_events

        finally:
            # Never leak the per-execution run id into a later call.
            self._run_id = None
Exemple #3
0
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        environment_dict,
        mode,
        pipeline_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        kubeconfig_file=None,
    ):
        '''Execute one step inside a Kubernetes job and return its serialized events.

        The arguments are validated, the Kubernetes configuration is loaded, a job
        running ``executePlan`` is created in ``job_namespace`` and awaited, and the
        logs of its pods are parsed into step events.

        Args:
            instance_ref_dict (dict): passed to ``InstanceRef.from_dict``.
            step_keys (List[str]): must contain exactly one step key.
            environment_dict (dict): forwarded to ``construct_variables``.
            mode (str): forwarded to ``construct_variables``.
            pipeline_name (str): forwarded to ``construct_variables``.
            run_id (str): id of the run to load from the instance.
            job_config_dict (dict): passed to ``DagsterK8sJobConfig.from_dict``.
            job_namespace (str): namespace the job is created in and read from.
            load_incluster_config (bool): load in-cluster config when true, otherwise
                load the kube config from ``kubeconfig_file``.
            kubeconfig_file (Optional[str]): path handed to ``load_kube_config``.

        Returns:
            list: two engine events followed by the step events, each passed through
            ``serialize_dagster_namedtuple``.
        '''
        from dagster_k8s.job import DagsterK8sJobConfig, construct_dagster_graphql_k8s_job
        from dagster_k8s.utils import get_pod_names_in_job, retrieve_pod_logs, wait_for_job_success

        import kubernetes

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
        )
        check.dict_param(environment_dict, 'environment_dict')
        check.str_param(mode, 'mode')
        check.str_param(pipeline_name, 'pipeline_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')
        check.bool_param(load_incluster_config, 'load_incluster_config')
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        # Exactly one step key was validated above, so this is the step being executed.
        step_key = step_keys[0]
        step_keys_str = ", ".join(step_keys)

        # Ensure we stay below k8s name length limits; job and pod share the same name.
        job_name = pod_name = 'dagster-stepjob-%s' % _get_k8s_name_key(run_id, step_keys)

        variables = construct_variables(mode, environment_dict, pipeline_name, run_id, step_keys)
        job = construct_dagster_graphql_k8s_job(
            job_config,
            ['-p', 'executePlan', '-v', seven.json.dumps(variables)],
            job_name,
            pod_name,
        )

        # Post event for starting execution
        start_metadata = [
            EventMetadataEntry.text(step_keys_str, 'Step keys'),
            EventMetadataEntry.text(job.metadata.name, 'Kubernetes Job name'),
            EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
            EventMetadataEntry.text(job_config.job_image, 'Job image'),
            EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
            EventMetadataEntry.text(str(job_config.image_pull_secrets), 'Image pull secrets'),
            EventMetadataEntry.text(str(job_config.service_account_name), 'Service account name'),
        ]
        start_event = instance.report_engine_event(
            'Executing steps {} in Kubernetes job {}'.format(step_keys_str, job.metadata.name),
            pipeline_run,
            EngineEventData(start_metadata, marker_end=DELEGATE_MARKER),
            CeleryK8sJobEngine,
            step_key=step_key,
        )

        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

        wait_for_job_success(job.metadata.name, namespace=job_namespace)
        pod_names = get_pod_names_in_job(job.metadata.name, namespace=job_namespace)

        # Post engine event for log retrieval
        retrieval_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobEngine,
            step_key=step_key,
        )

        # Concatenate the log lines of every pod in the job, in pod order.
        log_lines = [
            line
            for name in pod_names
            for line in retrieve_pod_logs(name, namespace=job_namespace).split('\n')
        ]

        res = parse_raw_log_lines(log_lines)

        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

        # Running list of events generated from this task execution
        all_events = [start_event, retrieval_event]
        all_events.extend(step_events)

        return [serialize_dagster_namedtuple(evt) for evt in all_events]
Exemple #4
0
    def _execute_step_docker(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        docker_config,
    ):
        '''Run step execution in a Docker container.

        Loads the run from the instance, runs ``dagster-graphql ... -p executePlan`` in
        the configured image and returns the resulting events in serialized form.

        Args:
            instance_ref_dict (dict): passed to ``InstanceRef.from_dict``.
            step_keys (List[str]): step keys to execute; the first one is used as the
                ``step_key`` of the engine events reported here.
            run_config: sent as ``runConfigData`` in the execution params.
            mode: sent as ``mode`` in the execution params.
            repo_name: sent as ``repositoryName`` in the selector.
            repo_location_name: sent as ``repositoryLocationName`` in the selector.
            run_id: id of the run to load; also sent as ``runId``.
            docker_config (dict): must contain ``image``; may contain ``registry``
                (with ``url``, ``username``, ``password``) and ``env_vars`` (names of
                worker environment variables to pass into the container).

        Returns:
            list: the start engine event followed by the step events, each passed
            through ``serialize_dagster_namedtuple``.

        Raises:
            docker.errors.ContainerError: re-raised after an engine event is reported.
            JSONDecodeError: re-raised after an engine event is reported, when the
                container output is not valid JSON.
        '''
        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_keys_str = ", ".join(step_keys)

        variables = {
            'executionParams': {
                'runConfigData': run_config,
                'mode': mode,
                'selector': {
                    'repositoryLocationName': repo_location_name,
                    'repositoryName': repo_name,
                    'pipelineName': pipeline_run.pipeline_name,
                    'solidSelection': pipeline_run.solid_selection,
                },
                'executionMetadata': {
                    'runId': run_id
                },
                'stepKeys': step_keys,
            }
        }

        # Pass the command as an argument list instead of a hand-quoted string: string
        # commands are shlex-split by the Docker SDK, so a single quote anywhere in the
        # JSON payload (e.g. inside run config) would break the '...' quoting. For
        # payloads without single quotes the resulting argv is unchanged.
        command = ['dagster-graphql', '-v', seven.json.dumps(variables), '-p', 'executePlan']
        docker_image = docker_config['image']
        client = docker.client.from_env()

        if docker_config.get('registry'):
            client.login(
                registry=docker_config['registry']['url'],
                username=docker_config['registry']['username'],
                password=docker_config['registry']['password'],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            'Executing steps {} in Docker container {}'.format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, 'Step keys'),
                    EventMetadataEntry.text(docker_image, 'Image'),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )

        events = [engine_event]

        # Only the explicitly listed variables are copied from this worker's environment.
        docker_env = {}
        if docker_config.get('env_vars'):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config['env_vars']
            }

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                detach=False,
                auto_remove=True,
                # pass through this worker's environment for things like AWS creds etc.
                environment=docker_env,
            )
            res = seven.json.loads(docker_response)

        except docker.errors.ContainerError as err:
            instance.report_engine_event(
                'Failed to run steps {} in Docker container {}'.format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, 'Job image'),
                    EventMetadataEntry.text(err.stderr, 'Docker stderr'),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise

        except JSONDecodeError:
            # Only json.loads can raise this, so docker_response is bound here.
            instance.report_engine_event(
                'Failed to parse response for steps {} from Docker container {}'
                .format(step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, 'Job image'),
                    EventMetadataEntry.text(docker_response,
                                            'Docker Response'),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise

        else:
            handle_execution_errors(res, 'executePlan')
            step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Exemple #5
0
    def execute(self, context):
        '''Run the container through the parent operator and return the Dagster events.

        The run id is read from ``self.params['run_id']`` when present, otherwise from a
        non-None ``context['dag_run']``; ``self._run_id`` is cleared again before this
        method exits. When ``self.instance`` is truthy, a managed run is registered
        first and every resulting event record is forwarded to the instance.

        Args:
            context: mapping supplied by the caller (presumably the Airflow task
                context -- confirm); only the ``dag_run`` key is read here, the whole
                mapping is handed to the parent ``execute``.

        Returns:
            list: the ``dagster_event`` attribute of each record returned by
            ``handle_execute_plan_result_raw``.

        Raises:
            AirflowException: if the ``dagster_graphql`` imports fail.
            DagsterGraphQLClientError: re-raised from ``handle_execution_errors``; when
                an instance is configured the error is first reported to it as an
                engine event.
        '''
        try:
            from dagster_graphql.client.mutations import (
                DagsterGraphQLClientError,
                handle_execution_errors,
                handle_execute_plan_result_raw,
            )

        except ImportError:
            raise AirflowException(
                'To use the DagsterDockerOperator, dagster and dagster_graphql must be installed '
                'in your Airflow environment.')

        if 'run_id' in self.params:
            self._run_id = self.params['run_id']
        else:
            dag_run = context['dag_run'] if 'dag_run' in context else None
            if dag_run is not None:
                self._run_id = dag_run.run_id

        try:
            if self.instance:
                run = self.instance.register_managed_run(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    environment_dict=self.environment_dict,
                    mode=self.mode,
                    solids_to_execute=None,
                    step_keys_to_execute=None,
                    tags=None,
                    root_run_id=None,
                    parent_run_id=None,
                    pipeline_snapshot=self.pipeline_snapshot,
                    execution_plan_snapshot=self.execution_plan_snapshot,
                    parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                )

            container_output = super(DagsterDockerOperator, self).execute(context)
            self.log.info('Finished executing container.')

            result = parse_raw_log_lines(container_output)

            try:
                handle_execution_errors(result, 'executePlan')
            except DagsterGraphQLClientError as client_error:
                # `run` is only bound when an instance exists, hence the same guard here.
                if self.instance:
                    error_data = EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    )
                    self.instance.report_engine_event(
                        str(client_error), run, error_data, self.__class__
                    )
                raise

            event_records = handle_execute_plan_result_raw(result)

            if self.instance:
                for record in event_records:
                    self.instance.handle_new_event(record)

            dagster_events = [record.dagster_event for record in event_records]
            check_events_for_failures(dagster_events)
            check_events_for_skips(dagster_events)

            return dagster_events

        finally:
            # Never leak the per-execution run id into a later call.
            self._run_id = None
    def execute(self, context):
        '''Launch the pod, parse the plan result from its logs and return the events.

        The run id is read from ``self.params['run_id']`` when present, otherwise from a
        non-None ``context['dag_run']``. A pod is generated from this operator's
        attributes and run; its ``base`` container log is then read (up to
        ``LOG_RETRIEVAL_MAX_ATTEMPTS`` times) and parsed. When ``self.instance`` is
        truthy, a managed run is registered before the pod starts and every event
        record is forwarded to the instance.

        Args:
            context: mapping supplied by the caller (presumably the Airflow task
                context -- confirm); the ``dag_run`` and ``ts`` keys are read here.

        Returns:
            list: the ``dagster_event`` attribute of each record returned by
            ``handle_execute_plan_result_raw``.

        Raises:
            AirflowException: if the ``dagster_graphql`` imports fail, or wrapping any
                ``AirflowException`` raised while building or running the pod.
            DagsterGraphQLClientError: re-raised from ``handle_execution_errors``; when
                an instance is configured the error is first reported to it as an
                engine event.
        '''
        try:
            from dagster_graphql.client.mutations import (
                DagsterGraphQLClientError,
                handle_execution_errors,
                handle_execute_plan_result_raw,
            )

        except ImportError:
            raise AirflowException(
                'To use the DagsterKubernetesPodOperator, dagster and dagster_graphql must be'
                ' installed in your Airflow environment.'
            )

        if 'run_id' in self.params:
            self._run_id = self.params['run_id']
        elif 'dag_run' in context and context['dag_run'] is not None:
            self._run_id = context['dag_run'].run_id

        # return to original execute code:
        try:
            client = kube_client.get_kube_client(
                in_cluster=self.in_cluster,
                cluster_context=self.cluster_context,
                config_file=self.config_file,
            )
            gen = pod_generator.PodGenerator()

            for mount in self.volume_mounts:
                gen.add_mount(mount)
            for volume in self.volumes:
                gen.add_volume(volume)

            pod = gen.make_pod(
                namespace=self.namespace,
                image=self.image,
                pod_id=self.name,
                cmds=self.cmds,
                arguments=self.query(context.get('ts')),
                labels=self.labels,
            )

            pod.service_account_name = self.service_account_name
            pod.secrets = self.secrets
            pod.envs = self.env_vars
            pod.image_pull_policy = self.image_pull_policy
            pod.image_pull_secrets = self.image_pull_secrets
            pod.annotations = self.annotations
            pod.resources = self.resources
            pod.affinity = self.affinity
            pod.node_selectors = self.node_selectors
            pod.hostnetwork = self.hostnetwork
            pod.tolerations = self.tolerations
            pod.configmaps = self.configmaps
            pod.security_context = self.security_context

            launcher = pod_launcher.PodLauncher(kube_client=client, extract_xcom=self.xcom_push)
            try:
                if self.instance:
                    tags = (
                        {AIRFLOW_EXECUTION_DATE_STR: context.get('ts')} if 'ts' in context else {}
                    )

                    run = self.instance.register_managed_run(
                        pipeline_name=self.pipeline_name,
                        run_id=self.run_id,
                        run_config=self.run_config,
                        mode=self.mode,
                        solids_to_execute=None,
                        step_keys_to_execute=None,
                        tags=tags,
                        root_run_id=None,
                        parent_run_id=None,
                        pipeline_snapshot=self.pipeline_snapshot,
                        execution_plan_snapshot=self.execution_plan_snapshot,
                        parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                    )

                # we won't use the "result", which is the pod's xcom json file
                (final_state, _) = launcher.run_pod(
                    pod, startup_timeout=self.startup_timeout_seconds, get_logs=self.get_logs
                )

                # fetch the last line independently of whether logs were read
                # unbelievably, if you set tail_lines=1, the returned json has its double quotes
                # turned into unparseable single quotes
                res = None
                num_attempts = 0
                while not res and num_attempts < LOG_RETRIEVAL_MAX_ATTEMPTS:
                    raw_res = client.read_namespaced_pod_log(
                        name=pod.name, namespace=pod.namespace, container='base'
                    )
                    res = parse_raw_log_lines(raw_res.split('\n'))
                    num_attempts += 1
                    if not res:
                        # Back off only when the read produced nothing usable; there is
                        # no reason to wait once the result has been parsed.
                        time.sleep(LOG_RETRIEVAL_WAITS_BETWEEN_ATTEMPTS_SEC)

                try:
                    handle_execution_errors(res, 'executePlan')
                except DagsterGraphQLClientError as err:
                    # `run` is only bound when an instance is configured; without this
                    # guard the report itself would fail and mask the original error.
                    if self.instance:
                        self.instance.report_engine_event(
                            str(err),
                            run,
                            EngineEventData.engine_error(
                                serializable_error_info_from_exc_info(sys.exc_info())
                            ),
                            self.__class__,
                        )
                    raise

                events = handle_execute_plan_result_raw(res)

                if self.instance:
                    for event in events:
                        self.instance.handle_new_event(event)

                events = [e.dagster_event for e in events]
                check_events_for_failures(events)
                check_events_for_skips(events)
                return events

            finally:
                self._run_id = None

                if self.is_delete_operator_pod:
                    launcher.delete_pod(pod)

            # NOTE(review): the try block above always returns or raises, so this check
            # is never reached; kept as-is pending a decision on the intended behavior.
            if final_state != State.SUCCESS:
                raise AirflowException('Pod returned a failure: {state}'.format(state=final_state))
            # note the lack of returning the default xcom
        except AirflowException as ex:
            raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
Exemple #7
0
    def execute(self, context):
        '''Launch the pod, parse the plan result from its logs and return the events.

        The run id is read from ``self.params['run_id']`` when present, otherwise from a
        non-None ``context['dag_run']``. A pod is generated from this operator's
        attributes and run; the last five lines of its ``base`` container log are then
        read and parsed. When ``self.instance`` is truthy, a ``MANAGED`` pipeline run is
        created (or fetched) before the pod starts and every event record is forwarded
        to the instance.

        Args:
            context: mapping supplied by the caller (presumably the Airflow task
                context -- confirm); only the ``dag_run`` key is read here.

        Returns:
            The event records returned by ``handle_execute_plan_result_raw``.

        Raises:
            AirflowException: if the ``dagster_graphql`` imports fail, or wrapping any
                ``AirflowException`` raised while building or running the pod.
            DagsterGraphQLClientError: re-raised from ``handle_execution_errors`` after a
                synthetic error record has been built (and stored if an instance is set).
        '''
        try:
            from dagster_graphql.implementation.pipeline_execution_manager import (
                build_synthetic_pipeline_error_record, )
            from dagster_graphql.client.mutations import (
                DagsterGraphQLClientError,
                handle_execution_errors,
                handle_execute_plan_result_raw,
            )

        except ImportError:
            raise AirflowException(
                'To use the DagsterKubernetesPodOperator, dagster and dagster_graphql must be'
                ' installed in your Airflow environment.')

        if 'run_id' in self.params:
            self._run_id = self.params['run_id']
        else:
            dag_run = context['dag_run'] if 'dag_run' in context else None
            if dag_run is not None:
                self._run_id = dag_run.run_id

        # return to original execute code:
        try:
            api_client = kube_client.get_kube_client(
                in_cluster=self.in_cluster,
                cluster_context=self.cluster_context,
                config_file=self.config_file,
            )
            generator = pod_generator.PodGenerator()

            for volume_mount in self.volume_mounts:
                generator.add_mount(volume_mount)
            for pod_volume in self.volumes:
                generator.add_volume(pod_volume)

            pod = generator.make_pod(
                namespace=self.namespace,
                image=self.image,
                pod_id=self.name,
                cmds=self.cmds,
                arguments=self.query,
                labels=self.labels,
            )

            # (pod attribute, operator attribute) pairs, copied in this order.
            copied_attributes = (
                ('service_account_name', 'service_account_name'),
                ('secrets', 'secrets'),
                ('envs', 'env_vars'),
                ('image_pull_policy', 'image_pull_policy'),
                ('image_pull_secrets', 'image_pull_secrets'),
                ('annotations', 'annotations'),
                ('resources', 'resources'),
                ('affinity', 'affinity'),
                ('node_selectors', 'node_selectors'),
                ('hostnetwork', 'hostnetwork'),
                ('tolerations', 'tolerations'),
                ('configmaps', 'configmaps'),
                ('security_context', 'security_context'),
            )
            for pod_attr, operator_attr in copied_attributes:
                setattr(pod, pod_attr, getattr(self, operator_attr))

            launcher = pod_launcher.PodLauncher(
                kube_client=api_client, extract_xcom=self.xcom_push
            )
            try:
                if self.instance:
                    managed_run = PipelineRun(
                        pipeline_name=self.pipeline_name,
                        run_id=self.run_id,
                        environment_dict=self.environment_dict,
                        mode=self.mode,
                        selector=ExecutionSelector(self.pipeline_name),
                        reexecution_config=None,
                        step_keys_to_execute=None,
                        tags=None,
                        status=PipelineRunStatus.MANAGED,
                    )
                    self.instance.get_or_create_run(managed_run)

                # we won't use the "result", which is the pod's xcom json file
                (final_state, _) = launcher.run_pod(
                    pod,
                    startup_timeout=self.startup_timeout_seconds,
                    get_logs=self.get_logs,
                )

                # fetch the last line independently of whether logs were read
                # unbelievably, if you set tail_lines=1, the returned json has its double quotes
                # turned into unparseable single quotes
                # TODO: add retries - k8s log servers are _extremely_ flaky
                pod_log = api_client.read_namespaced_pod_log(
                    name=pod.name,
                    namespace=pod.namespace,
                    container='base',
                    tail_lines=5,
                )

                result = parse_raw_res(pod_log.split('\n'))

                try:
                    handle_execution_errors(result, 'executePlan')
                except DagsterGraphQLClientError:
                    # The synthetic record is built unconditionally; it is only
                    # persisted when an instance is available.
                    error_record = build_synthetic_pipeline_error_record(
                        self.run_id,
                        serializable_error_info_from_exc_info(sys.exc_info()),
                        self.pipeline_name,
                    )
                    if self.instance:
                        self.instance.handle_new_event(error_record)
                    raise

                event_records = handle_execute_plan_result_raw(result)

                if self.instance:
                    for record in event_records:
                        self.instance.handle_new_event(record)

                check_raw_events_for_skips(event_records)

                return event_records

            finally:
                self._run_id = None

                if self.is_delete_operator_pod:
                    launcher.delete_pod(pod)

            # NOTE(review): the try block above always returns or raises, so this check
            # looks unreachable -- confirm the intended behavior.
            if final_state != State.SUCCESS:
                raise AirflowException(
                    'Pod returned a failure: {state}'.format(state=final_state)
                )
            # note the lack of returning the default xcom
        except AirflowException as launch_error:
            raise AirflowException(
                'Pod Launching failed: {error}'.format(error=launch_error)
            )
Exemple #8
0
    def _execute_step_docker(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        docker_config,
    ):
        """Run step execution in a Docker container.

        Loads the run from the instance, runs ``dagster-graphql ... -p executePlan`` in
        the configured image and returns the resulting events in serialized form.

        Args:
            instance_ref_dict (dict): passed to ``InstanceRef.from_dict``.
            step_keys (List[str]): step keys to execute; the first one is used as the
                ``step_key`` of the engine events reported here.
            run_config: sent as ``runConfigData`` in the execution params.
            mode: sent as ``mode`` in the execution params.
            repo_name: sent as ``repositoryName`` in the selector.
            repo_location_name: sent as ``repositoryLocationName`` in the selector.
            run_id: id of the run to load; also sent as ``runId``.
            docker_config (dict): must contain ``image``; may contain ``registry``
                (with ``url``, ``username``, ``password``) and ``env_vars`` (names of
                worker environment variables to pass into the container).

        Returns:
            list: the start engine event followed by the step events, each passed
            through ``serialize_dagster_namedtuple``.

        Raises:
            docker.errors.ContainerError: re-raised after an engine event is reported.
            JSONDecodeError: re-raised after an engine event is reported, when the
                container output is not valid JSON.
        """
        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, "Could not load run {}".format(run_id))

        step_keys_str = ", ".join(step_keys)

        variables = {
            "executionParams": {
                "runConfigData": run_config,
                "mode": mode,
                "selector": {
                    "repositoryLocationName":
                    repo_location_name,
                    "repositoryName":
                    repo_name,
                    "pipelineName":
                    pipeline_run.pipeline_name,
                    # solids_to_execute is converted to a list for JSON serialization;
                    # an empty/None selection is sent as null.
                    "solidSelection":
                    list(pipeline_run.solids_to_execute)
                    if pipeline_run.solids_to_execute else None,
                },
                "executionMetadata": {
                    "runId": run_id
                },
                "stepKeys": step_keys,
            }
        }

        # Pass the command as an argument list instead of a hand-quoted string: string
        # commands are shlex-split by the Docker SDK, so a single quote anywhere in the
        # JSON payload (e.g. inside run config) would break the '...' quoting. For
        # payloads without single quotes the resulting argv is unchanged.
        command = ["dagster-graphql", "-v", seven.json.dumps(variables), "-p", "executePlan"]
        docker_image = docker_config["image"]
        client = docker.client.from_env()

        if docker_config.get("registry"):
            client.login(
                registry=docker_config["registry"]["url"],
                username=docker_config["registry"]["username"],
                password=docker_config["registry"]["password"],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            "Executing steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "Step keys"),
                    EventMetadataEntry.text(docker_image, "Image"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )

        events = [engine_event]

        # Only the explicitly listed variables are copied from this worker's environment.
        docker_env = {}
        if docker_config.get("env_vars"):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config["env_vars"]
            }

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                detach=False,
                auto_remove=True,
                # pass through this worker's environment for things like AWS creds etc.
                environment=docker_env,
            )
            res = seven.json.loads(docker_response)

        except docker.errors.ContainerError as err:
            instance.report_engine_event(
                "Failed to run steps {} in Docker container {}".format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise

        except JSONDecodeError:
            # Only json.loads can raise this, so docker_response is bound here.
            instance.report_engine_event(
                "Failed to parse response for steps {} from Docker container {}"
                .format(step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(docker_response,
                                            "Docker Response"),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise

        else:
            handle_execution_errors(res, "executePlan")
            step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Exemple #9
0
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        resources=None,
        kubeconfig_file=None,
    ):
        '''Run a single step's execution in a Kubernetes Job and collect its events.

        The function builds an ``executePlan`` GraphQL invocation for the step,
        creates a Kubernetes Job for it, waits for the job to succeed, then reads
        the pod logs back and parses them into step events.

        Args:
            _self: Unused first positional argument (presumably the bound Celery
                task -- confirm against the decorator, which is not shown here).
            instance_ref_dict (dict): Dict form of an ``InstanceRef``.
            step_keys (List[str]): Step keys to execute; must contain exactly one.
            run_config (dict): Run config forwarded as ``runConfigData``.
            mode (str): Pipeline mode.
            repo_name (str): Repository name used in the pipeline selector.
            repo_location_name (str): Repository location name used in the selector.
            run_id (str): Id of the pipeline run this step belongs to.
            job_config_dict (dict): Dict form of a ``DagsterK8sJobConfig``.
            job_namespace (str): Kubernetes namespace in which the job is created.
            load_incluster_config (bool): Load the in-cluster Kubernetes config
                when True, otherwise load ``kubeconfig_file``.
            retries_dict (dict): Config from which ``Retries`` is constructed.
            resources (Optional[Dict[str, dict]]): Passed through to
                ``construct_dagster_graphql_k8s_job``.
            kubeconfig_file (Optional[str]): Kubeconfig path used when
                ``load_incluster_config`` is False.

        Returns:
            List[str]: The serialized engine and step events produced by this
            task. An empty list is returned when the step is not scheduled
            (pipeline run is not STARTED) or when the job is terminated because
            the run left the STARTED state while waiting.

        Raises:
            Whatever the ``check`` validations, the Kubernetes client,
            ``wait_for_job_success`` (other than
            ``DagsterK8sPipelineStatusException``) or ``handle_execution_errors``
            raise; none of those are caught here.
        '''

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1,
            'Celery K8s task executor can only execute 1 step at a time')
        check.dict_param(run_config, 'run_config')
        check.str_param(mode, 'mode')
        check.str_param(repo_name, 'repo_name')
        check.str_param(repo_location_name, 'repo_location_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')

        check.bool_param(load_incluster_config, 'load_incluster_config')
        check.dict_param(retries_dict, 'retries_dict')

        check.opt_dict_param(resources,
                             'resources',
                             key_type=str,
                             value_type=dict)
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_key = step_keys[0]
        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                'Not scheduling step because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            # Return an empty list rather than None so that every exit path
            # hands the consumer of the task result the same (iterable) type.
            return []

        # Ensure we stay below k8s name length limits
        k8s_name_key = _get_k8s_name_key(run_id, step_keys)

        retries = Retries.from_config(retries_dict)

        # On a retry, suffix the name with the attempt number so the name differs
        # from the one used by the first attempt.
        attempt_number = retries.get_attempt_count(step_key)
        if attempt_number:
            job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        else:
            job_name = 'dagster-job-%s' % (k8s_name_key)
        # The job and its pod share the same name.
        pod_name = job_name

        variables = {
            'executionParams': {
                'runConfigData': run_config,
                'mode': mode,
                'selector': {
                    'repositoryLocationName': repo_location_name,
                    'repositoryName': repo_name,
                    'pipelineName': pipeline_run.pipeline_name,
                    'solidSelection': (
                        list(pipeline_run.solids_to_execute)
                        if pipeline_run.solids_to_execute else None
                    ),
                },
                'executionMetadata': {
                    'runId': run_id
                },
                'stepKeys': step_keys,
            },
            'retries': retries.to_graphql_input(),
        }
        args = [
            '-p', 'executePlan', '-v',
            seven.json.dumps(variables), '--remap-sigterm'
        ]

        job = construct_dagster_graphql_k8s_job(job_config, args, job_name,
                                                resources, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        # Use the name recorded on the constructed job object from here on.
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            'Executing step {} in Kubernetes job {}'.format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            'Image pull policy'),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            'Image pull secrets'),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        'Service account name'),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_key,
        )
        events.append(engine_event)

        kubernetes.client.BatchV1Api().create_namespaced_job(
            body=job, namespace=job_namespace)

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=run_id,
            )
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                'Terminating Kubernetes Job because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace,
                                            'Kubernetes Job namespace'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            # Same contract as the early exit above: an empty list, never None.
            return []

        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        # Gather the log lines of every pod in the job; the step's results are
        # parsed out of these lines below.
        logs = []
        for job_pod_name in pod_names:
            raw_logs = retrieve_pod_logs(job_pod_name, namespace=job_namespace)
            logs.extend(raw_logs.split('\n'))

        res = parse_raw_log_lines(logs)
        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

        events += step_events

        return [serialize_dagster_namedtuple(event) for event in events]