Example #1
0
    def _execute_plan(self, execute_step_args_packed, executable_dict):
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)

        check.dict_param(executable_dict, "executable_dict")

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        retries = Retries.from_config(execute_step_args.retries_dict)

        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)
        check.invariant(
            pipeline_run,
            "Could not load run {}".format(execute_step_args.pipeline_run_id))

        step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

        execution_plan = create_execution_plan(
            pipeline,
            pipeline_run.run_config,
            mode=pipeline_run.mode,
            step_keys_to_execute=execute_step_args.step_keys_to_execute,
        )

        engine_event = instance.report_engine_event(
            "Executing steps {} in celery worker".format(step_keys_str),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "step_keys"),
                    EventMetadataEntry.text(self.request.hostname,
                                            "Celery worker"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryExecutor,
            step_key=execution_plan.step_key_for_single_step_plans(),
        )

        events = [engine_event]
        for step_event in execute_plan_iterator(
                execution_plan,
                pipeline_run=pipeline_run,
                run_config=pipeline_run.run_config,
                instance=instance,
                retries=retries,
        ):
            events.append(step_event)

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
def test_input_manager_with_failure():
    @root_input_manager
    def should_fail(_):
        raise Failure(
            description="Foolure",
            metadata_entries=[
                EventMetadataEntry.text(label="label", text="text", description="description")
            ],
        )

    @solid(input_defs=[InputDefinition("_fail_input", root_manager_key="should_fail")])
    def fail_on_input(_, _fail_input):
        assert False, "should not be called"

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"should_fail": should_fail})])
    def simple():
        fail_on_input()

    with tempfile.TemporaryDirectory() as tmpdir_path:

        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple, instance=instance, raise_on_error=False)

        assert not result.success

        failure_data = result.result_for_solid("fail_on_input").failure_data

        assert failure_data.error.cls_name == "Failure"

        assert failure_data.user_failure_data.description == "Foolure"
        assert failure_data.user_failure_data.metadata_entries[0].label == "label"
        assert failure_data.user_failure_data.metadata_entries[0].entry_data.text == "text"
        assert failure_data.user_failure_data.metadata_entries[0].description == "description"
Example #3
0
    def run_daemon_loop(
        self,
        instance_ref,
        daemon_uuid,
        daemon_shutdown_event,
        gen_workspace,
        heartbeat_interval_seconds,
        error_interval_seconds,
        until=None,
    ):
        from dagster.core.telemetry_upload import uploading_logging_thread

        # Each loop runs in its own thread with its own instance and IWorkspace
        with DagsterInstance.from_ref(instance_ref) as instance:
            with uploading_logging_thread():
                with gen_workspace(instance) as workspace:
                    check.inst_param(workspace, "workspace", IWorkspace)

                    daemon_generator = self.core_loop(instance, workspace)

                    try:
                        while (not daemon_shutdown_event.is_set()) and (
                            not until or pendulum.now("UTC") < until
                        ):
                            try:
                                result = check.opt_inst(
                                    next(daemon_generator), SerializableErrorInfo
                                )
                                if result:
                                    self._errors.appendleft((result, pendulum.now("UTC")))
                            except StopIteration:
                                self._logger.error(
                                    "Daemon loop finished without raising an error - daemon loops should run forever until they are interrupted."
                                )
                                break
                            except Exception:
                                error_info = serializable_error_info_from_exc_info(sys.exc_info())
                                self._logger.error(
                                    "Caught error, daemon loop will restart:\n{}".format(error_info)
                                )
                                self._errors.appendleft((error_info, pendulum.now("UTC")))
                                daemon_generator.close()
                                daemon_generator = self.core_loop(instance, workspace)
                            finally:
                                try:
                                    self._check_add_heartbeat(
                                        instance,
                                        daemon_uuid,
                                        heartbeat_interval_seconds,
                                        error_interval_seconds,
                                    )
                                except Exception:
                                    self._logger.error(
                                        "Failed to add heartbeat: \n{}".format(
                                            serializable_error_info_from_exc_info(sys.exc_info())
                                        )
                                    )
                    finally:
                        # cleanup the generator if it was stopped part-way through
                        daemon_generator.close()
Example #4
0
def test_valid_log_level_instance_yaml():
    ref = InstanceRef.from_dir(
        base_dir=file_relative_path(__file__,
                                    "../../../docs_snippets/concepts/logging"),
        config_filename="python_logging_python_log_level_config.yaml",
    )
    instance = DagsterInstance.from_ref(ref)
    assert instance.python_log_level == "INFO"
Example #5
0
def test_valid_managed_loggers_instance_yaml():
    ref = InstanceRef.from_dir(
        base_dir=file_relative_path(__file__,
                                    "../../../docs_snippets/concepts/logging"),
        config_filename="python_logging_managed_loggers_config.yaml",
    )
    instance = DagsterInstance.from_ref(ref)
    assert instance.managed_python_loggers == ["root"]
 def _start_pipeline_execution(self, job_args):
     handle = job_args['handle']
     pipeline_run = job_args['pipeline_run']
     pipeline = handle.build_repository_definition().get_pipeline(
         pipeline_run.pipeline_name)
     instance = DagsterInstance.from_ref(job_args['instance_ref'])
     self._delegate.execute_pipeline(handle, pipeline, pipeline_run,
                                     instance)
Example #7
0
def test_valid_handler_instance_yaml():
    ref = InstanceRef.from_dir(
        base_dir=file_relative_path(__file__,
                                    "../../../docs_snippets/concepts/logging"),
        config_filename="python_logging_handler_config.yaml",
    )
    instance = DagsterInstance.from_ref(ref)
    assert len(instance.get_handlers()) == 2
Example #8
0
def test_output_manager_with_failure():
    _called_input_manager = False
    _called_solid = False

    @output_manager
    def should_fail(_, _obj):
        raise Failure(
            description="Foolure",
            metadata_entries=[
                EventMetadataEntry.text(label="label", text="text", description="description")
            ],
        )

    @input_manager
    def should_not_enter(_):
        _called_input_manager = True

    @solid(output_defs=[OutputDefinition(manager_key="should_fail")])
    def emit_str(_):
        return "emit"

    @solid(
        input_defs=[
            InputDefinition(name="_input_str", dagster_type=str, manager_key="should_not_enter")
        ]
    )
    def should_not_call(_, _input_str):
        _called_solid = True

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={"should_fail": should_fail, "should_not_enter": should_not_enter}
            )
        ]
    )
    def simple():
        should_not_call(emit_str())

    with tempfile.TemporaryDirectory() as tmpdir_path:

        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple, instance=instance, raise_on_error=False)

        assert not result.success

        failure_data = result.result_for_solid("emit_str").failure_data

        assert failure_data.error.cls_name == "Failure"

        assert failure_data.user_failure_data.description == "Foolure"
        assert failure_data.user_failure_data.metadata_entries[0].label == "label"
        assert failure_data.user_failure_data.metadata_entries[0].entry_data.text == "text"
        assert failure_data.user_failure_data.metadata_entries[0].description == "description"

        assert not _called_input_manager and not _called_solid
Example #9
0
    def _execute_plan(_self, instance_ref_dict, handle_dict, run_id, step_keys,
                      retries_dict):
        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.dict_param(handle_dict, 'handle_dict')
        check.str_param(run_id, 'run_id')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.dict_param(retries_dict, 'retries_dict')

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        handle = ExecutionTargetHandle.from_dict(handle_dict)
        retries = Retries.from_config(retries_dict)

        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        pipeline_def = handle.build_pipeline_definition().build_sub_pipeline(
            pipeline_run.selector.solid_subset)

        step_keys_str = ", ".join(step_keys)

        execution_plan = create_execution_plan(
            pipeline_def,
            pipeline_run.environment_dict,
            mode=pipeline_run.mode,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        ).build_subset_plan(step_keys)

        engine_event = instance.report_engine_event(
            'Executing steps {} in celery worker'.format(step_keys_str),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, 'step_keys'),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryEngine,
            step_key=execution_plan.step_key_for_single_step_plans(),
        )

        events = [engine_event]
        for step_event in execute_plan_iterator(
                execution_plan,
                pipeline_run=pipeline_run,
                environment_dict=pipeline_run.environment_dict,
                instance=instance,
                retries=retries,
        ):
            events.append(step_event)

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #10
0
    def _execute_plan(_self, instance_ref_dict, executable_dict, run_id,
                      step_keys, retries_dict):
        check.dict_param(instance_ref_dict, "instance_ref_dict")
        check.dict_param(executable_dict, "executable_dict")
        check.str_param(run_id, "run_id")
        check.list_param(step_keys, "step_keys", of_type=str)
        check.dict_param(retries_dict, "retries_dict")

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        retries = Retries.from_config(retries_dict)

        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, "Could not load run {}".format(run_id))

        step_keys_str = ", ".join(step_keys)

        execution_plan = create_execution_plan(
            pipeline,
            pipeline_run.run_config,
            mode=pipeline_run.mode,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        ).build_subset_plan(step_keys)

        engine_event = instance.report_engine_event(
            "Executing steps {} in celery worker".format(step_keys_str),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "step_keys"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryExecutor,
            step_key=execution_plan.step_key_for_single_step_plans(),
        )

        events = [engine_event]
        for step_event in execute_plan_iterator(
                execution_plan,
                pipeline_run=pipeline_run,
                run_config=pipeline_run.run_config,
                instance=instance,
                retries=retries,
        ):
            events.append(step_event)

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #11
0
    def run_loop(
        self,
        instance_ref,
        daemon_uuid,
        daemon_shutdown_event,
        gen_workspace,
        heartbeat_interval_seconds,
        error_interval_seconds,
        until=None,
    ):
        # Each loop runs in its own thread with its own instance and IWorkspace
        with DagsterInstance.from_ref(instance_ref) as instance:
            with gen_workspace(instance) as workspace:
                check.inst_param(workspace, "workspace", IWorkspace)

                while not daemon_shutdown_event.is_set() and (
                        not until or pendulum.now("UTC") < until):
                    curr_time = pendulum.now("UTC")
                    if (not self._last_iteration_time
                            or (curr_time - self._last_iteration_time
                                ).total_seconds() >= self.interval_seconds):
                        self._last_iteration_time = curr_time
                        self._run_iteration(
                            instance,
                            daemon_uuid,
                            daemon_shutdown_event,
                            workspace,
                            heartbeat_interval_seconds,
                            error_interval_seconds,
                            until,
                        )

                    try:
                        self._check_add_heartbeat(
                            instance,
                            daemon_uuid,
                            heartbeat_interval_seconds,
                            error_interval_seconds,
                        )
                    except Exception:  # pylint: disable=broad-except
                        self._logger.error(
                            "Failed to add heartbeat: \n{}".format(
                                serializable_error_info_from_exc_info(
                                    sys.exc_info())))
                    daemon_shutdown_event.wait(0.5)
    def in_mp_process(cls, handle, pipeline_run, instance_ref, term_event):
        """
        Execute pipeline using message queue as a transport
        """
        run_id = pipeline_run.run_id
        pipeline_name = pipeline_run.pipeline_name

        instance = DagsterInstance.from_ref(instance_ref)
        pid = os.getpid()
        instance.report_engine_event(
            'Started process for pipeline (pid: {pid}).'.format(pid=pid),
            pipeline_run,
            EngineEventData.in_process(pid,
                                       marker_end='dagit_subprocess_init'),
            cls,
        )

        start_termination_thread(term_event)

        try:
            handle.build_repository_definition()
            pipeline_def = handle.with_pipeline_name(
                pipeline_name).build_pipeline_definition()
        except Exception:  # pylint: disable=broad-except
            instance.report_engine_event(
                'Failed attempting to load pipeline "{}"'.format(
                    pipeline_name),
                pipeline_run,
                EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())),
                cls,
            )
            return

        try:
            event_list = []
            for event in execute_run_iterator(
                    pipeline_def.build_sub_pipeline(
                        pipeline_run.selector.solid_subset),
                    pipeline_run,
                    instance,
            ):
                event_list.append(event)
            return PipelineExecutionResult(pipeline_def, run_id, event_list,
                                           lambda: None)

        # Add a DagsterEvent for unexpected exceptions
        # Explicitly ignore KeyboardInterrupts since they are used for termination
        except DagsterSubprocessError as err:
            if not all([
                    err_info.cls_name == 'KeyboardInterrupt'
                    for err_info in err.subprocess_error_infos
            ]):
                instance.report_engine_event(
                    'An exception was thrown during execution that is likely a framework error, '
                    'rather than an error in user code.',
                    pipeline_run,
                    EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())),
                    cls,
                )
        except Exception:  # pylint: disable=broad-except
            instance.report_engine_event(
                'An exception was thrown during execution that is likely a framework error, '
                'rather than an error in user code.',
                pipeline_run,
                EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())),
                cls,
            )
        finally:
            instance.report_engine_event(
                'Process for pipeline exited (pid: {pid}).'.format(pid=pid),
                pipeline_run,
                cls=cls,
            )
Example #13
0
    def _execute_step_k8s_job(
        self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        pipeline_origin_packed,
        user_defined_k8s_config_dict=None,
        kubeconfig_file=None,
    ):
        """Run step execution in a K8s job pod.
        """

        check.dict_param(instance_ref_dict, "instance_ref_dict")
        check.list_param(step_keys, "step_keys", of_type=str)
        check.invariant(
            len(step_keys) == 1, "Celery K8s task executor can only execute 1 step at a time"
        )
        check.dict_param(run_config, "run_config")
        check.str_param(mode, "mode")
        check.str_param(repo_name, "repo_name")
        check.str_param(repo_location_name, "repo_location_name")
        check.str_param(run_id, "run_id")

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
        check.str_param(job_namespace, "job_namespace")

        check.bool_param(load_incluster_config, "load_incluster_config")
        check.dict_param(retries_dict, "retries_dict")

        pipeline_origin = unpack_value(
            check.dict_param(
                pipeline_origin_packed, "pipeline_origin_packed"
            )  # TODO: make part of args
        )
        check.inst(pipeline_origin, PipelineOrigin)

        user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
            user_defined_k8s_config_dict
        )
        check.opt_inst_param(
            user_defined_k8s_config, "user_defined_k8s_config", UserDefinedDagsterK8sConfig,
        )
        check.opt_str_param(kubeconfig_file, "kubeconfig_file")

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)

        check.invariant(pipeline_run, "Could not load run {}".format(run_id))
        step_key = step_keys[0]

        celery_worker_name = self.request.hostname
        celery_pod_name = os.environ.get("HOSTNAME")
        instance.report_engine_event(
            "Task for step {step_key} picked up by Celery".format(step_key=step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                    EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )

        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                "Not scheduling step because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([EventMetadataEntry.text(step_key, "Step keys"),]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(run_id, step_key)

        retries = Retries.from_config(retries_dict)

        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
            pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        else:
            job_name = "dagster-job-%s" % (k8s_name_key)
            pod_name = "dagster-job-%s" % (k8s_name_key)

        input_json = serialize_dagster_namedtuple(
            ExecuteStepArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run_id,
                instance_ref=None,
                mode=mode,
                step_keys_to_execute=step_keys,
                run_config=run_config,
                retries_dict=retries_dict,
            )
        )
        command = ["dagster"]
        args = ["api", "execute_step_with_structured_logs", input_json]

        job = construct_dagster_k8s_job(
            job_config, command, args, job_name, user_defined_k8s_config, pod_name
        )

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            "Executing step {} in Kubernetes job {}".format(step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step keys"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    EventMetadataEntry.text(job_config.job_image, "Job image"),
                    EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                    EventMetadataEntry.text(
                        str(job_config.image_pull_secrets), "Image pull secrets"
                    ),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name), "Service account name"
                    ),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_key,
        )
        events.append(engine_event)

        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == "Conflict":
                # There is an existing job with the same name so do not procede.
                instance.report_engine_event(
                    "Did not create Kubernetes job {} for step {} since job name already "
                    "exists, exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step keys"),
                            EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                            EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                    "exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step keys"),
                            EventMetadataEntry.text(e, "Error"),
                        ]
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            return

        try:
            wait_for_job_success(
                job_name=job_name, namespace=job_namespace, instance=instance, run_id=run_id,
            )
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                "Terminating Kubernetes Job because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                        EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return

        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            "Retrieving logs from Kubernetes Job pods",
            pipeline_run,
            EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split("\n")

        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [serialize_dagster_namedtuple(event) for event in events]
        return serialized_events
Example #14
0
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        environment_dict,
        mode,
        pipeline_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        kubeconfig_file=None,
    ):
        '''Run step execution in a K8s job pod.
        '''
        from dagster_k8s.job import DagsterK8sJobConfig, construct_dagster_graphql_k8s_job
        from dagster_k8s.utils import get_pod_names_in_job, retrieve_pod_logs, wait_for_job_success

        import kubernetes

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
        )
        check.dict_param(environment_dict, 'environment_dict')
        check.str_param(mode, 'mode')
        check.str_param(pipeline_name, 'pipeline_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')
        check.bool_param(load_incluster_config, 'load_incluster_config')
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_keys_str = ", ".join(step_keys)

        # Ensure we stay below k8s name length limits
        k8s_name_key = _get_k8s_name_key(run_id, step_keys)
        job_name = 'dagster-stepjob-%s' % k8s_name_key
        pod_name = 'dagster-stepjob-%s' % k8s_name_key

        variables = construct_variables(mode, environment_dict, pipeline_name, run_id, step_keys)
        args = ['-p', 'executePlan', '-v', seven.json.dumps(variables)]

        job = construct_dagster_graphql_k8s_job(job_config, args, job_name, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            'Executing steps {} in Kubernetes job {}'.format(step_keys_str, job.metadata.name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, 'Step keys'),
                    EventMetadataEntry.text(job.metadata.name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                    EventMetadataEntry.text(
                        str(job_config.image_pull_secrets), 'Image pull secrets'
                    ),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name), 'Service account name'
                    ),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobEngine,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_keys[0],
        )
        events.append(engine_event)

        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

        wait_for_job_success(job.metadata.name, namespace=job_namespace)
        pod_names = get_pod_names_in_job(job.metadata.name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobEngine,
            step_key=step_keys[0],
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split('\n')

        res = parse_raw_log_lines(logs)

        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [serialize_dagster_namedtuple(event) for event in events]
        return serialized_events
Example #15
0
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        pipeline_origin_packed,
        resources=None,
        kubeconfig_file=None,
    ):
        '''Run step execution in a K8s job pod.
        '''

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1,
            'Celery K8s task executor can only execute 1 step at a time')
        check.dict_param(run_config, 'run_config')
        check.str_param(mode, 'mode')
        check.str_param(repo_name, 'repo_name')
        check.str_param(repo_location_name, 'repo_location_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')

        check.bool_param(load_incluster_config, 'load_incluster_config')
        check.dict_param(retries_dict, 'retries_dict')

        pipeline_origin = unpack_value(
            check.dict_param(
                pipeline_origin_packed,
                'pipeline_origin_packed')  # TODO: make part of args
        )
        check.inst(pipeline_origin, PipelineOrigin)

        check.opt_dict_param(resources,
                             'resources',
                             key_type=str,
                             value_type=dict)
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)

        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_key = step_keys[0]
        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                'Not scheduling step because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(run_id, step_key)

        retries = Retries.from_config(retries_dict)

        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
            pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        else:
            job_name = 'dagster-job-%s' % (k8s_name_key)
            pod_name = 'dagster-job-%s' % (k8s_name_key)

        input_json = serialize_dagster_namedtuple(
            ExecuteStepArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run_id,
                instance_ref=None,
                mode=mode,
                step_keys_to_execute=step_keys,
                run_config=run_config,
                retries_dict=retries_dict,
            ))
        command = ['dagster']
        args = ['api', 'execute_step_with_structured_logs', input_json]

        job = construct_dagster_k8s_job(job_config, command, args, job_name,
                                        resources, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            'Executing step {} in Kubernetes job {}'.format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            'Image pull policy'),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            'Image pull secrets'),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        'Service account name'),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_key,
        )
        events.append(engine_event)

        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(
                body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == 'Conflict':
                # There is an existing job with the same name so do not procede.
                instance.report_engine_event(
                    'Did not create Kubernetes job {} for step {} since job name already '
                    'exists, exiting.'.format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, 'Step keys'),
                            EventMetadataEntry.text(job_name,
                                                    'Kubernetes Job name'),
                            EventMetadataEntry.text(pod_name,
                                                    'Kubernetes Pod name'),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    'Encountered unexpected error while creating Kubernetes job {} for step {}, '
                    'exiting.'.format(job_name, step_key),
                    pipeline_run,
                    EngineEventData([
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        EventMetadataEntry.text(e, 'Error'),
                    ]),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            return

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=run_id,
            )
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                'Terminating Kubernetes Job because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace,
                                            'Kubernetes Job namespace'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return

        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split('\n')

        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #16
0
    def _execute_step_docker(
        self,
        execute_step_args_packed,
        docker_config,
    ):
        """Run step execution in a Docker container."""
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)

        check.dict_param(docker_config, "docker_config")

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)
        check.inst(
            pipeline_run,
            PipelineRun,
            "Could not load run {}".format(execute_step_args.pipeline_run_id),
        )
        step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

        input_json = serialize_dagster_namedtuple(execute_step_args)

        command = "dagster api execute_step {}".format(json.dumps(input_json))

        docker_image = (docker_config["image"]
                        if docker_config.get("image") else execute_step_args.
                        pipeline_origin.repository_origin.container_image)

        if not docker_image:
            raise Exception(
                "No docker image specified by either the job or the repository"
            )

        client = docker.client.from_env()

        if docker_config.get("registry"):
            client.login(
                registry=docker_config["registry"]["url"],
                username=docker_config["registry"]["username"],
                password=docker_config["registry"]["password"],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            "Executing steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry.text(step_keys_str, "Step keys"),
                    MetadataEntry.text(docker_image, "Image"),
                    MetadataEntry.text(self.request.hostname, "Celery worker"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )

        serialized_events = [serialize_dagster_namedtuple(engine_event)]

        docker_env = {}
        if docker_config.get("env_vars"):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config["env_vars"]
            }

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                detach=False,
                auto_remove=True,
                # pass through this worker's environment for things like AWS creds etc.
                environment=docker_env,
                network=docker_config.get("network", None),
            )

            res = docker_response.decode("utf-8")
        except docker.errors.ContainerError as err:
            instance.report_engine_event(
                "Failed to run steps {} in Docker container {}".format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    MetadataEntry.text(docker_image, "Job image"),
                    MetadataEntry.text(err.stderr, "Docker stderr"),
                ], ),
                CeleryDockerExecutor,
                step_key=execute_step_args.step_keys_to_execute[0],
            )
            raise
        else:
            if res is None:
                raise Exception(
                    "No response from execute_step in CeleryDockerExecutor")

            serialized_events += [event for event in res.split("\n") if event]

        return serialized_events
Example #17
0
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        resources=None,
        kubeconfig_file=None,
    ):
        '''Run step execution in a K8s job pod.
        '''

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1,
            'Celery K8s task executor can only execute 1 step at a time')
        check.dict_param(run_config, 'run_config')
        check.str_param(mode, 'mode')
        check.str_param(repo_name, 'repo_name')
        check.str_param(repo_location_name, 'repo_location_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')

        check.bool_param(load_incluster_config, 'load_incluster_config')
        check.dict_param(retries_dict, 'retries_dict')

        check.opt_dict_param(resources,
                             'resources',
                             key_type=str,
                             value_type=dict)
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_key = step_keys[0]
        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                'Not scheduling step because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return

        # Ensure we stay below k8s name length limits
        k8s_name_key = _get_k8s_name_key(run_id, step_keys)

        retries = Retries.from_config(retries_dict)

        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
            pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        else:
            job_name = 'dagster-job-%s' % (k8s_name_key)
            pod_name = 'dagster-job-%s' % (k8s_name_key)

        variables = {
            'executionParams': {
                'runConfigData': run_config,
                'mode': mode,
                'selector': {
                    'repositoryLocationName':
                    repo_location_name,
                    'repositoryName':
                    repo_name,
                    'pipelineName':
                    pipeline_run.pipeline_name,
                    'solidSelection':
                    list(pipeline_run.solids_to_execute)
                    if pipeline_run.solids_to_execute else None,
                },
                'executionMetadata': {
                    'runId': run_id
                },
                'stepKeys': step_keys,
            },
            'retries': retries.to_graphql_input(),
        }
        args = [
            '-p', 'executePlan', '-v',
            seven.json.dumps(variables), '--remap-sigterm'
        ]

        job = construct_dagster_graphql_k8s_job(job_config, args, job_name,
                                                resources, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            'Executing step {} in Kubernetes job {}'.format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            'Image pull policy'),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            'Image pull secrets'),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        'Service account name'),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_key,
        )
        events.append(engine_event)

        kubernetes.client.BatchV1Api().create_namespaced_job(
            body=job, namespace=job_namespace)

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=run_id,
            )
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                'Terminating Kubernetes Job because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace,
                                            'Kubernetes Job namespace'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return

        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split('\n')

        res = parse_raw_log_lines(logs)
        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
def test_output_manager_with_retries():
    _called = False
    _count = {"total": 0}

    @object_manager
    def should_succeed(_):
        class FakeObjectManager(ObjectManager):
            def load_input(self, _context):
                return "foo"

            def handle_output(self, _context, _obj):
                if _count["total"] < 2:
                    _count["total"] += 1
                    raise RetryRequested(max_retries=3)

        return FakeObjectManager()

    @object_manager
    def should_retry(_):
        class FakeObjectManager(ObjectManager):
            def load_input(self, _context):
                return "foo"

            def handle_output(self, _context, _obj):
                raise RetryRequested(max_retries=3)

        return FakeObjectManager()

    @pipeline(mode_defs=[
        ModeDefinition(resource_defs={
            "should_succeed": should_succeed,
            "should_retry": should_retry,
        })
    ])
    def simple():
        @solid(
            output_defs=[OutputDefinition(manager_key="should_succeed")], )
        def source_solid(_):
            return "foo"

        @solid(
            input_defs=[InputDefinition("solid_input")],
            output_defs=[OutputDefinition(manager_key="should_retry")],
        )
        def take_input(_, solid_input):
            return solid_input

        @solid(input_defs=[InputDefinition("_solid_input")])
        def should_not_execute(_, _solid_input):
            _called = True

        should_not_execute(take_input(source_solid()))

    with tempfile.TemporaryDirectory() as tmpdir_path:

        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple,
                                  instance=instance,
                                  raise_on_error=False)

        step_stats = instance.get_run_step_stats(result.run_id)
        assert len(step_stats) == 2

        step_stats_1 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["source_solid"])
        assert len(step_stats_1) == 1
        step_stat_1 = step_stats_1[0]
        assert step_stat_1.status.value == "SUCCESS"
        assert step_stat_1.attempts == 3

        step_stats_2 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input"])
        assert len(step_stats_2) == 1
        step_stat_2 = step_stats_2[0]
        assert step_stat_2.status.value == "FAILURE"
        assert step_stat_2.attempts == 4

        step_stats_3 = instance.get_run_step_stats(
            result.run_id, step_keys=["should_not_execute"])
        assert len(step_stats_3) == 0
        assert _called == False
def test_input_manager_with_retries():
    _count = {"total": 0}

    @root_input_manager
    def should_succeed_after_retries(_):
        if _count["total"] < 2:
            _count["total"] += 1
            raise RetryRequested(max_retries=3)
        return "foo"

    @root_input_manager
    def should_retry(_):
        raise RetryRequested(max_retries=3)

    @solid(input_defs=[
        InputDefinition("solid_input",
                        root_manager_key="should_succeed_after_retries")
    ])
    def take_input_1(_, solid_input):
        return solid_input

    @solid(input_defs=[
        InputDefinition("solid_input", root_manager_key="should_retry")
    ])
    def take_input_2(_, solid_input):
        return solid_input

    @solid
    def take_input_3(_, _input1, _input2):
        assert False, "should not be called"

    @pipeline(mode_defs=[
        ModeDefinition(
            resource_defs={
                "should_succeed_after_retries": should_succeed_after_retries,
                "should_retry": should_retry,
            })
    ])
    def simple():
        take_input_3(take_input_2(), take_input_1())

    with tempfile.TemporaryDirectory() as tmpdir_path:

        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple,
                                  instance=instance,
                                  raise_on_error=False)

        step_stats = instance.get_run_step_stats(result.run_id)
        assert len(step_stats) == 2

        step_stats_1 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_1"])
        assert len(step_stats_1) == 1
        step_stat_1 = step_stats_1[0]
        assert step_stat_1.status.value == "SUCCESS"
        assert step_stat_1.attempts == 3

        step_stats_2 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_2"])
        assert len(step_stats_2) == 1
        step_stat_2 = step_stats_2[0]
        assert step_stat_2.status.value == "FAILURE"
        assert step_stat_2.attempts == 4

        step_stats_3 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_3"])
        assert len(step_stats_3) == 0
Example #20
0
 def _start_pipeline_execution(self, job_args):
     pipeline_dict = job_args['pipeline_dict']
     pipeline_run = job_args['pipeline_run']
     pipeline = InterProcessExecutablePipeline.from_dict(pipeline_dict)
     instance = DagsterInstance.from_ref(job_args['instance_ref'])
     self._delegate.execute_pipeline(pipeline, pipeline_run, instance)
Example #21
0
    def _execute_step_docker(
        self,
        execute_step_args_packed,
        docker_config,
    ):
        """Run step execution in a Docker container."""
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)

        check.dict_param(docker_config, "docker_config")

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)
        check.inst(
            pipeline_run,
            PipelineRun,
            "Could not load run {}".format(execute_step_args.pipeline_run_id),
        )
        step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

        input_json = serialize_dagster_namedtuple(execute_step_args)

        command = "dagster api execute_step {}".format(json.dumps(input_json))

        docker_image = (docker_config["image"]
                        if docker_config.get("image") else execute_step_args.
                        pipeline_origin.repository_origin.container_image)

        if not docker_image:
            raise Exception(
                "No docker image specified by either the job or the repository"
            )

        client = docker.client.from_env()

        if docker_config.get("registry"):
            client.login(
                registry=docker_config["registry"]["url"],
                username=docker_config["registry"]["username"],
                password=docker_config["registry"]["password"],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            "Executing steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry("Step keys", value=step_keys_str),
                    MetadataEntry("Image", value=docker_image),
                    MetadataEntry("Celery worker",
                                  value=self.request.hostname),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )

        serialized_events = [serialize_dagster_namedtuple(engine_event)]

        docker_env = {}
        if docker_config.get("env_vars"):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config["env_vars"]
            }

        container_kwargs = check.opt_dict_param(
            docker_config.get("container_kwargs"),
            "container_kwargs",
            key_type=str)

        # set defaults for detach and auto_remove
        container_kwargs["detach"] = container_kwargs.get("detach", False)
        container_kwargs["auto_remove"] = container_kwargs.get(
            "auto_remove", True)

        # if environment variables are provided via container_kwargs, merge with env_vars
        if container_kwargs.get("environment") is not None:
            e_vars = container_kwargs.get("environment")
            if isinstance(e_vars, dict):
                docker_env.update(e_vars)
            else:
                for v in e_vars:
                    key, val = v.split("=")
                    docker_env[key] = val
            del container_kwargs["environment"]

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                # pass through this worker's environment for things like AWS creds etc.
                environment=docker_env,
                network=docker_config.get("network", None),
                **container_kwargs,
            )

            res = docker_response.decode("utf-8")
        except docker.errors.ContainerError as err:
            entries = [MetadataEntry("Job image", value=docker_image)]
            if err.stderr is not None:
                entries.append(MetadataEntry("Docker stderr",
                                             value=err.stderr))

            instance.report_engine_event(
                "Failed to run steps {} in Docker container {}".format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData(entries),
                CeleryDockerExecutor,
                step_key=execute_step_args.step_keys_to_execute[0],
            )
            raise
        else:
            if res is None:
                raise Exception(
                    "No response from execute_step in CeleryDockerExecutor")

            serialized_events += [event for event in res.split("\n") if event]

        return serialized_events
Example #22
0
    def _execute_step_docker(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        docker_config,
    ):
        """Run step execution in a Docker container.
        """
        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, "Could not load run {}".format(run_id))

        step_keys_str = ", ".join(step_keys)

        variables = {
            "executionParams": {
                "runConfigData": run_config,
                "mode": mode,
                "selector": {
                    "repositoryLocationName":
                    repo_location_name,
                    "repositoryName":
                    repo_name,
                    "pipelineName":
                    pipeline_run.pipeline_name,
                    "solidSelection":
                    list(pipeline_run.solids_to_execute)
                    if pipeline_run.solids_to_execute else None,
                },
                "executionMetadata": {
                    "runId": run_id
                },
                "stepKeys": step_keys,
            }
        }

        command = "dagster-graphql -v '{variables}' -p executePlan".format(
            variables=seven.json.dumps(variables))
        docker_image = docker_config["image"]
        client = docker.client.from_env()

        if docker_config.get("registry"):
            client.login(
                registry=docker_config["registry"]["url"],
                username=docker_config["registry"]["username"],
                password=docker_config["registry"]["password"],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            "Executing steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "Step keys"),
                    EventMetadataEntry.text(docker_image, "Image"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )

        events = [engine_event]

        docker_env = {}
        if docker_config.get("env_vars"):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config["env_vars"]
            }

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                detach=False,
                auto_remove=True,
                # pass through this worker's environment for things like AWS creds etc.
                environment=docker_env,
            )
            res = seven.json.loads(docker_response)

        except docker.errors.ContainerError as err:
            instance.report_engine_event(
                "Failed to run steps {} in Docker container {}".format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise

        except JSONDecodeError:
            instance.report_engine_event(
                "Failed to parse response for steps {} from Docker container {}"
                .format(step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(docker_response,
                                            "Docker Response"),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise

        else:
            handle_execution_errors(res, "executePlan")
            step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #23
0
    def _execute_step_docker(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        docker_config,
    ):
        '''Run step execution in a Docker container.
        '''
        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_keys_str = ", ".join(step_keys)

        variables = {
            'executionParams': {
                'runConfigData': run_config,
                'mode': mode,
                'selector': {
                    'repositoryLocationName': repo_location_name,
                    'repositoryName': repo_name,
                    'pipelineName': pipeline_run.pipeline_name,
                    'solidSelection': pipeline_run.solid_selection,
                },
                'executionMetadata': {
                    'runId': run_id
                },
                'stepKeys': step_keys,
            }
        }

        command = 'dagster-graphql -v \'{variables}\' -p executePlan'.format(
            variables=seven.json.dumps(variables))
        docker_image = docker_config['image']
        client = docker.client.from_env()

        if docker_config.get('registry'):
            client.login(
                registry=docker_config['registry']['url'],
                username=docker_config['registry']['username'],
                password=docker_config['registry']['password'],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            'Executing steps {} in Docker container {}'.format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, 'Step keys'),
                    EventMetadataEntry.text(docker_image, 'Image'),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )

        events = [engine_event]

        docker_env = {}
        if docker_config.get('env_vars'):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config['env_vars']
            }

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                detach=False,
                auto_remove=True,
                # pass through this worker's environment for things like AWS creds etc.
                environment=docker_env,
            )
            res = seven.json.loads(docker_response)

        except docker.errors.ContainerError as err:
            instance.report_engine_event(
                'Failed to run steps {} in Docker container {}'.format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, 'Job image'),
                    EventMetadataEntry.text(err.stderr, 'Docker stderr'),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise

        except JSONDecodeError:
            instance.report_engine_event(
                'Failed to parse response for steps {} from Docker container {}'
                .format(step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, 'Job image'),
                    EventMetadataEntry.text(docker_response,
                                            'Docker Response'),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise

        else:
            handle_execution_errors(res, 'executePlan')
            step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #24
0
    def _execute_step_k8s_job(
        self,
        execute_step_args_packed,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        user_defined_k8s_config_dict=None,
        kubeconfig_file=None,
    ):
        """Run step execution in a K8s job pod."""
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)
        check.invariant(
            len(execute_step_args.step_keys_to_execute) == 1,
            "Celery K8s task executor can only execute 1 step at a time",
        )

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
        check.str_param(job_namespace, "job_namespace")

        check.bool_param(load_incluster_config, "load_incluster_config")

        user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
            user_defined_k8s_config_dict)
        check.opt_inst_param(
            user_defined_k8s_config,
            "user_defined_k8s_config",
            UserDefinedDagsterK8sConfig,
        )
        check.opt_str_param(kubeconfig_file, "kubeconfig_file")

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)

        check.inst(
            pipeline_run,
            PipelineRun,
            "Could not load run {}".format(execute_step_args.pipeline_run_id),
        )
        step_key = execute_step_args.step_keys_to_execute[0]

        celery_worker_name = self.request.hostname
        celery_pod_name = os.environ.get("HOSTNAME")
        instance.report_engine_event(
            "Task for step {step_key} picked up by Celery".format(
                step_key=step_key),
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(celery_worker_name,
                                        "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name,
                                        "Celery worker Kubernetes Pod name"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )

        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                "Not scheduling step because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id,
                                        step_key)

        retry_state = execute_step_args.known_state.get_retry_state()

        if retry_state.get_attempt_count(step_key):
            attempt_number = retry_state.get_attempt_count(step_key)
            job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
            pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        else:
            job_name = "dagster-job-%s" % (k8s_name_key)
            pod_name = "dagster-job-%s" % (k8s_name_key)

        input_json = serialize_dagster_namedtuple(execute_step_args)
        args = ["dagster", "api", "execute_step", input_json]

        job = construct_dagster_k8s_job(job_config, args, job_name,
                                        user_defined_k8s_config, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            "Executing step {} in Kubernetes job {}".format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_config.job_image, "Job image"),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            "Image pull policy"),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            "Image pull secrets"),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        "Service account name"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_key,
        )
        events.append(engine_event)
        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(
                body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == "Conflict":
                # There is an existing job with the same name so proceed and see if the existing job succeeded
                instance.report_engine_event(
                    "Did not create Kubernetes job {} for step {} since job name already "
                    "exists, proceeding with existing job.".format(
                        job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                            EventMetadataEntry.text(job_name,
                                                    "Kubernetes Job name"),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                    "exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                        ],
                        error=serializable_error_info_from_exc_info(
                            sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
                return []

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=execute_step_args.pipeline_run_id,
            )
        except (DagsterK8sError, DagsterK8sTimeoutError) as err:
            step_failure_event = construct_step_failure_event_and_handle(
                pipeline_run, step_key, err, instance=instance)
            events.append(step_failure_event)
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                "Terminating Kubernetes Job because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace,
                                            "Kubernetes Job namespace"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return []
        except (
                DagsterK8sUnrecoverableAPIError,
                DagsterK8sAPIRetryLimitExceeded,
                # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
                # a retry boundary. We still catch it here just in case we missed one so that we can
                # report it to the event log
                kubernetes.client.rest.ApiException,
        ) as err:
            instance.report_engine_event(
                "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(
                        sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        try:
            pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            instance.report_engine_event(
                "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(
                        sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            "Retrieving logs from Kubernetes Job pods",
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            try:
                raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
                logs += raw_logs.split("\n")
            except kubernetes.client.rest.ApiException as e:
                instance.report_engine_event(
                    "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                    "Pod name {} for step {}. Will attempt to continue with other pods."
                    .format(job_name, pod_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                        ],
                        error=serializable_error_info_from_exc_info(
                            sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )

        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #25
0
    def _execute_step_docker(
        self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        run_id,
        docker_config,
        pipeline_origin_packed,
        retries_dict,
    ):
        """Run step execution in a Docker container.
        """
        check.dict_param(instance_ref_dict, "instance_ref_dict")
        check.list_param(step_keys, "step_keys", of_type=str)
        check.dict_param(run_config, "run_config")
        check.str_param(mode, "mode")
        check.str_param(repo_name, "repo_name")
        check.str_param(run_id, "run_id")
        check.dict_param(docker_config, "docker_config")
        pipeline_origin = unpack_value(
            check.dict_param(pipeline_origin_packed, "pipeline_origin_packed"))
        check.dict_param(retries_dict, "retries_dict")

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, "Could not load run {}".format(run_id))

        step_keys_str = ", ".join(step_keys)

        input_json = serialize_dagster_namedtuple(
            ExecuteStepArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run_id,
                instance_ref=instance_ref,
                mode=mode,
                step_keys_to_execute=step_keys,
                run_config=run_config,
                retries_dict=retries_dict,
            ))

        command = "dagster api execute_step_with_structured_logs {}".format(
            json.dumps(input_json))

        docker_image = docker_config["image"]

        client = docker.client.from_env()

        if docker_config.get("registry"):
            client.login(
                registry=docker_config["registry"]["url"],
                username=docker_config["registry"]["username"],
                password=docker_config["registry"]["password"],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            "Executing steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "Step keys"),
                    EventMetadataEntry.text(docker_image, "Image"),
                    EventMetadataEntry.text(self.request.hostname,
                                            "Celery worker"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )

        serialized_events = [serialize_dagster_namedtuple(engine_event)]

        docker_env = {}
        if docker_config.get("env_vars"):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config["env_vars"]
            }

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                detach=False,
                auto_remove=True,
                # pass through this worker's environment for things like AWS creds etc.
                environment=docker_env,
                network=docker_config.get("network", None),
            )

            res = docker_response.decode("utf-8")
        except docker.errors.ContainerError as err:
            instance.report_engine_event(
                "Failed to run steps {} in Docker container {}".format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise
        else:
            if res is None:
                raise Exception(
                    "No response from execute_step_with_structured_logs in CeleryDockerExecutor"
                )

            serialized_events += [event for event in res.split("\n") if event]

        return serialized_events