Example #1
def write_unary_input(input_file, obj):
    check.str_param(input_file, 'input_file')
    check.not_none_param(obj, 'obj')
    with open(os.path.abspath(input_file), 'w') as fp:
        fp.write(serialize_dagster_namedtuple(obj))
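
A consumer-side sketch for the file written above, assuming the same dagster.serdes API these examples use; read_unary_input is a hypothetical helper name for illustration:

import os

from dagster.serdes import deserialize_json_to_dagster_namedtuple

def read_unary_input(input_file):
    # Rehydrate the namedtuple that write_unary_input serialized to disk.
    with open(os.path.abspath(input_file), 'r') as fp:
        return deserialize_json_to_dagster_namedtuple(fp.read())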
Example #2
def _send(file_path, obj):
    with open(os.path.abspath(file_path), 'a+') as fp:
        fp.write(serialize_dagster_namedtuple(obj) + '\n')
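
Because _send opens the file in append mode and terminates each record with a newline, the file becomes a stream of JSON lines. A minimal reader sketch for the receiving side (_read_stream is a hypothetical name):

from dagster.serdes import deserialize_json_to_dagster_namedtuple

def _read_stream(file_path):
    # Each non-empty line holds one serialized Dagster namedtuple.
    with open(file_path, 'r') as fp:
        for line in fp:
            if line.strip():
                yield deserialize_json_to_dagster_namedtuple(line)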
Example #3
    def _execute_plan(self, execute_step_args_packed, executable_dict):
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)

        check.dict_param(executable_dict, "executable_dict")

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        retry_mode = execute_step_args.retry_mode

        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)
        check.invariant(
            pipeline_run,
            "Could not load run {}".format(execute_step_args.pipeline_run_id))

        step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

        execution_plan = create_execution_plan(
            pipeline,
            pipeline_run.run_config,
            mode=pipeline_run.mode,
            step_keys_to_execute=execute_step_args.step_keys_to_execute,
            known_state=execute_step_args.known_state,
        )

        engine_event = instance.report_engine_event(
            "Executing steps {} in celery worker".format(step_keys_str),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "step_keys"),
                    EventMetadataEntry.text(self.request.hostname,
                                            "Celery worker"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryExecutor,
            step_key=execution_plan.step_handle_for_single_step_plans().to_key(),
        )

        events = [engine_event]
        for step_event in execute_plan_iterator(
                execution_plan,
                pipeline=pipeline,
                pipeline_run=pipeline_run,
                run_config=pipeline_run.run_config,
                instance=instance,
                retry_mode=retry_mode,
        ):
            events.append(step_event)

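        # Serialize all collected events to JSON strings for return to the caller.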
        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #4
def serialize_rt(value):
    return deserialize_json_to_dagster_namedtuple(serialize_dagster_namedtuple(value))
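
serialize_rt round-trips a value through its JSON form, a pattern tests use to assert that a type is registered with serdes and survives serialization unchanged. A usage sketch, assuming whitelist_for_serdes from dagster.serdes; MyRecord is a hypothetical type for illustration:

from collections import namedtuple

from dagster.serdes import whitelist_for_serdes

@whitelist_for_serdes
class MyRecord(namedtuple("_MyRecord", "name count")):
    pass

value = MyRecord(name="foo", count=3)
assert serialize_rt(value) == value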
Example #5
 def GetCurrentImage(self, request, _context):
     return api_pb2.GetCurrentImageReply(
         serialized_current_image=serialize_dagster_namedtuple(
             GetCurrentImageResult(current_image=_get_current_image(),
                                   serializable_error_info=None)))
Example #6
    def _execute_step_k8s_job(
        self,
        execute_step_args_packed,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        user_defined_k8s_config_dict=None,
        kubeconfig_file=None,
    ):
        """Run step execution in a K8s job pod."""
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)
        check.invariant(
            len(execute_step_args.step_keys_to_execute) == 1,
            "Celery K8s task executor can only execute 1 step at a time",
        )

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
        check.str_param(job_namespace, "job_namespace")

        check.bool_param(load_incluster_config, "load_incluster_config")

        user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
            user_defined_k8s_config_dict)
        check.opt_inst_param(
            user_defined_k8s_config,
            "user_defined_k8s_config",
            UserDefinedDagsterK8sConfig,
        )
        check.opt_str_param(kubeconfig_file, "kubeconfig_file")

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)

        check.inst(
            pipeline_run,
            PipelineRun,
            "Could not load run {}".format(execute_step_args.pipeline_run_id),
        )
        step_key = execute_step_args.step_keys_to_execute[0]

        celery_worker_name = self.request.hostname
        celery_pod_name = os.environ.get("HOSTNAME")
        instance.report_engine_event(
            "Task for step {step_key} picked up by Celery".format(
                step_key=step_key),
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(celery_worker_name,
                                        "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name,
                                        "Celery worker Kubernetes Pod name"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )

        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                "Not scheduling step because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id,
                                        step_key)

        retries = Retries.from_config(execute_step_args.retries_dict)

        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
            pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        else:
            job_name = "dagster-job-%s" % (k8s_name_key)
            pod_name = "dagster-job-%s" % (k8s_name_key)

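        # Serialize the step-execution args so the Kubernetes job can
        # rehydrate them via the `dagster api execute_step` CLI invocation below.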
        input_json = serialize_dagster_namedtuple(execute_step_args)
        args = ["dagster", "api", "execute_step", input_json]

        job = construct_dagster_k8s_job(job_config, args, job_name,
                                        user_defined_k8s_config, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            "Executing step {} in Kubernetes job {}".format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_config.job_image, "Job image"),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            "Image pull policy"),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            "Image pull secrets"),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        "Service account name"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # Validated above that step_keys has length 1; it is not possible to use ETH or an
            # execution plan in this function (Celery K8s workers should not have access to user code).
            step_key=step_key,
        )
        events.append(engine_event)
        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(
                body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == "Conflict":
                # There is an existing job with the same name so proceed and see if the existing job succeeded
                instance.report_engine_event(
                    "Did not create Kubernetes job {} for step {} since job name already "
                    "exists, proceeding with existing job.".format(
                        job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                            EventMetadataEntry.text(job_name,
                                                    "Kubernetes Job name"),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                    "exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                        ],
                        error=serializable_error_info_from_exc_info(
                            sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
                return []

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=execute_step_args.pipeline_run_id,
            )
        except (DagsterK8sError, DagsterK8sTimeoutError) as err:
            step_failure_event = construct_step_failure_event_and_handle(
                pipeline_run, step_key, err, instance=instance)
            events.append(step_failure_event)
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                "Terminating Kubernetes Job because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace,
                                            "Kubernetes Job namespace"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return []
        except (
                DagsterK8sUnrecoverableAPIError,
                DagsterK8sAPIRetryLimitExceeded,
                # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
                # a retry boundary. We still catch it here just in case we missed one so that we can
                # report it to the event log
                kubernetes.client.rest.ApiException,
        ) as err:
            instance.report_engine_event(
                "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(
                        sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        try:
            pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            instance.report_engine_event(
                "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(
                        sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            "Retrieving logs from Kubernetes Job pods",
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            try:
                raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
                logs += raw_logs.split("\n")
            except kubernetes.client.rest.ApiException as e:
                instance.report_engine_event(
                    "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                    "Pod name {} for step {}. Will attempt to continue with other pods."
                    .format(job_name, pod_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                        ],
                        error=serializable_error_info_from_exc_info(
                            sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )

        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #7
    def launch_run(self, context: LaunchRunContext) -> None:
        run = context.pipeline_run

        job_name = get_job_name_from_run_id(run.run_id)
        pod_name = job_name
        exc_config = _get_validated_celery_k8s_executor_config(run.run_config)
        env_vars = None

        job_image_from_executor_config = exc_config.get("job_image")

        pipeline_origin = context.pipeline_code_origin
        repository_origin = pipeline_origin.repository_origin

        job_image = repository_origin.container_image

        if job_image:
            if job_image_from_executor_config:
                self._instance.report_engine_event(
                    f"You have specified a job_image {job_image_from_executor_config} in your executor configuration, "
                    f"but also {job_image} in your user-code deployment. Using the job image {job_image_from_executor_config} "
                    f"from executor configuration as it takes precedence.",
                    run,
                    cls=self.__class__,
                )
                # Reassign after reporting so the message shows the user-code image.
                job_image = job_image_from_executor_config
        else:
            if not job_image_from_executor_config:
                raise DagsterInvariantViolationError(
                    "You have not specified a job_image in your executor configuration. "
                    "To resolve this error, specify the job_image configuration in the executor "
                    "config section in your run config. \n"
                    "Note: You may also be seeing this error because you are using the configured API. "
                    "Using configured with the celery-k8s executor is not supported at this time, "
                    "and the job_image must be configured at the top-level executor config without "
                    "using configured."
                )

            job_image = job_image_from_executor_config

        job_config = DagsterK8sJobConfig(
            dagster_home=self.dagster_home,
            instance_config_map=self.instance_config_map,
            postgres_password_secret=self.postgres_password_secret,
            job_image=check.str_param(job_image, "job_image"),
            image_pull_policy=exc_config.get("image_pull_policy"),
            image_pull_secrets=exc_config.get("image_pull_secrets"),
            service_account_name=exc_config.get("service_account_name"),
            env_config_maps=exc_config.get("env_config_maps"),
            env_secrets=exc_config.get("env_secrets"),
        )

        self._instance.add_run_tags(
            run.run_id,
            {DOCKER_IMAGE_TAG: job_config.job_image},
        )

        user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))

        from dagster.cli.api import ExecuteRunArgs

        input_json = serialize_dagster_namedtuple(
            # depends on DagsterInstance.get() returning the same instance
            # https://github.com/dagster-io/dagster/issues/2757
            ExecuteRunArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run.run_id,
                instance_ref=None,
            )
        )

        job = construct_dagster_k8s_job(
            job_config,
            args=["dagster", "api", "execute_run", input_json],
            job_name=job_name,
            pod_name=pod_name,
            component="run_coordinator",
            user_defined_k8s_config=user_defined_k8s_config,
            env_vars=env_vars,
        )

        job_namespace = exc_config.get("job_namespace")

        self._instance.report_engine_event(
            "Creating Kubernetes run worker job",
            run,
            EngineEventData(
                [
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Namespace"),
                    EventMetadataEntry.text(run.run_id, "Run ID"),
                ]
            ),
            cls=self.__class__,
        )

        self._batch_api.create_namespaced_job(body=job, namespace=job_namespace)
        self._instance.report_engine_event(
            "Kubernetes run worker job created",
            run,
            EngineEventData(
                [
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Namespace"),
                    EventMetadataEntry.text(run.run_id, "Run ID"),
                ]
            ),
            cls=self.__class__,
        )
Example #8
def test_execute_step_verify_step():
    with get_foo_pipeline_handle() as pipeline_handle:
        runner = CliRunner()

        with instance_for_test(
                overrides={
                    "compute_logs": {
                        "module":
                        "dagster.core.storage.noop_compute_log_manager",
                        "class": "NoOpComputeLogManager",
                    }
                }) as instance:
            run = create_run_for_test(
                instance,
                pipeline_name="foo",
                run_id="new_run",
                run_config={"storage": {
                    "filesystem": {}
                }},
            )

            input_json = serialize_dagster_namedtuple(
                ExecuteStepArgs(
                    pipeline_origin=pipeline_handle.get_python_origin(),
                    pipeline_run_id=run.run_id,
                    step_keys_to_execute=None,
                    instance_ref=instance.get_ref(),
                ))

            # Check that verify succeeds for a step that hasn't been run (case 3)
            retries = Retries.from_config({"enabled": {}})
            assert verify_step(instance,
                               run,
                               retries,
                               step_keys_to_execute=["do_something"])

            # Check that verify fails when trying to retry with no original attempt (case 3)
            retries = Retries.from_config({"enabled": {}})
            retries.mark_attempt("do_something")
            assert not verify_step(
                instance, run, retries, step_keys_to_execute=["do_something"])

            # Test trying to re-run a retry fails verify_step (case 2)
            with mock.patch("dagster.cli.api.get_step_stats_by_key"
                            ) as _step_stats_by_key:
                _step_stats_by_key.return_value = {
                    "do_something":
                    RunStepKeyStatsSnapshot(run_id=run.run_id,
                                            step_key="do_something",
                                            attempts=2)
                }

                retries = Retries.from_config({"enabled": {}})
                retries.mark_attempt("do_something")
                assert not verify_step(instance,
                                       run,
                                       retries,
                                       step_keys_to_execute=["do_something"])

            runner_execute_step(
                runner,
                [input_json],
            )

            # Check that verify fails for a step that has already run (case 1)
            retries = Retries.from_config({"enabled": {}})
            assert not verify_step(
                instance, run, retries, step_keys_to_execute=["do_something"])
Example #9
 def send_to_buffer(event):
     buffer.append(serialize_dagster_namedtuple(event))
Example #10
def mock_external_repository_data():
    external_repo_data = external_repository_data_from_def(noop_repo())
    return serialize_dagster_namedtuple(external_repo_data)
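
In tests, the mocked payload is typically rehydrated straight back into an ExternalRepositoryData; a minimal sketch:

from dagster.serdes import deserialize_json_to_dagster_namedtuple

external_repo_data = deserialize_json_to_dagster_namedtuple(
    mock_external_repository_data())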
Example #11
    def launch_run(self, instance, run, external_pipeline):
        check.inst_param(instance, "instance", DagsterInstance)
        check.inst_param(run, "run", PipelineRun)
        check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)

        job_name = get_job_name_from_run_id(run.run_id)
        pod_name = job_name
        exc_config = _get_validated_celery_k8s_executor_config(run.run_config)

        job_image = None
        pipeline_origin = None
        env_vars = None
        if isinstance(external_pipeline.get_origin(), PipelineGrpcServerOrigin):
            if exc_config.get("job_image"):
                raise DagsterInvariantViolationError(
                    "Cannot specify job_image in executor config when loading pipeline "
                    "from GRPC server."
                )

            repository_location_handle = (
                external_pipeline.repository_handle.repository_location_handle
            )

            if not isinstance(repository_location_handle, GrpcServerRepositoryLocationHandle):
                raise DagsterInvariantViolationError(
                    "Expected RepositoryLocationHandle to be of type "
                    "GrpcServerRepositoryLocationHandle but found type {}".format(
                        type(repository_location_handle)
                    )
                )

            job_image = repository_location_handle.get_current_image()
            env_vars = {"DAGSTER_CURRENT_IMAGE": job_image}

            repository_name = external_pipeline.repository_handle.repository_name
            pipeline_origin = PipelinePythonOrigin(
                pipeline_name=external_pipeline.name,
                repository_origin=repository_location_handle.get_repository_python_origin(
                    repository_name
                ),
            )

        else:
            job_image = exc_config.get("job_image")
            if not job_image:
                raise DagsterInvariantViolationError(
                    "Cannot find job_image in celery-k8s executor config."
                )
            pipeline_origin = external_pipeline.get_origin()

        job_config = DagsterK8sJobConfig(
            dagster_home=self.dagster_home,
            instance_config_map=self.instance_config_map,
            postgres_password_secret=self.postgres_password_secret,
            job_image=check.str_param(job_image, "job_image"),
            image_pull_policy=exc_config.get("image_pull_policy"),
            image_pull_secrets=exc_config.get("image_pull_secrets"),
            service_account_name=exc_config.get("service_account_name"),
            env_config_maps=exc_config.get("env_config_maps"),
            env_secrets=exc_config.get("env_secrets"),
        )

        user_defined_k8s_config = get_user_defined_k8s_config(frozentags(external_pipeline.tags))

        from dagster.cli.api import ExecuteRunArgs

        input_json = serialize_dagster_namedtuple(
            # depends on DagsterInstance.get() returning the same instance
            # https://github.com/dagster-io/dagster/issues/2757
            ExecuteRunArgs(
                pipeline_origin=pipeline_origin, pipeline_run_id=run.run_id, instance_ref=None,
            )
        )

        job = construct_dagster_k8s_job(
            job_config,
            command=["dagster"],
            args=["api", "execute_run_with_structured_logs", input_json],
            job_name=job_name,
            pod_name=pod_name,
            component="run_coordinator",
            user_defined_k8s_config=user_defined_k8s_config,
            env_vars=env_vars,
        )

        job_namespace = exc_config.get("job_namespace")

        api = kubernetes.client.BatchV1Api()
        api.create_namespaced_job(body=job, namespace=job_namespace)

        self._instance.report_engine_event(
            "Kubernetes run_coordinator job launched",
            run,
            EngineEventData(
                [
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Namespace"),
                    EventMetadataEntry.text(run.run_id, "Run ID"),
                ]
            ),
            cls=self.__class__,
        )
        return run
Example #12
def test_solid_definition_kitchen_sink():
    @solid(
        input_defs=[
            InputDefinition('arg_one', str, description='desc1'),
            InputDefinition('arg_two', int),
        ],
        output_defs=[
            OutputDefinition(name='output_one', dagster_type=str),
            OutputDefinition(
                name='output_two', dagster_type=int, description='desc2', is_required=False
            ),
        ],
        config={'foo': int},
        description='a description',
        tags={'a_tag': 'yup'},
        required_resource_keys={'b_resource', 'a_resource'},
    )
    def kitchen_sink_solid(_, arg_two, arg_one):  # out of order to test positional_inputs
        assert arg_one
        assert arg_two
        raise Exception('should not execute')

    kitchen_sink_solid_snap = build_core_solid_def_snap(kitchen_sink_solid)

    assert kitchen_sink_solid_snap
    assert kitchen_sink_solid_snap.name == 'kitchen_sink_solid'
    assert len(kitchen_sink_solid_snap.input_def_snaps) == 2
    assert [inp.name for inp in kitchen_sink_solid_snap.input_def_snaps] == ['arg_one', 'arg_two']
    assert [inp.dagster_type_key for inp in kitchen_sink_solid_snap.input_def_snaps] == [
        'String',
        'Int',
    ]

    assert kitchen_sink_solid_snap.get_input_snap('arg_one').description == 'desc1'

    assert [out.name for out in kitchen_sink_solid_snap.output_def_snaps] == [
        'output_one',
        'output_two',
    ]

    assert [out.dagster_type_key for out in kitchen_sink_solid_snap.output_def_snaps] == [
        'String',
        'Int',
    ]

    assert kitchen_sink_solid_snap.get_output_snap('output_two').description == 'desc2'
    assert kitchen_sink_solid_snap.get_output_snap('output_two').is_required is False

    assert (
        kitchen_sink_solid_snap.config_field_snap.type_key
        == kitchen_sink_solid.config_field.config_type.key
    )

    assert kitchen_sink_solid_snap.required_resource_keys == ['a_resource', 'b_resource']
    assert kitchen_sink_solid_snap.tags == {'a_tag': 'yup'}
    assert kitchen_sink_solid.positional_inputs == ['arg_two', 'arg_one']

    assert (
        deserialize_json_to_dagster_namedtuple(
            serialize_dagster_namedtuple(kitchen_sink_solid_snap)
        )
        == kitchen_sink_solid_snap
    )
Example #13
    def launch_run(self, instance, run, external_pipeline):
        check.inst_param(run, 'run', PipelineRun)
        check.inst_param(external_pipeline, 'external_pipeline',
                         ExternalPipeline)

        job_name = 'dagster-run-{}'.format(run.run_id)
        pod_name = job_name

        resources = get_k8s_resource_requirements(
            frozentags(external_pipeline.tags))

        pipeline_origin = None
        job_config = None
        if isinstance(external_pipeline.get_origin(),
                      PipelineGrpcServerOrigin):
            if self._job_image:
                raise DagsterInvariantViolationError(
                    'Cannot specify job_image in run launcher config when loading pipeline '
                    'from GRPC server.')

            repository_location_handle = (
                external_pipeline.repository_handle.repository_location_handle)

            if not isinstance(repository_location_handle,
                              GrpcServerRepositoryLocationHandle):
                raise DagsterInvariantViolationError(
                    'Expected RepositoryLocationHandle to be of type '
                    'GrpcServerRepositoryLocationHandle but found type {}'.
                    format(type(repository_location_handle)))

            job_image = repository_location_handle.get_current_image()

            job_config = self._get_grpc_job_config(job_image)

            repository_name = external_pipeline.repository_handle.repository_name
            pipeline_origin = PipelinePythonOrigin(
                pipeline_name=external_pipeline.name,
                repository_origin=repository_location_handle.
                get_repository_python_origin(repository_name),
            )
        else:
            pipeline_origin = external_pipeline.get_origin()
            job_config = self._get_static_job_config()

        input_json = serialize_dagster_namedtuple(
            ExecuteRunArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run.run_id,
                instance_ref=None,
            ))

        job = construct_dagster_k8s_job(
            job_config=job_config,
            command=['dagster'],
            args=['api', 'execute_run_with_structured_logs', input_json],
            job_name=job_name,
            pod_name=pod_name,
            component='run_coordinator',
            resources=resources,
        )

        self._batch_api.create_namespaced_job(body=job,
                                              namespace=self.job_namespace)
        self._instance.report_engine_event(
            'Kubernetes run_coordinator job launched',
            run,
            EngineEventData([
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(self.job_namespace,
                                        'Kubernetes Namespace'),
                EventMetadataEntry.text(run.run_id, 'Run ID'),
            ]),
            cls=K8sRunLauncher,
        )
        return run
Example #14
 def to_json(self) -> str:
     return serialize_dagster_namedtuple(cast(NamedTuple, self))
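
The reverse direction follows the same pattern; a sketch of a from_json counterpart (the method name and its presence on the class are assumptions for illustration):

 @staticmethod
 def from_json(json_str: str):
     # Inverse of to_json: rehydrate the namedtuple from its JSON string.
     return deserialize_json_to_dagster_namedtuple(json_str)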
Example #15
    def ExternalPartitionSetExecutionParams(self, request, _context):
        partition_set_execution_param_args = deserialize_json_to_dagster_namedtuple(
            request.serialized_partition_set_execution_param_args)

        check.inst_param(
            partition_set_execution_param_args,
            "partition_set_execution_param_args",
            PartitionSetExecutionParamArgs,
        )

        recon_repo = self._recon_repository_from_origin(
            partition_set_execution_param_args.repository_origin)
        definition = recon_repo.get_definition()
        partition_set_def = definition.get_partition_set_def(
            partition_set_execution_param_args.partition_set_name)

        try:
            with user_code_error_boundary(
                    PartitionExecutionError,
                    lambda: "Error occurred during the partition generation for partition set "
                    "{partition_set_name}".format(
                        partition_set_name=partition_set_def.name),
            ):
                all_partitions = partition_set_def.get_partitions()
            partitions = [
                partition for partition in all_partitions if partition.name in
                partition_set_execution_param_args.partition_names
            ]

            partition_data = []
            for partition in partitions:

                def _error_message_fn(partition_set_name, partition_name):
                    return lambda: (
                        "Error occurred during the partition config and tag generation for "
                        "partition set {partition_set_name}::{partition_name}".
                        format(partition_set_name=partition_set_name,
                               partition_name=partition_name))

                with user_code_error_boundary(
                        PartitionExecutionError,
                        _error_message_fn(partition_set_def.name,
                                          partition.name),
                ):
                    run_config = partition_set_def.run_config_for_partition(
                        partition)
                    tags = partition_set_def.tags_for_partition(partition)

                partition_data.append(
                    ExternalPartitionExecutionParamData(
                        name=partition.name,
                        tags=tags,
                        run_config=run_config,
                    ))

            return api_pb2.ExternalPartitionSetExecutionParamsReply(
                serialized_external_partition_set_execution_param_data_or_external_partition_execution_error
                =serialize_dagster_namedtuple(
                    ExternalPartitionSetExecutionParamData(
                        partition_data=partition_data)))

        except PartitionExecutionError:
            return api_pb2.ExternalPartitionSetExecutionParamsReply(
                serialized_external_partition_set_execution_param_data_or_external_partition_execution_error
                =serialize_dagster_namedtuple(
                    ExternalPartitionExecutionErrorData(
                        serializable_error_info_from_exc_info(sys.exc_info()))))
Example #16
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        environment_dict,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        resources=None,
        kubeconfig_file=None,
    ):
        '''Run step execution in a K8s job pod.
        '''
        from dagster_k8s import DagsterK8sJobConfig, construct_dagster_graphql_k8s_job
        from dagster_k8s.utils import get_pod_names_in_job, retrieve_pod_logs, wait_for_job_success

        import kubernetes

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
        )
        check.dict_param(environment_dict, 'environment_dict')
        check.str_param(mode, 'mode')
        check.str_param(repo_name, 'repo_name')
        check.str_param(repo_location_name, 'repo_location_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')
        check.bool_param(load_incluster_config, 'load_incluster_config')
        resources = check.opt_inst_param(
            resources, 'resources', kubernetes.client.V1ResourceRequirements
        )
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_keys_str = ", ".join(step_keys)

        # Ensure we stay below k8s name length limits
        k8s_name_key = _get_k8s_name_key(run_id, step_keys)
        job_name = 'dagster-stepjob-%s' % k8s_name_key
        pod_name = 'dagster-stepjob-%s' % k8s_name_key

        variables = {
            'executionParams': {
                'runConfigData': environment_dict,
                'mode': mode,
                'selector': {
                    'repositoryLocationName': repo_location_name,
                    'repositoryName': repo_name,
                    'pipelineName': pipeline_run.pipeline_name,
                },
                'executionMetadata': {'runId': run_id},
                'stepKeys': step_keys,
            }
        }

        args = ['-p', 'executePlan', '-v', seven.json.dumps(variables)]

        job = construct_dagster_graphql_k8s_job(job_config, args, job_name, resources, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            'Executing steps {} in Kubernetes job {}'.format(step_keys_str, job.metadata.name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, 'Step keys'),
                    EventMetadataEntry.text(job.metadata.name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                    EventMetadataEntry.text(
                        str(job_config.image_pull_secrets), 'Image pull secrets'
                    ),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name), 'Service account name'
                    ),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobEngine,
            # Validated above that step_keys has length 1; it is not possible to use ETH or an
            # execution plan in this function (Celery K8s workers should not have access to user code).
            step_key=step_keys[0],
        )
        events.append(engine_event)

        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

        wait_for_job_success(job.metadata.name, namespace=job_namespace)
        pod_names = get_pod_names_in_job(job.metadata.name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobEngine,
            step_key=step_keys[0],
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split('\n')

        res = parse_raw_log_lines(logs)

        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [serialize_dagster_namedtuple(event) for event in events]
        return serialized_events
Example #17
    def ExecuteRun(self, request, _context):
        try:
            execute_run_args = deserialize_json_to_dagster_namedtuple(
                request.serialized_execute_run_args)
            check.inst_param(execute_run_args, "execute_run_args",
                             ExecuteRunArgs)

            run_id = execute_run_args.pipeline_run_id

            recon_pipeline = self._recon_pipeline_from_origin(
                execute_run_args.pipeline_origin)

        except:  # pylint: disable=bare-except
            yield api_pb2.ExecuteRunEvent(
                serialized_dagster_event_or_ipc_error_message=
                serialize_dagster_namedtuple(
                    IPCErrorMessage(
                        serializable_error_info=
                        serializable_error_info_from_exc_info(sys.exc_info()),
                        message="Error during RPC setup for ExecuteRun",
                    )))
            return

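        # Run the pipeline in a subprocess, streaming its Dagster events back
        # to this handler through a multiprocessing queue.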
        event_queue = multiprocessing.Queue()
        termination_event = multiprocessing.Event()
        execution_process = multiprocessing.Process(
            target=execute_run_in_subprocess,
            args=[
                request.serialized_execute_run_args,
                recon_pipeline,
                event_queue,
                termination_event,
            ],
        )
        with self._execution_lock:
            execution_process.start()
            self._executions[run_id] = (
                execution_process,
                execute_run_args.instance_ref,
            )
            self._termination_events[run_id] = termination_event

        done = False
        while not done:
            try:
                # We use `get_nowait()` instead of `get()` so that we can handle the case where the
                # execution process has died unexpectedly -- `get()` would hang forever in that case
                dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
            except queue.Empty:
                if not execution_process.is_alive():
                    # subprocess died unexpectedly
                    yield api_pb2.ExecuteRunEvent(
                        serialized_dagster_event_or_ipc_error_message=
                        serialize_dagster_namedtuple(
                            IPCErrorMessage(
                                serializable_error_info=
                                serializable_error_info_from_exc_info(
                                    sys.exc_info()),
                                message=
                                ("GRPC server: Subprocess for {run_id} terminated unexpectedly"
                                 ).format(run_id=run_id),
                            )))
                    done = True
                time.sleep(EVENT_QUEUE_POLL_INTERVAL)
            else:
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              RunInSubprocessComplete):
                    done = True
                elif isinstance(dagster_event_or_ipc_error_message_or_done,
                                StartRunInSubprocessSuccessful):
                    continue
                else:
                    yield api_pb2.ExecuteRunEvent(
                        serialized_dagster_event_or_ipc_error_message=
                        serialize_dagster_namedtuple(
                            dagster_event_or_ipc_error_message_or_done))

        with self._execution_lock:
            if run_id in self._executions:
                del self._executions[run_id]
            if run_id in self._termination_events:
                del self._termination_events[run_id]
Example #18
    def StartRun(self, request, _context):
        if self._shutdown_once_executions_finish_event.is_set():
            return api_pb2.StartRunReply(
                serialized_start_run_result=serialize_dagster_namedtuple(
                    StartRunResult(
                        success=False,
                        message=
                        "Tried to start a run on a server after telling it to shut down",
                        serializable_error_info=None,
                    )))

        try:
            execute_run_args = check.inst(
                deserialize_json_to_dagster_namedtuple(
                    request.serialized_execute_run_args),
                ExecuteExternalPipelineArgs,
            )
            run_id = execute_run_args.pipeline_run_id
            recon_pipeline = self._recon_pipeline_from_origin(
                execute_run_args.pipeline_origin)

        except:
            return api_pb2.StartRunReply(
                serialized_start_run_result=serialize_dagster_namedtuple(
                    StartRunResult(
                        success=False,
                        message=None,
                        serializable_error_info=
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    )))

        event_queue = self._mp_ctx.Queue()
        termination_event = self._mp_ctx.Event()
        execution_process = self._mp_ctx.Process(
            target=start_run_in_subprocess,
            args=[
                request.serialized_execute_run_args,
                recon_pipeline,
                event_queue,
                termination_event,
            ],
        )

        with self._execution_lock:
            execution_process.start()
            self._executions[run_id] = (
                execution_process,
                execute_run_args.instance_ref,
            )
            self._termination_events[run_id] = termination_event

        success = None
        message = None
        serializable_error_info = None

        while success is None:
            sleep(EVENT_QUEUE_POLL_INTERVAL)
            # We use `get_nowait()` instead of `get()` so that we can handle the case where the
            # execution process has died unexpectedly -- `get()` would hang forever in that case
            try:
                dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
            except queue.Empty:
                if not execution_process.is_alive():
                    # subprocess died unexpectedly
                    success = False
                    message = (
                        "GRPC server: Subprocess for {run_id} terminated unexpectedly with "
                        "exit code {exit_code}".format(
                            run_id=run_id,
                            exit_code=execution_process.exitcode,
                        ))
                    serializable_error_info = serializable_error_info_from_exc_info(
                        sys.exc_info())
            else:
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              StartRunInSubprocessSuccessful):
                    success = True
                elif isinstance(dagster_event_or_ipc_error_message_or_done,
                                RunInSubprocessComplete):
                    continue
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              IPCErrorMessage):
                    success = False
                    message = dagster_event_or_ipc_error_message_or_done.message
                    serializable_error_info = (
                        dagster_event_or_ipc_error_message_or_done.
                        serializable_error_info)

        # Ensure that if the run failed, we remove it from the executions map before
        # returning so that CanCancel will never return True
        if not success:
            with self._execution_lock:
                self._clear_run(run_id)

        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=success,
                    message=message,
                    serializable_error_info=serializable_error_info,
                )))
Example #19
    def execute_run(self, execute_run_args):
        check.inst_param(execute_run_args, "execute_run_args",
                         ExecuteExternalPipelineArgs)

        with DagsterInstance.from_ref(
                execute_run_args.instance_ref) as instance:
            try:
                pipeline_run = instance.get_run_by_id(
                    execute_run_args.pipeline_run_id)
                event_iterator = self._streaming_query(
                    "ExecuteRun",
                    api_pb2.ExecuteRunRequest,
                    serialized_execute_run_args=serialize_dagster_namedtuple(
                        execute_run_args),
                )
            except Exception as exc:  # pylint: disable=broad-except
                yield instance.report_engine_event(
                    message="Unexpected error in IPC client",
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())),
                )
                raise exc

            try:
                for event in event_iterator:
                    yield deserialize_json_to_dagster_namedtuple(
                        event.serialized_dagster_event_or_ipc_error_message)
            except KeyboardInterrupt:
                self.cancel_execution(
                    CancelExecutionRequest(
                        run_id=execute_run_args.pipeline_run_id))
                raise
            except grpc.RpcError as rpc_error:
                if (
                        # posix
                        "Socket closed" in rpc_error.debug_error_string()  # pylint: disable=no-member
                        # windows
                        or "Stream removed" in rpc_error.debug_error_string()  # pylint: disable=no-member
                ):
                    yield instance.report_engine_event(
                        message=
                        "User process: GRPC server for {run_id} terminated unexpectedly"
                        .format(run_id=pipeline_run.run_id),
                        pipeline_run=pipeline_run,
                        engine_event_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())),
                    )
                    yield instance.report_run_failed(pipeline_run)
                else:
                    yield instance.report_engine_event(
                        message="Unexpected error in IPC client",
                        pipeline_run=pipeline_run,
                        engine_event_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())),
                    )
                raise rpc_error
            except Exception as exc:  # pylint: disable=broad-except
                yield instance.report_engine_event(
                    message="Unexpected error in IPC client",
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())),
                )
                raise exc
Example #20
    def launch_run(self, instance, run, external_pipeline):
        check.inst_param(run, "run", PipelineRun)
        check.inst_param(external_pipeline, "external_pipeline",
                         ExternalPipeline)

        job_name = "dagster-run-{}".format(run.run_id)
        pod_name = job_name

        user_defined_k8s_config = get_user_defined_k8s_config(
            frozentags(run.tags))

        pipeline_origin = None
        job_config = None
        if isinstance(
                external_pipeline.get_external_origin().
                external_repository_origin.repository_location_origin,
                GrpcServerRepositoryLocationOrigin,
        ):
            if self._job_image:
                raise DagsterInvariantViolationError(
                    "Cannot specify job_image in run launcher config when loading pipeline "
                    "from GRPC server.")

            repository_location_handle = (
                external_pipeline.repository_handle.repository_location_handle)

            if not isinstance(repository_location_handle,
                              GrpcServerRepositoryLocationHandle):
                raise DagsterInvariantViolationError(
                    "Expected RepositoryLocationHandle to be of type "
                    "GrpcServerRepositoryLocationHandle but found type {}".
                    format(type(repository_location_handle)))

            repository_name = external_pipeline.repository_handle.repository_name

            repository_origin = repository_location_handle.reload_repository_python_origin(
                repository_name)

            job_image = repository_origin.container_image

            pipeline_origin = PipelinePythonOrigin(
                pipeline_name=external_pipeline.name,
                repository_origin=repository_origin)

            job_config = self._get_grpc_job_config(job_image)
        else:
            pipeline_origin = external_pipeline.get_python_origin()
            job_config = self._get_static_job_config()

        input_json = serialize_dagster_namedtuple(
            ExecuteRunArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run.run_id,
                instance_ref=None,
            ))

        job = construct_dagster_k8s_job(
            job_config=job_config,
            args=["dagster", "api", "execute_run", input_json],
            job_name=job_name,
            pod_name=pod_name,
            component="run_coordinator",
            user_defined_k8s_config=user_defined_k8s_config,
        )

        self._batch_api.create_namespaced_job(body=job,
                                              namespace=self.job_namespace)
        self._instance.report_engine_event(
            "Kubernetes run_coordinator job launched",
            run,
            EngineEventData([
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(self.job_namespace,
                                        "Kubernetes Namespace"),
                EventMetadataEntry.text(run.run_id, "Run ID"),
            ]),
            cls=self.__class__,
        )
        return run
Example #21
    def launch_run(self, run, external_pipeline):
        check.inst_param(run, "run", PipelineRun)
        check.inst_param(external_pipeline, "external_pipeline",
                         ExternalPipeline)

        docker_image = external_pipeline.get_python_origin().repository_origin.container_image

        if not docker_image:
            docker_image = self._image

        if not docker_image:
            raise Exception(
                "No docker image specified by the instance config or repository"
            )

        try:
            # validate that the docker image name is valid
            reference.Reference.parse(docker_image)
        except Exception as e:
            raise Exception(
                "Docker image name {docker_image} is not correctly formatted".
                format(docker_image=docker_image)) from e

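        # Serialize the run args so they can be passed to the container as a
        # single CLI argument.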
        input_json = serialize_dagster_namedtuple(
            ExecuteRunArgs(
                pipeline_origin=external_pipeline.get_python_origin(),
                pipeline_run_id=run.run_id,
                instance_ref=self._instance.get_ref(),
            ))

        command = "dagster api execute_run_with_structured_logs {}".format(
            json.dumps(input_json))

        docker_env = (
            {env_name: os.getenv(env_name)
             for env_name in self._env_vars} if self._env_vars else {})

        client = self._get_client()

        try:
            container = client.containers.create(
                image=docker_image,
                command=command,
                detach=True,
                environment=docker_env,
                network=self._network,
            )

        except docker.errors.ImageNotFound:
            client.images.pull(docker_image)
            container = client.containers.create(
                image=docker_image,
                command=command,
                detach=True,
                environment=docker_env,
                network=self._network,
            )

        self._instance.report_engine_event(
            message=
            "Launching run in a new container {container_id} with image {docker_image}"
            .format(
                container_id=container.id,
                docker_image=docker_image,
            ),
            pipeline_run=run,
            cls=self.__class__,
        )

        self._instance.add_run_tags(
            run.run_id,
            {
                DOCKER_CONTAINER_ID_TAG: container.id,
                DOCKER_IMAGE_TAG: docker_image
            },
        )

        container.start()

        return run
Example #22
def repository_snapshot_command(**kwargs):
    recon_repo = recon_repo_for_cli_args(kwargs)
    definition = recon_repo.get_definition()

    active_data = external_repository_data_from_def(definition)
    click.echo(serialize_dagster_namedtuple(active_data))
Example #23
 def to_json(self):
     return serialize_dagster_namedtuple(self)