Example #1
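    # Terminates the Kubernetes job backing the run (via delete_job) and records the outcome
    # on the run as engine events.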
    def terminate(self, run_id):
        check.str_param(run_id, "run_id")
        run = self._instance.get_run_by_id(run_id)

        if not run:
            return False

        can_terminate = self.can_terminate(run_id)
        if not can_terminate:
            self._instance.report_engine_event(
                message="Unable to terminate run; can_terminate returned {}".format(can_terminate),
                pipeline_run=run,
                cls=self.__class__,
            )
            return False

        self._instance.report_run_canceling(run)

        job_name = get_job_name_from_run_id(
            run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)
        )

        try:
            termination_result = delete_job(job_name=job_name, namespace=self.job_namespace)
            if termination_result:
                self._instance.report_engine_event(
                    message="Run was terminated successfully.",
                    pipeline_run=run,
                    cls=self.__class__,
                )
            else:
                self._instance.report_engine_event(
                    message="Run was not terminated successfully; delete_job returned {}".format(
                        termination_result
                    ),
                    pipeline_run=run,
                    cls=self.__class__,
                )
            return termination_result
        except Exception:
            self._instance.report_engine_event(
                message="Run was not terminated successfully; encountered error in delete_job",
                pipeline_run=run,
                engine_event_data=EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())
                ),
                cls=self.__class__,
            )
Example #2
def _execute_schedule(graphene_info, external_pipeline, execution_params, errors):
    check.inst_param(external_pipeline, 'external_pipeline', ExternalPipeline)

    instance = graphene_info.context.instance

    mode, environment_dict = execution_params.mode, execution_params.environment_dict

    validation_result = validate_config_from_snap(
        external_pipeline.config_schema_snapshot,
        external_pipeline.root_config_key_for_mode(mode),
        environment_dict,
    )
    external_execution_plan = None
    if validation_result.success:
        external_execution_plan = graphene_info.context.get_external_execution_plan(
            external_pipeline, environment_dict, mode, execution_params.step_keys
        )

    pipeline_run = instance.create_run(
        pipeline_name=external_pipeline.name,
        environment_dict=environment_dict,
        mode=mode,
        solid_subset=(execution_params.selector.solid_subset
                      if execution_params.selector is not None else None),
        tags=execution_params.execution_metadata.tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=(external_execution_plan.execution_plan_snapshot
                                 if external_execution_plan else None),
    )

    # Inject errors into event log at this point
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(error.message, pipeline_run,
                                         EngineEventData.engine_error(error))

    # Launch run if run launcher is defined
    run_launcher = graphene_info.context.instance.run_launcher
    if run_launcher:
        result = _launch_pipeline_execution_for_created_run(
            graphene_info, pipeline_run.run_id)
    else:
        result = _start_pipeline_execution_for_created_run(
            graphene_info, pipeline_run.run_id)

    return pipeline_run, result
Example #3
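# In-process variant: builds the execution plan from a PipelineDefinition and snapshots it
# before creating the run, instead of going through an external pipeline.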
def _execute_schedule(graphene_info, pipeline_def, execution_params, errors):

    instance = graphene_info.context.instance

    execution_plan = None
    if is_config_valid(pipeline_def, execution_params.environment_dict, execution_params.mode):
        execution_plan = create_execution_plan(
            pipeline_def, execution_params.environment_dict, mode=execution_params.mode,
        )

    execution_plan_snapshot = None
    if execution_plan:
        execution_plan_snapshot = snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id()
        )

    pipeline_run = instance.create_run(
        pipeline_name=pipeline_def.name,
        environment_dict=execution_params.environment_dict,
        mode=execution_params.mode,
        solid_subset=(
            execution_params.selector.solid_subset
            if execution_params.selector is not None
            else None
        ),
        tags=execution_params.execution_metadata.tags,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=execution_plan_snapshot,
    )

    # Inject errors into event log at this point
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message, pipeline_run, EngineEventData.engine_error(error)
            )

    # Launch run if run launcher is defined
    run_launcher = graphene_info.context.instance.run_launcher
    if run_launcher:
        result = _launch_pipeline_execution_for_created_run(graphene_info, pipeline_run.run_id)
    else:
        result = _start_pipeline_execution_for_created_run(graphene_info, pipeline_run.run_id)

    return pipeline_run, result
Example #4
def _execute_run_command_body(recon_pipeline, pipeline_run_id, instance,
                              write_stream_fn):

    # we need to send back the fact that we have loaded the args so the calling
    # process knows it is safe to clean up the temp input file
    write_stream_fn(ExecuteRunArgsLoadComplete())

    pipeline_run = instance.get_run_by_id(pipeline_run_id)

    pid = os.getpid()
    instance.report_engine_event(
        "Started process for pipeline (pid: {pid}).".format(pid=pid),
        pipeline_run,
        EngineEventData.in_process(pid, marker_end="cli_api_subprocess_init"),
    )

    # Perform setup so that termination of the execution will unwind and report to the
    # instance correctly
    setup_windows_interrupt_support()

    try:
        for event in execute_run_iterator(recon_pipeline, pipeline_run,
                                          instance):
            write_stream_fn(event)
    except (KeyboardInterrupt, DagsterExecutionInterruptedError):
        instance.report_engine_event(
            message="Pipeline execution terminated by interrupt",
            pipeline_run=pipeline_run,
        )
        _report_run_failed_if_not_finished(instance, pipeline_run_id)
    except Exception:  # pylint: disable=broad-except
        instance.report_engine_event(
            "An exception was thrown during execution that is likely a framework error, "
            "rather than an error in user code.",
            pipeline_run,
            EngineEventData.engine_error(
                serializable_error_info_from_exc_info(sys.exc_info())),
        )
        _report_run_failed_if_not_finished(instance, pipeline_run_id)
    finally:
        instance.report_engine_event(
            "Process for pipeline exited (pid: {pid}).".format(pid=pid),
            pipeline_run,
        )
Example #5
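# Streams run events from execute_run_iterator; interrupts and framework errors are converted
# into engine events and a run-failure report.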
def core_execute_run(recon_pipeline, pipeline_run, instance):
    check.inst_param(recon_pipeline, "recon_pipeline", ReconstructablePipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    try:
        yield from execute_run_iterator(recon_pipeline, pipeline_run, instance)
    except (KeyboardInterrupt, DagsterExecutionInterruptedError):
        yield from _report_run_failed_if_not_finished(instance, pipeline_run.run_id)
        yield instance.report_engine_event(
            message="Pipeline execution terminated by interrupt", pipeline_run=pipeline_run,
        )
    except Exception:  # pylint: disable=broad-except
        yield instance.report_engine_event(
            "An exception was thrown during execution that is likely a framework error, "
            "rather than an error in user code.",
            pipeline_run,
            EngineEventData.engine_error(serializable_error_info_from_exc_info(sys.exc_info())),
        )
        yield from _report_run_failed_if_not_finished(instance, pipeline_run.run_id)
Example #6
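    # Client-side StartRun call; any unexpected client error is recorded on the run as an
    # engine event before being re-raised.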
    def start_run(self, execute_run_args):
        check.inst_param(execute_run_args, "execute_run_args", ExecuteExternalPipelineArgs)

        with DagsterInstance.from_ref(execute_run_args.instance_ref) as instance:
            try:
                res = self._query(
                    "StartRun",
                    api_pb2.StartRunRequest,
                    serialized_execute_run_args=serialize_dagster_namedtuple(execute_run_args),
                )
                return deserialize_json_to_dagster_namedtuple(res.serialized_start_run_result)

            except Exception:  # pylint: disable=broad-except
                pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)
                instance.report_engine_event(
                    message="Unexpected error in IPC client",
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    ),
                )
                raise
Example #7
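# Creates a run for a sensor-triggered RunRequest; if building the execution plan failed, the
# run is created directly in a FAILURE state and the errors are written to its event log.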
def _create_sensor_run(context, instance, repo_location, external_sensor,
                       external_pipeline, run_request):
    execution_plan_errors = []
    execution_plan_snapshot = None
    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_request.run_config,
            external_sensor.mode,
            step_keys_to_execute=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        execution_plan_errors.extend(e.subprocess_error_infos)
    except Exception as e:  # pylint: disable=broad-except
        execution_plan_errors.append(
            serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(
        merge_dicts(pipeline_tags, run_request.tags),
        PipelineRun.tags_for_sensor(external_sensor),
    )
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key

    run = instance.create_run(
        pipeline_name=external_sensor.pipeline_name,
        run_id=None,
        run_config=run_request.run_config,
        mode=external_sensor.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_sensor.solid_selection,
        status=(PipelineRunStatus.FAILURE if len(execution_plan_errors) > 0
                else PipelineRunStatus.NOT_STARTED),
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

    if len(execution_plan_errors) > 0:
        for error in execution_plan_errors:
            instance.report_engine_event(
                error.message,
                run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(run)
        context.logger.error(
            "Failed to fetch execution plan for {sensor_name}: {error_string}".format(
                sensor_name=external_sensor.name,
                error_string="\n".join([error.to_string() for error in execution_plan_errors]),
            )
        )

    return run
Example #8
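# Schedule-driven launch: creates the run with whatever plan snapshot could be built, injects
# any plan errors as engine events, then submits the run.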
def _launch_run(instance, repo_location, external_schedule, external_pipeline,
                tick_context, run_request):
    run_config = run_request.run_config
    schedule_tags = run_request.tags

    execution_plan_snapshot = None
    errors = []
    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        errors.extend(e.subprocess_error_infos)
    except Exception as e:  # pylint: disable=broad-except
        errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

    tick_context.add_run(run_id=possibly_invalid_pipeline_run.run_id,
                         run_key=run_request.run_key)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        tick_context.stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=errors))
        return

    try:
        launched_run = instance.submit_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline)
    except Exception:  # pylint: disable=broad-except
        tick_context.stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id,
                errors=[serializable_error_info_from_exc_info(sys.exc_info())],
            ))
        return

    tick_context.stream.send(
        ScheduledExecutionSuccess(run_id=launched_run.run_id))
Example #9
def _launch_scheduled_execution(instance, repo_location, external_repo,
                                external_schedule, tick, stream):
    pipeline_selector = PipelineSelector(
        location_name=repo_location.name,
        repository_name=external_repo.name,
        pipeline_name=external_schedule.pipeline_name,
        solid_selection=external_schedule.solid_selection,
    )

    subset_pipeline_result = repo_location.get_subset_external_pipeline_result(
        pipeline_selector)
    external_pipeline = ExternalPipeline(
        subset_pipeline_result.external_pipeline_data,
        external_repo.handle,
    )

    schedule_execution_data = repo_location.get_external_schedule_execution_data(
        instance=instance,
        repository_handle=external_repo.handle,
        schedule_name=external_schedule.name,
        schedule_execution_data_mode=ScheduleExecutionDataMode.LAUNCH_SCHEDULED_EXECUTION,
        scheduled_execution_time=None,  # No way to know this in general for this scheduler
    )

    run_config = {}
    schedule_tags = {}
    execution_plan_snapshot = None
    errors = []

    if isinstance(schedule_execution_data, ExternalScheduleExecutionErrorData):
        error = schedule_execution_data.error
        tick.update_with_status(ScheduleTickStatus.FAILURE, error=error)
        stream.send(ScheduledExecutionFailed(run_id=None, errors=[error]))
        return
    elif not schedule_execution_data.should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return
    else:
        run_config = schedule_execution_data.run_config
        schedule_tags = schedule_execution_data.tags
        try:
            external_execution_plan = repo_location.get_external_execution_plan(
                external_pipeline,
                run_config,
                external_schedule.mode,
                step_keys_to_execute=None,
            )
            execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
        except DagsterSubprocessError as e:
            errors.extend(e.subprocess_error_infos)
        except Exception as e:  # pylint: disable=broad-except
            errors.append(serializable_error_info_from_exc_info(
                sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
    )

    tick.update_with_status(ScheduleTickStatus.SUCCESS,
                            run_id=possibly_invalid_pipeline_run.run_id)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=errors))
        return

    try:
        launched_run = instance.launch_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline)
    except Exception:  # pylint: disable=broad-except
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id,
                errors=[serializable_error_info_from_exc_info(sys.exc_info())]))
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
Example #10
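# Starts a previously created run with the execution manager; invalid config is reported as an
# engine error and the run is marked failed.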
def _start_pipeline_execution_for_created_run(graphene_info, run_id):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)

    instance = graphene_info.context.instance
    execution_manager_settings = instance.dagit_settings.get(
        'execution_manager')
    if execution_manager_settings and execution_manager_settings.get(
            'disabled'):
        return graphene_info.schema.type_named(
            'StartPipelineRunDisabledError')()

    pipeline_run = instance.get_run_by_id(run_id)
    if not pipeline_run:
        return graphene_info.schema.type_named('PipelineRunNotFoundError')(
            run_id)

    pipeline_def = get_pipeline_def_from_selector(graphene_info,
                                                  pipeline_run.selector)

    environment_schema = create_environment_schema(pipeline_def,
                                                   pipeline_run.mode)
    validated_config = validate_config(environment_schema.environment_type,
                                       pipeline_run.environment_dict)
    if not validated_config.success:
        # If the config is invalid, we construct a DagsterInvalidConfigError exception and
        # insert it into the event log. We also return a PipelineConfigValidationInvalid user facing
        # graphql error.

        # We currently re-use the engine events machinery to add the error to the event log, but
        # may need to create a new event type and instance method to handle these errors.
        invalid_config_exception = DagsterInvalidConfigError(
            'Error in config for pipeline {}'.format(pipeline_def.name),
            validated_config.errors,
            pipeline_run.environment_dict,
        )

        instance.report_engine_event(
            str(invalid_config_exception.message),
            pipeline_run,
            EngineEventData.engine_error(
                SerializableErrorInfo(
                    invalid_config_exception.message,
                    [],
                    DagsterInvalidConfigError.__name__,
                    None,
                )),
        )

        instance.report_run_failed(pipeline_run)

        return DauphinPipelineConfigValidationInvalid.for_validation_errors(
            pipeline_def.get_pipeline_index(), validated_config.errors)

    graphene_info.context.execution_manager.execute_pipeline(
        graphene_info.context.get_handle(),
        pipeline_def,
        pipeline_run,
        instance=instance,
    )

    return graphene_info.schema.type_named('StartPipelineRunSuccess')(
        run=graphene_info.schema.type_named('PipelineRun')(pipeline_run))
Example #11
def get_run_execution_info_for_created_run_or_error(
    graphene_info, repository_location_name, repository_name, run_id
):
    """
    A previously created run could have been created either in a different process *or*
    during the launchScheduledRun call, where we want to have a record of
    a run that was created but has invalid configuration
    """
    check.inst_param(graphene_info, "graphene_info", ResolveInfo)
    check.str_param(repository_location_name, "repository_location_name")
    check.str_param(repository_name, "repository_name")
    check.str_param(run_id, "run_id")

    instance = graphene_info.context.instance

    pipeline_run = instance.get_run_by_id(run_id)
    if not pipeline_run:
        return graphene_info.schema.type_named("PipelineRunNotFoundError")(run_id)

    external_pipeline = get_external_pipeline_or_raise(
        graphene_info,
        _get_selector_with_workaround(
            graphene_info.context, repository_location_name, repository_name, pipeline_run
        ),
    )

    validated_config = validate_config_from_snap(
        external_pipeline.config_schema_snapshot,
        external_pipeline.root_config_key_for_mode(pipeline_run.mode),
        pipeline_run.run_config,
    )

    if not validated_config.success:
        # If the config is invalid, we construct a DagsterInvalidConfigError exception and
        # insert it into the event log. We also return a PipelineConfigValidationInvalid user facing
        # graphql error.

        # We currently re-use the engine events machinery to add the error to the event log, but
        # may need to create a new event type and instance method to handle these errors.
        invalid_config_exception = DagsterInvalidConfigError(
            "Error in config for pipeline {}".format(external_pipeline.name),
            validated_config.errors,
            pipeline_run.run_config,
        )

        instance.report_engine_event(
            str(invalid_config_exception.message),
            pipeline_run,
            EngineEventData.engine_error(
                SerializableErrorInfo(
                    invalid_config_exception.message,
                    [],
                    DagsterInvalidConfigError.__name__,
                    None,
                )
            ),
        )

        instance.report_run_failed(pipeline_run)

        return DauphinPipelineConfigValidationInvalid.for_validation_errors(
            external_pipeline, validated_config.errors
        )

    return RunExecutionInfo(external_pipeline, pipeline_run)
Example #12
def _launch_pipeline_execution_for_created_run(graphene_info, run_id):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.str_param(run_id, 'run_id')

    # First retrieve the pipeline run
    instance = graphene_info.context.instance
    pipeline_run = instance.get_run_by_id(run_id)
    if not pipeline_run:
        return graphene_info.schema.type_named('PipelineRunNotFoundError')(
            run_id)

    external_pipeline = get_external_pipeline_or_raise(
        graphene_info, pipeline_run.selector.name,
        pipeline_run.selector.solid_subset)

    # Run config validation
    # If there are any config errors, then inject them into the event log
    validated_config = ensure_valid_config(external_pipeline,
                                           pipeline_run.mode,
                                           pipeline_run.environment_dict)

    if not validated_config.success:
        # If the config is invalid, we construct a DagsterInvalidConfigError exception and
        # insert it into the event log. We also return a PipelineConfigValidationInvalid user facing
        # graphql error.

        # We currently re-use the engine events machinery to add the error to the event log, but
        # may need to create a new event type and instance method to handle these errors.
        invalid_config_exception = DagsterInvalidConfigError(
            'Error in config for pipeline {}'.format(external_pipeline.name),
            validated_config.errors,
            pipeline_run.environment_dict,
        )

        instance.report_engine_event(
            str(invalid_config_exception.message),
            pipeline_run,
            EngineEventData.engine_error(
                SerializableErrorInfo(
                    invalid_config_exception.message,
                    [],
                    DagsterInvalidConfigError.__name__,
                    None,
                )),
        )

        instance.report_run_failed(pipeline_run)

        return DauphinPipelineConfigValidationInvalid.for_validation_errors(
            external_pipeline, validated_config.errors)

    try:
        pipeline_run = instance.launch_run(pipeline_run.run_id)
    except DagsterLaunchFailedError:
        error = serializable_error_info_from_exc_info(sys.exc_info())
        instance.report_engine_event(
            error.message,
            pipeline_run,
            EngineEventData.engine_error(error),
        )
        instance.report_run_failed(pipeline_run)

    return graphene_info.schema.type_named('LaunchPipelineRunSuccess')(
        run=graphene_info.schema.type_named('PipelineRun')(pipeline_run))
Example #13
def _create_scheduler_run(
    instance,
    logger,
    schedule_time_utc,
    repo_location,
    external_repo,
    external_schedule,
    external_pipeline,
    tick_holder,
):
    schedule_execution_data = repo_location.get_external_schedule_execution_data(
        instance=instance,
        repository_handle=external_repo.handle,
        schedule_name=external_schedule.name,
        schedule_execution_data_mode=ScheduleExecutionDataMode.LAUNCH_SCHEDULED_EXECUTION,
        scheduled_execution_datetime_utc=schedule_time_utc,
    )

    if isinstance(schedule_execution_data, ExternalScheduleExecutionErrorData):
        error = schedule_execution_data.error
        logger.error(
            "Failed to fetch schedule data for {schedule_name}: {error}".
            format(schedule_name=external_schedule.name,
                   error=error.to_string()), )
        tick_holder.update_with_status(ScheduleTickStatus.FAILURE, error=error)
        return None
    elif not schedule_execution_data.should_execute:
        logger.info(
            "should_execute returned False for {schedule_name}, skipping".
            format(schedule_name=external_schedule.name))
        # Update tick to skipped state and return
        tick_holder.update_with_status(ScheduleTickStatus.SKIPPED)
        return None

    run_config = schedule_execution_data.run_config
    schedule_tags = schedule_execution_data.tags

    execution_plan_errors = []
    execution_plan_snapshot = None

    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        execution_plan_errors.extend(e.subprocess_error_infos)
    except Exception as e:  # pylint: disable=broad-except
        execution_plan_errors.append(
            serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    tags[SCHEDULED_EXECUTION_TIME_TAG] = schedule_time_utc.isoformat()

    # If the run was scheduled correctly but there was an error creating its
    # run config, enter it into the run DB with a FAILURE status
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=(PipelineRunStatus.FAILURE if len(execution_plan_errors) > 0
                else PipelineRunStatus.NOT_STARTED),
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
    )

    if len(execution_plan_errors) > 0:
        for error in execution_plan_errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        logger.error(
            "Failed to fetch execution plan for {schedule_name}: {error_string}"
            .format(
                schedule_name=external_schedule.name,
                error_string="\n".join(
                    [error.to_string() for error in execution_plan_errors]),
            ), )
    return possibly_invalid_pipeline_run
Example #14
    def in_mp_process(cls, handle, pipeline_run, instance_ref, term_event):
        """
        Execute pipeline using message queue as a transport
        """
        run_id = pipeline_run.run_id
        pipeline_name = pipeline_run.pipeline_name

        instance = DagsterInstance.from_ref(instance_ref)
        pid = os.getpid()
        instance.report_engine_event(
            'Started process for pipeline (pid: {pid}).'.format(pid=pid),
            pipeline_run,
            EngineEventData.in_process(pid,
                                       marker_end='dagit_subprocess_init'),
            cls,
        )

        start_termination_thread(term_event)

        try:
            handle.build_repository_definition()
            pipeline_def = handle.with_pipeline_name(
                pipeline_name).build_pipeline_definition()
        except Exception:  # pylint: disable=broad-except
            instance.report_engine_event(
                'Failed attempting to load pipeline "{}"'.format(
                    pipeline_name),
                pipeline_run,
                EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())),
                cls,
            )
            return

        try:
            event_list = []
            for event in execute_run_iterator(
                    pipeline_def.build_sub_pipeline(
                        pipeline_run.selector.solid_subset),
                    pipeline_run,
                    instance,
            ):
                event_list.append(event)
            return PipelineExecutionResult(pipeline_def, run_id, event_list,
                                           lambda: None)

        # Add a DagsterEvent for unexpected exceptions
        # Explicitly ignore KeyboardInterrupts since they are used for termination
        except DagsterSubprocessError as err:
            if not all([
                    err_info.cls_name == 'KeyboardInterrupt'
                    for err_info in err.subprocess_error_infos
            ]):
                instance.report_engine_event(
                    'An exception was thrown during execution that is likely a framework error, '
                    'rather than an error in user code.',
                    pipeline_run,
                    EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())),
                    cls,
                )
        except Exception:  # pylint: disable=broad-except
            instance.report_engine_event(
                'An exception was thrown during execution that is likely a framework error, '
                'rather than an error in user code.',
                pipeline_run,
                EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())),
                cls,
            )
        finally:
            instance.report_engine_event(
                'Process for pipeline exited (pid: {pid}).'.format(pid=pid),
                pipeline_run,
                cls=cls,
            )
Example #15
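    # Airflow operator body: registers a managed run when an instance is available and reports
    # GraphQL client errors back to it as engine events.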
    def execute(self, context):
        try:
            from dagster_graphql.client.mutations import (
                DagsterGraphQLClientError,
                handle_execution_errors,
                handle_execute_plan_result_raw,
            )

        except ImportError:
            raise AirflowException(
                'To use the DagsterDockerOperator, dagster and dagster_graphql must be installed '
                'in your Airflow environment.')

        if 'run_id' in self.params:
            self._run_id = self.params['run_id']
        elif 'dag_run' in context and context['dag_run'] is not None:
            self._run_id = context['dag_run'].run_id

        try:
            if self.instance:
                run = self.instance.register_managed_run(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    environment_dict=self.environment_dict,
                    mode=self.mode,
                    solids_to_execute=None,
                    step_keys_to_execute=None,
                    tags=None,
                    root_run_id=None,
                    parent_run_id=None,
                    pipeline_snapshot=self.pipeline_snapshot,
                    execution_plan_snapshot=self.execution_plan_snapshot,
                    parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                )

            raw_res = super(DagsterDockerOperator, self).execute(context)
            self.log.info('Finished executing container.')

            res = parse_raw_log_lines(raw_res)

            try:
                handle_execution_errors(res, 'executePlan')
            except DagsterGraphQLClientError as err:
                if self.instance:
                    self.instance.report_engine_event(
                        str(err),
                        run,
                        EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())),
                        self.__class__,
                    )
                raise

            events = handle_execute_plan_result_raw(res)

            if self.instance:
                for event in events:
                    self.instance.handle_new_event(event)

            events = [e.dagster_event for e in events]
            check_events_for_failures(events)
            check_events_for_skips(events)

            return events

        finally:
            self._run_id = None
Example #16
    def execute(self, context):
        try:
            from dagster_graphql.client.mutations import (
                DagsterGraphQLClientError,
                handle_execution_errors,
                handle_execute_plan_result_raw,
            )

        except ImportError:
            raise AirflowException(
                'To use the DagsterKubernetesPodOperator, dagster and dagster_graphql must be'
                ' installed in your Airflow environment.'
            )

        if 'run_id' in self.params:
            self._run_id = self.params['run_id']
        elif 'dag_run' in context and context['dag_run'] is not None:
            self._run_id = context['dag_run'].run_id

        # return to original execute code:
        try:
            client = kube_client.get_kube_client(
                in_cluster=self.in_cluster,
                cluster_context=self.cluster_context,
                config_file=self.config_file,
            )
            gen = pod_generator.PodGenerator()

            for mount in self.volume_mounts:
                gen.add_mount(mount)
            for volume in self.volumes:
                gen.add_volume(volume)

            pod = gen.make_pod(
                namespace=self.namespace,
                image=self.image,
                pod_id=self.name,
                cmds=self.cmds,
                arguments=self.query(context.get('ts')),
                labels=self.labels,
            )

            pod.service_account_name = self.service_account_name
            pod.secrets = self.secrets
            pod.envs = self.env_vars
            pod.image_pull_policy = self.image_pull_policy
            pod.image_pull_secrets = self.image_pull_secrets
            pod.annotations = self.annotations
            pod.resources = self.resources
            pod.affinity = self.affinity
            pod.node_selectors = self.node_selectors
            pod.hostnetwork = self.hostnetwork
            pod.tolerations = self.tolerations
            pod.configmaps = self.configmaps
            pod.security_context = self.security_context

            launcher = pod_launcher.PodLauncher(kube_client=client, extract_xcom=self.xcom_push)
            try:
                if self.instance:
                    tags = (
                        {AIRFLOW_EXECUTION_DATE_STR: context.get('ts')} if 'ts' in context else {}
                    )

                    run = self.instance.register_managed_run(
                        pipeline_name=self.pipeline_name,
                        run_id=self.run_id,
                        run_config=self.run_config,
                        mode=self.mode,
                        solids_to_execute=None,
                        step_keys_to_execute=None,
                        tags=tags,
                        root_run_id=None,
                        parent_run_id=None,
                        pipeline_snapshot=self.pipeline_snapshot,
                        execution_plan_snapshot=self.execution_plan_snapshot,
                        parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                    )

                # we won't use the "result", which is the pod's xcom json file
                (final_state, _) = launcher.run_pod(
                    pod, startup_timeout=self.startup_timeout_seconds, get_logs=self.get_logs
                )

                # fetch the last line independently of whether logs were read
                # unbelievably, if you set tail_lines=1, the returned json has its double quotes
                # turned into unparseable single quotes
                res = None
                num_attempts = 0
                while not res and num_attempts < LOG_RETRIEVAL_MAX_ATTEMPTS:
                    raw_res = client.read_namespaced_pod_log(
                        name=pod.name, namespace=pod.namespace, container='base'
                    )
                    res = parse_raw_log_lines(raw_res.split('\n'))
                    time.sleep(LOG_RETRIEVAL_WAITS_BETWEEN_ATTEMPTS_SEC)
                    num_attempts += 1

                try:
                    handle_execution_errors(res, 'executePlan')
                except DagsterGraphQLClientError as err:
                    self.instance.report_engine_event(
                        str(err),
                        run,
                        EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info())
                        ),
                        self.__class__,
                    )
                    raise

                events = handle_execute_plan_result_raw(res)

                if self.instance:
                    for event in events:
                        self.instance.handle_new_event(event)

                events = [e.dagster_event for e in events]
                check_events_for_failures(events)
                check_events_for_skips(events)
                return events

            finally:
                self._run_id = None

                if self.is_delete_operator_pod:
                    launcher.delete_pod(pod)

            if final_state != State.SUCCESS:
                raise AirflowException('Pod returned a failure: {state}'.format(state=final_state))
            # note the lack of returning the default xcom
        except AirflowException as ex:
            raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
Example #17
def _core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.callable_param(step_execution_fn, 'step_execution_fn')

    check.param_invariant(
        isinstance(pipeline_context.executor_config, (CeleryConfig, CeleryK8sJobConfig)),
        'pipeline_context',
        'Expected executor_config to be Celery config got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Celery, use filesystem (on top of NFS or '
        'similar system that allows files to be available to all nodes), S3, or GCS',
    )

    app = make_app(celery_config)

    priority_for_step = lambda step: (
        -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority))
        + -1 * _get_run_priority(pipeline_context)
    )
    priority_for_key = lambda step_key: (
        priority_for_step(execution_plan.get_step_by_key(step_key))
    )
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_errors = {}
    completed_steps = set({})  # Set[step_key]
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
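        # Poll completed Celery results in priority order and re-emit their serialized Dagster
        # events on the orchestrating process.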
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)
                completed_steps.add(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                        step_key=step.key, queue=queue
                    ),
                    EngineEventData(marker_start=DELEGATE_MARKER),
                    step_key=step.key,
                )

                # Get the Celery priority for this step
                priority = _get_step_priority(pipeline_context, step)

                # Submit the Celery tasks
                step_results[step.key] = step_execution_fn(
                    app, pipeline_context, step, queue, priority
                )

            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )
Example #18
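    # Earlier Celery executor variant: checks that the configured storage is compatible with
    # the broker/backend before submitting any tasks.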
    def execute(pipeline_context, execution_plan):
        from .tasks import make_app

        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        check.param_invariant(
            isinstance(pipeline_context.executor_config, CeleryConfig),
            'pipeline_context',
            'Expected executor_config to be CeleryConfig got {}'.format(
                pipeline_context.executor_config),
        )

        celery_config = pipeline_context.executor_config

        storage = pipeline_context.environment_dict.get('storage')

        if (celery_config.broker and not is_local_uri(celery_config.broker)
            ) or (celery_config.backend
                  and not is_local_uri(celery_config.backend)):
            check.invariant(
                storage.get('s3') or storage.get('gcs'),
                'Must use S3 or GCS storage with non-local Celery broker: {broker} '
                'and backend: {backend}'.format(broker=celery_config.broker,
                                                backend=celery_config.backend),
            )
        else:
            check.invariant(
                not storage.get('in_memory'),
                'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
            )

        app = make_app(celery_config)

        priority_for_step = lambda step: (-1 * int(
            step.tags.get('dagster-celery/priority', task_default_priority)
        ) + -1 * _get_run_priority(pipeline_context))
        priority_for_key = lambda step_key: (priority_for_step(
            execution_plan.get_step_by_key(step_key)))
        _warn_on_priority_misuse(pipeline_context, execution_plan)

        step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
        step_errors = {}
        completed_steps = set({})  # Set[step_key]
        active_execution = execution_plan.start(
            retries=pipeline_context.executor_config.retries,
            sort_key_fn=priority_for_step)
        stopping = False

        while (not active_execution.is_complete
               and not stopping) or step_results:

            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: priority_for_key(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except Exception as e:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here.. maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[step_key] = serializable_error_info_from_exc_info(
                            sys.exc_info())
                        stopping = True
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(
                            step_event)
                        yield event
                        active_execution.handle_event(event)

                    results_to_pop.append(step_key)
                    completed_steps.add(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    active_execution.verify_complete(pipeline_context,
                                                     step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.skipped_step_events_iterator(
                    pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get('dagster-celery/queue',
                                          task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'
                        .format(step_key=step.key, queue=queue),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_key=step.key,
                    )
                    step_results[step.key] = _submit_task(
                        app, pipeline_context, step, queue)
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.',
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                'During celery execution errors occurred in workers:\n{error_list}'
                .format(error_list='\n'.join([
                    '[{step}]: {err}'.format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                ])),
                subprocess_error_infos=list(step_errors.values()),
            )
Example #19
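# Generator used by the CLI API: setup failures are surfaced as an IPCErrorMessage, and run
# events are yielded back to the caller as they occur.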
def _execute_run(request):
    try:
        execute_run_args = deserialize_json_to_dagster_namedtuple(
            request.serialized_execute_run_args)
        check.inst_param(execute_run_args, 'execute_run_args', ExecuteRunArgs)

        recon_pipeline = recon_pipeline_from_origin(
            execute_run_args.pipeline_origin)

        instance = DagsterInstance.from_ref(execute_run_args.instance_ref)
        pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)

        pid = os.getpid()

    except:  # pylint: disable=bare-except
        yield IPCErrorMessage(
            serializable_error_info=serializable_error_info_from_exc_info(
                sys.exc_info()),
            message='Error during RPC setup for ExecuteRun',
        )
        return

    yield instance.report_engine_event(
        'Started process for pipeline (pid: {pid}).'.format(pid=pid),
        pipeline_run,
        EngineEventData.in_process(pid, marker_end='cli_api_subprocess_init'),
    )

    # This is so nasty but seemingly unavoidable
    # https://amir.rachum.com/blog/2017/03/03/generator-cleanup/
    closed = False
    try:
        for event in execute_run_iterator(recon_pipeline, pipeline_run,
                                          instance):
            yield event
    except DagsterSubprocessError as err:
        if not all([
                err_info.cls_name == 'KeyboardInterrupt'
                for err_info in err.subprocess_error_infos
        ]):
            yield instance.report_engine_event(
                'An exception was thrown during execution that is likely a framework error, '
                'rather than an error in user code.',
                pipeline_run,
                EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())),
            )
            instance.report_run_failed(pipeline_run)
    except GeneratorExit:
        closed = True
        raise
    except Exception:  # pylint: disable=broad-except
        yield instance.report_engine_event(
            'An exception was thrown during execution that is likely a framework error, '
            'rather than an error in user code.',
            pipeline_run,
            EngineEventData.engine_error(
                serializable_error_info_from_exc_info(sys.exc_info())),
        )
        instance.report_run_failed(pipeline_run)
    finally:
        if not closed:
            yield instance.report_engine_event(
                'Process for pipeline exited (pid: {pid}).'.format(pid=pid),
                pipeline_run,
            )
Example #20
def core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
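    # Later revision of the Celery loop: uses the executor's retries/app_args API, revokes
    # in-flight tasks on interrupt, and tolerates TaskRevokedError when collecting results.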

    check.inst_param(pipeline_context, "pipeline_context",
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.callable_param(step_execution_fn, "step_execution_fn")

    executor = pipeline_context.executor

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        execution_plan.artifacts_persisted,
        "Cannot use in-memory storage with Celery, use filesystem (on top of NFS or "
        "similar system that allows files to be available to all nodes), S3, or GCS",
    )

    app = make_app(executor.app_args())

    priority_for_step = lambda step: (-1 * int(
        step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority)
    ) + -1 * _get_run_priority(pipeline_context))
    priority_for_key = lambda step_key: (priority_for_step(
        execution_plan.get_step_by_key(step_key)))
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[step_key, celery.AsyncResult]
    step_errors = {}

    with execution_plan.start(
            retries=pipeline_context.executor.retries,
            sort_key_fn=priority_for_step,
    ) as active_execution:

        stopping = False

        while (not active_execution.is_complete
               and not stopping) or step_results:
            if active_execution.check_for_interrupts():
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Celery executor: received termination signal - revoking active tasks from workers",
                    EngineEventData.interrupted(list(step_results.keys())),
                )
                stopping = True
                active_execution.mark_interrupted()
                for result in step_results.values():
                    result.revoke()
            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: priority_for_key(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except TaskRevokedError:
                        step_events = []
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            'celery task for running step "{step_key}" was revoked.'
                            .format(step_key=step_key),
                            EngineEventData(marker_end=DELEGATE_MARKER),
                            step_handle=active_execution.get_step_by_key(
                                step_key).handle,
                        )
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here; maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[
                            step_key] = serializable_error_info_from_exc_info(
                                sys.exc_info())
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(
                            step_event)
                        yield event
                        active_execution.handle_event(event)

                    results_to_pop.append(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    active_execution.verify_complete(pipeline_context,
                                                     step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(
                    pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping or step_errors:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG,
                                          task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'
                        .format(step_key=step.key, queue=queue),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_handle=step.handle,
                    )

                    # Get the Celery priority for this step
                    priority = _get_step_priority(pipeline_context, step)

                    # Submit the Celery tasks
                    step_results[step.key] = step_execution_fn(
                        app, pipeline_context, step, queue, priority)

                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Encountered error during celery task submission.".
                        format(),
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                "During celery execution errors occurred in workers:\n{error_list}"
                .format(error_list="\n".join([
                    "[{step}]: {err}".format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                ])),
                subprocess_error_infos=list(step_errors.values()),
            )
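
Step priority above is read from the dagster-celery priority tag and negated so that the plan's ascending sort runs higher-priority steps first, with the run-level priority added on top. A small sketch of that ordering, with hypothetical tag dicts standing in for execution steps:

# Sketch of the priority ordering used above: tag values are negated so that
# sorting ascending by key executes the highest-priority steps first.
DAGSTER_CELERY_STEP_PRIORITY_TAG = "dagster-celery/priority"  # same tag name as above
TASK_DEFAULT_PRIORITY = 4  # assumed default; the real default comes from the celery config

steps = {
    "load": {DAGSTER_CELERY_STEP_PRIORITY_TAG: "9"},
    "transform": {},  # falls back to the default priority
    "export": {DAGSTER_CELERY_STEP_PRIORITY_TAG: "1"},
}


def sort_key(step_key, run_priority=0):
    tags = steps[step_key]
    step_priority = int(tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, TASK_DEFAULT_PRIORITY))
    return -1 * step_priority + -1 * run_priority


print(sorted(steps, key=sort_key))  # ['load', 'transform', 'export']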
Example #21
0
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        check.param_invariant(
            isinstance(pipeline_context.executor_config, CeleryConfig),
            'pipeline_context',
            'Expected executor_config to be CeleryConfig got {}'.format(
                pipeline_context.executor_config),
        )

        celery_config = pipeline_context.executor_config

        storage = pipeline_context.environment_dict.get('storage')

        if (celery_config.broker and not is_local_uri(celery_config.broker)
            ) or (celery_config.backend
                  and not is_local_uri(celery_config.backend)):
            check.invariant(
                storage.get('s3') or storage.get('gcs'),
                'Must use S3 or GCS storage with non-local Celery broker: {broker} '
                'and backend: {backend}'.format(broker=celery_config.broker,
                                                backend=celery_config.backend),
            )
        else:
            check.invariant(
                not storage.get('in_memory'),
                'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
            )

        pipeline_name = pipeline_context.pipeline_def.name

        handle_dict = pipeline_context.execution_target_handle.to_dict()

        instance_ref_dict = pipeline_context.instance.get_ref().to_dict()

        environment_dict = dict(pipeline_context.environment_dict,
                                execution={'in_process': {}})

        mode = pipeline_context.mode_def.name

        run_id = pipeline_context.pipeline_run.run_id

        app = make_app(celery_config)

        task_signatures = {}  # Dict[step_key, celery.Signature]
        apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

        priority_for_step = lambda step: (-1 * int(
            step.tags.get('dagster-celery/priority', task_default_priority)))
        priority_for_key = lambda step_key: (-1 * apply_kwargs[step_key][
            'priority'])
        _warn_on_priority_misuse(pipeline_context, execution_plan)

        for step_key in execution_plan.step_keys_to_execute:
            step = execution_plan.get_step_by_key(step_key)
            priority = int(
                step.tags.get('dagster-celery/priority',
                              task_default_priority))
            queue = step.tags.get('dagster-celery/queue', task_default_queue)
            task = create_task(app)

            variables = {
                'executionParams': {
                    'selector': {
                        'name': pipeline_name
                    },
                    'environmentConfigData': environment_dict,
                    'mode': mode,
                    'executionMetadata': {
                        'runId': run_id
                    },
                    'stepKeys': [step_key],
                }
            }
            task_signatures[step_key] = task.si(handle_dict, variables,
                                                instance_ref_dict)
            apply_kwargs[step_key] = {
                'priority': priority,
                'queue': queue,
                'routing_key': '{queue}.execute_query'.format(queue=queue),
            }

        step_results = {}  # Dict[step_key, celery.AsyncResult]
        step_success = {}
        step_errors = {}
        completed_steps = set({})  # Set[step_key]
        active_execution = execution_plan.start(sort_key_fn=priority_for_step)
        stopping = False

        while (not active_execution.is_complete
               and not stopping) or step_results:

            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: priority_for_key(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here; maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[
                            step_key] = serializable_error_info_from_exc_info(
                                sys.exc_info())
                        stopping = True
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(
                            step_event)
                        yield event
                        if event.is_step_success:
                            step_success[step_key] = True
                        elif event.is_step_failure:
                            step_success[step_key] = False

                    results_to_pop.append(step_key)
                    completed_steps.add(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    was_success = step_success.get(step_key)
                    if was_success is True:
                        active_execution.mark_success(step_key)
                    elif was_success is False:
                        active_execution.mark_failed(step_key)
                    else:
                        # check errors list?
                        pipeline_context.log.error(
                            'Step {key} finished without success or failure event, assuming failure.'
                            .format(key=step_key))
                        active_execution.mark_failed(step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.skipped_step_events_iterator(
                    pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    step_results[step.key] = task_signatures[
                        step.key].apply_async(**apply_kwargs[step.key])
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.',
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                'During celery execution errors occurred in workers:\n{error_list}'
                .format(error_list='\n'.join([
                    '[{step}]: {err}'.format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                ])),
                subprocess_error_infos=list(step_errors.values()),
            )
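
In this older variant each step's outcome is tracked as a tri-state flag: True after a step success event, False after a step failure event, and None when the celery result produced no terminal event at all, which is conservatively treated as a failure. A small sketch of that resolution logic with plain strings standing in for Dagster events:

# Sketch of the tri-state outcome handling above: a step with no terminal
# event is conservatively marked failed.
def resolve_outcome(events):
    outcome = None  # None means "no success or failure event seen"
    for event in events:
        if event == "STEP_SUCCESS":
            outcome = True
        elif event == "STEP_FAILURE":
            outcome = False
    return outcome


for events in (["STEP_START", "STEP_SUCCESS"], ["STEP_START", "STEP_FAILURE"], ["STEP_START"]):
    outcome = resolve_outcome(events)
    if outcome is True:
        print("mark_success")
    elif outcome is False:
        print("mark_failed")
    else:
        print("mark_failed (no terminal event, assuming failure)")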
Example #22
0
def _schedule_run_at_time(
    instance,
    logger,
    repo_location,
    schedule_state,
    schedule_time_utc,
    tick_holder,
    debug_crash_flags,
):
    schedule_name = schedule_state.name

    repo_dict = repo_location.get_repositories()
    check.invariant(
        len(repo_dict) == 1,
        "Reconstructed repository location should have exactly one repository",
    )
    external_repo = next(iter(repo_dict.values()))

    external_schedule = external_repo.get_external_schedule(schedule_name)

    pipeline_selector = PipelineSelector(
        location_name=repo_location.name,
        repository_name=external_repo.name,
        pipeline_name=external_schedule.pipeline_name,
        solid_selection=external_schedule.solid_selection,
    )

    subset_pipeline_result = repo_location.get_subset_external_pipeline_result(
        pipeline_selector)
    external_pipeline = ExternalPipeline(
        subset_pipeline_result.external_pipeline_data,
        external_repo.handle,
    )

    # Rule out the case where the scheduler crashed between creating a run for this time
    # and launching it
    runs_filter = PipelineRunsFilter(tags=merge_dicts(
        PipelineRun.tags_for_schedule(schedule_state),
        {SCHEDULED_EXECUTION_TIME_TAG: schedule_time_utc.isoformat()},
    ))
    existing_runs = instance.get_runs(runs_filter)

    run_to_launch = None

    if len(existing_runs):
        check.invariant(len(existing_runs) == 1)

        run = existing_runs[0]

        if run.status != PipelineRunStatus.NOT_STARTED:
            # A run already exists and was launched for this time period,
            # but the scheduler must have crashed before the tick could be put
            # into a SUCCESS state

            logger.info(
                "Run {run_id} already completed for this execution of {schedule_name}"
                .format(run_id=run.run_id, schedule_name=schedule_state.name))
            tick_holder.update_with_status(ScheduleTickStatus.SUCCESS,
                                           run_id=run.run_id)

            return
        else:
            logger.info(
                "Run {run_id} already created for this execution of {schedule_name}"
                .format(run_id=run.run_id, schedule_name=schedule_state.name))
            run_to_launch = run
    else:
        run_to_launch = _create_scheduler_run(
            instance,
            logger,
            schedule_time_utc,
            repo_location,
            external_repo,
            external_schedule,
            external_pipeline,
            tick_holder,
        )

        _check_for_debug_crash(debug_crash_flags, "RUN_CREATED")

    if not run_to_launch:
        check.invariant(tick_holder.status != ScheduleTickStatus.STARTED
                        and tick_holder.status != ScheduleTickStatus.SUCCESS)
        return

    if run_to_launch.status != PipelineRunStatus.FAILURE:
        try:
            instance.launch_run(run_to_launch.run_id, external_pipeline)
            logger.info(
                "Completed scheduled launch of run {run_id} for {schedule_name}"
                .format(run_id=run_to_launch.run_id,
                        schedule_name=schedule_name))
        except Exception as e:  # pylint: disable=broad-except
            if not isinstance(e, KeyboardInterrupt):
                error = serializable_error_info_from_exc_info(sys.exc_info())
                instance.report_engine_event(
                    error.message,
                    run_to_launch,
                    EngineEventData.engine_error(error),
                )
                instance.report_run_failed(run_to_launch)
                logger.error(
                    "Run {run_id} created successfully but failed to launch.".
                    format(run_id=run_to_launch.run_id))

    _check_for_debug_crash(debug_crash_flags, "RUN_LAUNCHED")

    tick_holder.update_with_status(ScheduleTickStatus.SUCCESS,
                                   run_id=run_to_launch.run_id)
    _check_for_debug_crash(debug_crash_flags, "TICK_SUCCESS")
Example #23
0
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

        limit = self.max_concurrent

        yield DagsterEvent.engine_event(
            pipeline_context,
            "Executing steps using multiprocess executor: parent process (pid: {pid})"
            .format(pid=os.getpid()),
            event_specific_data=EngineEventData.multiprocess(
                os.getpid(),
                step_keys_to_execute=execution_plan.step_keys_to_execute),
        )

        # It would be good to implement a reference tracking algorithm here so we could
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        with time_execution_scope() as timer_result:
            with execution_plan.start(
                    retry_mode=self.retries) as active_execution:
                active_iters = {}
                errors = {}
                term_events = {}
                stopping = False

                while (not stopping
                       and not active_execution.is_complete) or active_iters:
                    if active_execution.check_for_interrupts():
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            "Multiprocess executor: received termination signal - "
                            "forwarding to active child processes",
                            EngineEventData.interrupted(
                                list(term_events.keys())),
                        )
                        stopping = True
                        active_execution.mark_interrupted()
                        for event in term_events.values():
                            event.set()

                    # start iterators
                    while len(active_iters) < limit and not stopping:
                        steps = active_execution.get_steps_to_execute(
                            limit=(limit - len(active_iters)))

                        if not steps:
                            break

                        for step in steps:
                            step_context = pipeline_context.for_step(step)
                            term_events[step.key] = multiprocessing.Event()
                            active_iters[
                                step.key] = self.execute_step_out_of_process(
                                    step_context,
                                    step,
                                    errors,
                                    term_events,
                                    active_execution.get_known_state(),
                                )

                    # process active iterators
                    empty_iters = []
                    for key, step_iter in active_iters.items():
                        try:
                            event_or_none = next(step_iter)
                            if event_or_none is None:
                                continue
                            else:
                                yield event_or_none
                                active_execution.handle_event(event_or_none)

                        except ChildProcessCrashException as crash:
                            serializable_error = serializable_error_info_from_exc_info(
                                sys.exc_info())
                            yield DagsterEvent.engine_event(
                                pipeline_context,
                                ("Multiprocess executor: child process for step {step_key} "
                                 "unexpectedly exited with code {exit_code}"
                                 ).format(step_key=key,
                                          exit_code=crash.exit_code),
                                EngineEventData.engine_error(
                                    serializable_error),
                                step_handle=active_execution.get_step_by_key(
                                    key).handle,
                            )
                            step_failure_event = DagsterEvent.step_failure_event(
                                step_context=pipeline_context.for_step(
                                    active_execution.get_step_by_key(key)),
                                step_failure_data=StepFailureData(
                                    error=serializable_error,
                                    user_failure_data=None),
                            )
                            active_execution.handle_event(step_failure_event)
                            yield step_failure_event
                            empty_iters.append(key)
                        except StopIteration:
                            empty_iters.append(key)

                    # clear and mark complete finished iterators
                    for key in empty_iters:
                        del active_iters[key]
                        del term_events[key]
                        active_execution.verify_complete(pipeline_context, key)

                    # process skipped and abandoned steps
                    yield from active_execution.plan_events_iterator(
                        pipeline_context)

                errs = {pid: err for pid, err in errors.items() if err}

                # After termination starts, raise an interrupted exception once all subprocesses
                # have finished cleaning up (and the only errors were from being interrupted)
                if (stopping and (not active_iters) and all([
                        err_info.cls_name == "DagsterExecutionInterruptedError"
                        for err_info in errs.values()
                ])):
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess executor: interrupted all active child processes",
                        event_specific_data=EngineEventData(),
                    )
                    raise DagsterExecutionInterruptedError()
                elif errs:
                    raise DagsterSubprocessError(
                        "During multiprocess execution errors occurred in child processes:\n{error_list}"
                        .format(error_list="\n".join([
                            "In process {pid}: {err}".format(
                                pid=pid, err=err.to_string())
                            for pid, err in errs.items()
                        ])),
                        subprocess_error_infos=list(errs.values()),
                    )

        yield DagsterEvent.engine_event(
            pipeline_context,
            "Multiprocess executor: parent process exiting after {duration} (pid: {pid})"
            .format(duration=format_duration(timer_result.millis),
                    pid=os.getpid()),
            event_specific_data=EngineEventData.multiprocess(os.getpid()),
        )
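
The multiprocess executor drives one iterator per in-flight step: it tops the pool up to max_concurrent, polls each active iterator for at most one event per pass, and drops iterators once they raise StopIteration. A stripped-down sketch of that driving loop over plain generators:

# Sketch of the bounded-concurrency driving loop above, with plain generators
# standing in for per-step child process iterators.
def make_step_iter(step_key):
    yield "{} started".format(step_key)
    yield None  # None means "no event yet, check back later"
    yield "{} finished".format(step_key)


pending = ["a", "b", "c", "d"]
active_iters = {}
limit = 2  # max_concurrent

while pending or active_iters:
    # start iterators, up to the concurrency limit
    while pending and len(active_iters) < limit:
        key = pending.pop(0)
        active_iters[key] = make_step_iter(key)

    # poll each active iterator for at most one event
    finished = []
    for key, step_iter in active_iters.items():
        try:
            event = next(step_iter)
            if event is not None:
                print(event)
        except StopIteration:
            finished.append(key)

    # clear finished iterators after the polling pass
    for key in finished:
        del active_iters[key]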
Example #24
0
def _launch_scheduled_execution(instance, schedule_def, pipeline, tick,
                                stream):
    pipeline_def = pipeline.get_definition()

    # Run should_execute and halt if it returns False
    schedule_context = ScheduleExecutionContext(instance)
    with user_code_error_boundary(
            ScheduleExecutionError,
            lambda:
            'Error occurred during the execution of should_execute for schedule '
            '{schedule_name}'.format(schedule_name=schedule_def.name),
    ):
        should_execute = schedule_def.should_execute(schedule_context)

    if not should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return

    errors = []

    run_config = {}
    schedule_tags = {}
    try:
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda:
                'Error occurred during the execution of run_config_fn for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            run_config = schedule_def.get_run_config(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    try:
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda:
                'Error occurred during the execution of tags_fn for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            schedule_tags = schedule_def.get_tags(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    pipeline_tags = pipeline_def.tags or {}
    check_tags(pipeline_tags, 'pipeline_tags')
    tags = merge_dicts(pipeline_tags, schedule_tags)

    mode = schedule_def.mode

    execution_plan_snapshot = None
    try:
        execution_plan = create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )
        execution_plan_snapshot = snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id())
    except DagsterInvalidConfigError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=schedule_def.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=mode,
        solids_to_execute=pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
    )

    tick.update_with_status(ScheduleTickStatus.SUCCESS,
                            run_id=possibly_invalid_pipeline_run.run_id)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=errors))
        return

    # Otherwise the run should be valid so lets launch it

    # Need an ExternalPipeline to launch so make one here
    recon_repo = pipeline.get_reconstructable_repository()
    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(
        recon_repo.get_definition().name).get_full_external_pipeline(
            pipeline_def.name)

    try:
        launched_run = instance.launch_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline)
    except DagsterLaunchFailedError:
        error = serializable_error_info_from_exc_info(sys.exc_info())
        instance.report_engine_event(
            error.message,
            possibly_invalid_pipeline_run,
            EngineEventData.engine_error(error),
        )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=[error]))
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
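
Each user-defined hook above (should_execute, run_config_fn, tags_fn) runs inside an error boundary, so a failure is recorded as an error rather than crashing the scheduler process. A minimal sketch of such a boundary as a context manager; UserCodeError and user_code_boundary are hypothetical stand-ins, not the real dagster helpers:

# Sketch of an error boundary like the one used above: user-code exceptions are
# re-raised as a dedicated error type whose message identifies the failing hook.
from contextlib import contextmanager


class UserCodeError(Exception):  # hypothetical stand-in for ScheduleExecutionError
    pass


@contextmanager
def user_code_boundary(msg_fn):
    # Re-raise anything from the wrapped block as UserCodeError, keeping the
    # original exception chained for debugging.
    try:
        yield
    except Exception as exc:
        raise UserCodeError(msg_fn()) from exc


errors = []
try:
    with user_code_boundary(lambda: "Error occurred during the execution of run_config_fn"):
        raise ValueError("bad config")
except UserCodeError as err:
    errors.append(str(err))

print(errors)  # ['Error occurred during the execution of run_config_fn']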
Example #25
0
    def execute_run(self, execute_run_args):
        check.inst_param(execute_run_args, "execute_run_args", ExecuteRunArgs)

        with DagsterInstance.from_ref(
                execute_run_args.instance_ref) as instance:
            try:
                pipeline_run = instance.get_run_by_id(
                    execute_run_args.pipeline_run_id)
                event_iterator = self._streaming_query(
                    "ExecuteRun",
                    api_pb2.ExecuteRunRequest,
                    serialized_execute_run_args=serialize_dagster_namedtuple(
                        execute_run_args),
                )
            except Exception as exc:  # pylint: disable=broad-except
                yield instance.report_engine_event(
                    message="Unexpected error in IPC client",
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())),
                )
                raise exc

            try:
                for event in event_iterator:
                    yield deserialize_json_to_dagster_namedtuple(
                        event.serialized_dagster_event_or_ipc_error_message)
            except KeyboardInterrupt:
                self.cancel_execution(
                    CancelExecutionRequest(
                        run_id=execute_run_args.pipeline_run_id))
                raise
            except grpc.RpcError as rpc_error:
                if (
                        # posix
                        "Socket closed" in rpc_error.debug_error_string()  # pylint: disable=no-member
                        # windows
                        or "Stream removed" in rpc_error.debug_error_string()  # pylint: disable=no-member
                ):
                    yield instance.report_engine_event(
                        message=
                        "User process: GRPC server for {run_id} terminated unexpectedly"
                        .format(run_id=pipeline_run.run_id),
                        pipeline_run=pipeline_run,
                        engine_event_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())),
                    )
                    yield instance.report_run_failed(pipeline_run)
                else:
                    yield instance.report_engine_event(
                        message="Unexpected error in IPC client",
                        pipeline_run=pipeline_run,
                        engine_event_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())),
                    )
                raise rpc_error
            except Exception as exc:  # pylint: disable=broad-except
                yield instance.report_engine_event(
                    message="Unexpected error in IPC client",
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())),
                )
                raise exc
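
The client treats two specific debug strings, "Socket closed" on posix and "Stream removed" on windows, as the gRPC server having terminated unexpectedly and fails the run in that case; any other RpcError is re-raised as a generic IPC failure. A tiny sketch of that classification; the helper name is hypothetical:

# Sketch of the RPC-error classification above: "Socket closed" (posix) and
# "Stream removed" (windows) are treated as the server terminating unexpectedly.
def server_terminated_unexpectedly(debug_error_string):
    return (
        "Socket closed" in debug_error_string
        or "Stream removed" in debug_error_string
    )


print(server_terminated_unexpectedly('{"description":"Socket closed"}'))      # True
print(server_terminated_unexpectedly('{"description":"Deadline Exceeded"}'))  # False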
Example #26
0
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

        limit = self.max_concurrent

        yield DagsterEvent.engine_event(
            pipeline_context,
            "Executing steps using multiprocess engine: parent process (pid: {pid})"
            .format(pid=os.getpid()),
            event_specific_data=EngineEventData.multiprocess(
                os.getpid(),
                step_keys_to_execute=execution_plan.step_keys_to_execute),
        )

        # It would be good to implement a reference tracking algorithm here so we could
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        with time_execution_scope() as timer_result:

            with execution_plan.start(
                    retries=self.retries) as active_execution:
                active_iters = {}
                errors = {}
                term_events = {}
                stopping = False

                while (not stopping
                       and not active_execution.is_complete) or active_iters:
                    try:
                        # start iterators
                        while len(active_iters) < limit and not stopping:
                            steps = active_execution.get_steps_to_execute(
                                limit=(limit - len(active_iters)))

                            if not steps:
                                break

                            for step in steps:
                                step_context = pipeline_context.for_step(step)
                                term_events[step.key] = multiprocessing.Event()
                                active_iters[
                                    step.
                                    key] = self.execute_step_out_of_process(
                                        step_context, step, errors,
                                        term_events)

                        # process active iterators
                        empty_iters = []
                        for key, step_iter in active_iters.items():
                            try:
                                event_or_none = next(step_iter)
                                if event_or_none is None:
                                    continue
                                else:
                                    yield event_or_none
                                    active_execution.handle_event(
                                        event_or_none)

                            except ChildProcessCrashException as crash:
                                serializable_error = serializable_error_info_from_exc_info(
                                    sys.exc_info())
                                yield DagsterEvent.engine_event(
                                    pipeline_context,
                                    ("Multiprocess executor: child process for step {step_key} "
                                     "unexpectedly exited with code {exit_code}"
                                     ).format(step_key=key,
                                              exit_code=crash.exit_code),
                                    EngineEventData.engine_error(
                                        serializable_error),
                                    step_key=key,
                                )
                                step_failure_event = DagsterEvent.step_failure_event(
                                    step_context=pipeline_context.for_step(
                                        active_execution.get_step_by_key(key)),
                                    step_failure_data=StepFailureData(
                                        error=serializable_error,
                                        user_failure_data=None),
                                )
                                active_execution.handle_event(
                                    step_failure_event)
                                yield step_failure_event
                                empty_iters.append(key)
                            except StopIteration:
                                empty_iters.append(key)

                        # clear and mark complete finished iterators
                        for key in empty_iters:
                            del active_iters[key]
                            if term_events[key].is_set():
                                stopping = True
                            del term_events[key]
                            active_execution.verify_complete(
                                pipeline_context, key)

                        # process skips from failures or uncovered inputs
                        for event in active_execution.skipped_step_events_iterator(
                                pipeline_context):
                            yield event

                    # In the very small chance that we get interrupted in this coordination section and not
                    # polling the subprocesses for events - try to clean up gracefully
                    except KeyboardInterrupt:
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            "Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes",
                            EngineEventData.interrupted(
                                list(term_events.keys())),
                        )
                        stopping = True
                        for event in term_events.values():
                            event.set()

                errs = {pid: err for pid, err in errors.items() if err}
                if errs:
                    raise DagsterSubprocessError(
                        "During multiprocess execution errors occurred in child processes:\n{error_list}"
                        .format(error_list="\n".join([
                            "In process {pid}: {err}".format(
                                pid=pid, err=err.to_string())
                            for pid, err in errs.items()
                        ])),
                        subprocess_error_infos=list(errs.values()),
                    )

        yield DagsterEvent.engine_event(
            pipeline_context,
            "Multiprocess engine: parent process exiting after {duration} (pid: {pid})"
            .format(duration=format_duration(timer_result.millis),
                    pid=os.getpid()),
            event_specific_data=EngineEventData.multiprocess(os.getpid()),
        )
Example #27
0
def _create_scheduler_run(
    instance,
    logger,
    schedule_time,
    repo_location,
    external_schedule,
    external_pipeline,
    run_request,
):
    run_config = run_request.run_config
    schedule_tags = run_request.tags

    execution_plan_errors = []
    execution_plan_snapshot = None

    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        execution_plan_errors.extend(e.subprocess_error_infos)
    except Exception as e:  # pylint: disable=broad-except
        execution_plan_errors.append(
            serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    tags[SCHEDULED_EXECUTION_TIME_TAG] = schedule_time.in_tz("UTC").isoformat()
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key

    # If the run was scheduled correctly but there was an error creating its
    # run config, enter it into the run DB with a FAILURE status
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=(PipelineRunStatus.FAILURE if len(execution_plan_errors) > 0
                else PipelineRunStatus.NOT_STARTED),
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

    if len(execution_plan_errors) > 0:
        for error in execution_plan_errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        error_string = "\n".join(
            [error.to_string() for error in execution_plan_errors])
        logger.error(
            f"Failed to fetch execution plan for {external_schedule.name}: {error_string}"
        )
    return possibly_invalid_pipeline_run
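
Run tags for a scheduled run are built by layering the schedule's tags over the pipeline's tags and then stamping the scheduled execution time and, if a run key was supplied, the run key. A minimal sketch with a local merge_dicts helper; the literal tag key values shown are assumptions:

# Sketch of the tag assembly above; later dicts win on key conflicts.
SCHEDULED_EXECUTION_TIME_TAG = "dagster/scheduled_execution_time"  # assumed value
RUN_KEY_TAG = "dagster/run_key"  # assumed value


def merge_dicts(*dicts):
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged


pipeline_tags = {"team": "data", "priority": "low"}
schedule_tags = {"priority": "high"}

tags = merge_dicts(pipeline_tags, schedule_tags)
tags[SCHEDULED_EXECUTION_TIME_TAG] = "2020-01-01T00:00:00+00:00"
run_key = "2020-01-01"
if run_key:
    tags[RUN_KEY_TAG] = run_key

print(tags["priority"])  # 'high' -- schedule tags override pipeline tags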
Example #28
0
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        check.param_invariant(
            isinstance(pipeline_context.executor_config, CeleryConfig),
            'pipeline_context',
            'Expected executor_config to be CeleryConfig got {}'.format(
                pipeline_context.executor_config),
        )

        celery_config = pipeline_context.executor_config

        pipeline_name = pipeline_context.pipeline_def.name

        handle_dict = pipeline_context.execution_target_handle.to_dict()

        instance_ref_dict = pipeline_context.instance.get_ref().to_dict()

        environment_dict = dict(pipeline_context.environment_dict,
                                execution={'in_process': {}})

        mode = pipeline_context.mode_def.name

        run_id = pipeline_context.pipeline_run.run_id

        app = make_app(celery_config)

        pending_steps = execution_plan.execution_deps()

        task_signatures = {}  # Dict[step_key, celery.Signature]
        apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

        sort_by_priority = lambda step_key: (-1 * apply_kwargs[step_key][
            'priority'])

        for step_key in execution_plan.step_keys_to_execute:
            step = execution_plan.get_step_by_key(step_key)
            priority = step.metadata.get('dagster-celery/priority',
                                         task_default_priority)
            queue = step.metadata.get('dagster-celery/queue',
                                      task_default_queue)
            task = create_task(app)

            variables = {
                'executionParams': {
                    'selector': {
                        'name': pipeline_name
                    },
                    'environmentConfigData': environment_dict,
                    'mode': mode,
                    'executionMetadata': {
                        'runId': run_id
                    },
                    'stepKeys': [step_key],
                }
            }
            task_signatures[step_key] = task.si(handle_dict, variables,
                                                instance_ref_dict)
            apply_kwargs[step_key] = {
                'priority': priority,
                'queue': queue,
                'routing_key': '{queue}.execute_query'.format(queue=queue),
            }

        step_results = {}  # Dict[step_key, celery.AsyncResult]
        completed_steps = set({})  # Set[step_key]

        while pending_steps or step_results:
            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: sort_by_priority(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here; maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                    for step_event in step_events:
                        yield deserialize_json_to_dagster_namedtuple(
                            step_event)
                    results_to_pop.append(step_key)
                    completed_steps.add(step_key)
            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]

            pending_to_pop = []
            for step_key, requirements in pending_steps.items():
                if requirements.issubset(completed_steps):
                    pending_to_pop.append(step_key)

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            to_execute = sorted(pending_to_pop, key=sort_by_priority)
            for step_key in to_execute:
                try:
                    step_results[step_key] = task_signatures[
                        step_key].apply_async(**apply_kwargs[step_key])
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.',
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            for step_key in pending_to_pop:
                if step_key in pending_steps:
                    del pending_steps[step_key]

            time.sleep(TICK_SECONDS)
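
This older Celery engine drives submission from a dependency map rather than from ActiveExecution: a pending step becomes eligible once its requirement set is a subset of the completed steps. A toy sketch of that readiness check over a three-step plan:

# Sketch of the dependency-driven loop above: steps are submitted once every
# upstream step they require has completed.
pending_steps = {        # Dict[step_key, Set[upstream step_keys]]
    "extract": set(),
    "transform": {"extract"},
    "load": {"transform"},
}
completed_steps = set()

while pending_steps:
    ready = [
        step_key
        for step_key, requirements in pending_steps.items()
        if requirements.issubset(completed_steps)
    ]
    for step_key in ready:
        print("submitting", step_key)
        del pending_steps[step_key]
        # in the real loop the step completes asynchronously; here we complete it
        # immediately so the toy example terminates
        completed_steps.add(step_key)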
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

        limit = self.max_concurrent

        yield DagsterEvent.engine_event(
            pipeline_context,
            "Executing steps using multithread executor (pid: {pid})".format(pid=os.getpid()),
            event_specific_data=EngineEventData.in_process(os.getpid(), execution_plan.step_keys_to_execute),
        )

        with time_execution_scope() as timer_result:
            with execution_plan.start(retries=self.retries) as active_execution:
                active_iters = {}
                errors = {}

                while not active_execution.is_complete or active_iters:

                    # start iterators
                    while len(active_iters) < limit:
                        steps = active_execution.get_steps_to_execute(limit=(limit - len(active_iters)))

                        if not steps:
                            break

                        for step in steps:
                            step_context = pipeline_context.for_step(step)
                            active_iters[step.key] = self.execute_step_in_thread(step.key, step_context, errors)

                    # process active iterators
                    empty_iters = []
                    for key, step_iter in active_iters.items():
                        try:
                            event_or_none = next(step_iter)
                            if event_or_none is None:
                                continue
                            yield event_or_none
                            active_execution.handle_event(event_or_none)

                        except ThreadCrashException:
                            serializable_error = serializable_error_info_from_exc_info(sys.exc_info())
                            yield DagsterEvent.engine_event(
                                pipeline_context,
                                f"Multithread executor: thread for step {key} exited unexpectedly",
                                EngineEventData.engine_error(serializable_error),
                            )
                            step_failure_event = DagsterEvent.step_failure_event(
                                step_context=pipeline_context.for_step(active_execution.get_step_by_key(key)),
                                step_failure_data=StepFailureData(error=serializable_error, user_failure_data=None),
                            )
                            active_execution.handle_event(step_failure_event)
                            yield step_failure_event
                            empty_iters.append(key)
                        except StopIteration:
                            empty_iters.append(key)

                    # clear and mark complete finished iterators
                    for key in empty_iters:
                        del active_iters[key]
                        active_execution.verify_complete(pipeline_context, key)

                    # process skipped and abandoned steps
                    for event in active_execution.plan_events_iterator(pipeline_context):
                        yield event

                errs = {tid: err for tid, err in errors.items() if err}
                if errs:
                    raise DagsterThreadError(
                        "During multithread execution errors occurred in threads:\n{error_list}".format(
                            error_list="\n".join(
                                [
                                    "In thread {tid}: {err}".format(tid=tid, err=err.to_string())
                                    for tid, err in errs.items()
                                ]
                            )
                        ),
                        thread_error_infos=list(errs.values()),
                    )

        yield DagsterEvent.engine_event(
            pipeline_context,
            "Multithread executor: parent process exiting after {duration} (pid: {pid})".format(
                duration=format_duration(timer_result.millis), pid=os.getpid()
            ),
            event_specific_data=EngineEventData.multiprocess(os.getpid()),
        )
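
The multithread executor mirrors the multiprocess loop but keys its error dict by thread id and raises one aggregated DagsterThreadError at the end. A minimal sketch of that per-thread error collection using only the standard threading module; everything except the message format is an assumption for illustration:

# Sketch of per-thread error collection as in the loop above: each worker
# records its own failure keyed by thread id, and the parent aggregates them.
import threading

errors = {}  # Dict[thread_id, str]


def run_step(step_key):
    try:
        if step_key == "bad_step":
            raise ValueError("boom in {}".format(step_key))
    except Exception as exc:  # record instead of crashing the whole run
        errors[threading.get_ident()] = "{}: {}".format(step_key, exc)


threads = [threading.Thread(target=run_step, args=(key,)) for key in ("ok_step", "bad_step")]
for t in threads:
    t.start()
for t in threads:
    t.join()

errs = {tid: err for tid, err in errors.items() if err}
if errs:
    print(
        "During multithread execution errors occurred in threads:\n"
        + "\n".join("In thread {tid}: {err}".format(tid=tid, err=err)
                    for tid, err in errs.items())
    )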