Example #1
def _check_execute_pipeline_args(pipeline, environment_dict, mode, preset,
                                 tags, solid_subset, instance):
    pipeline, pipeline_def = _check_pipeline(pipeline)
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict')
    check.opt_str_param(mode, 'mode')
    check.opt_str_param(preset, 'preset')
    check.invariant(
        not (mode is not None and preset is not None),
        'You may set only one of `mode` (got {mode}) or `preset` (got {preset}).'
        .format(mode=mode, preset=preset),
    )

    tags = check.opt_dict_param(tags, 'tags', key_type=str)
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        if pipeline_preset.environment_dict is not None:
            check.invariant(
                (not environment_dict)
                or (pipeline_preset.environment_dict == environment_dict),
                'The environment set in preset \'{preset}\' does not agree with the environment '
                'passed in the `environment_dict` argument.'.format(
                    preset=preset),
            )

            environment_dict = pipeline_preset.environment_dict

        if pipeline_preset.solid_subset is not None:
            check.invariant(
                solid_subset is None
                or solid_subset == pipeline_preset.solid_subset,
                'The solid_subset set in preset \'{preset}\', {preset_subset}, does not agree with '
                'the `solid_subset` argument: {solid_subset}'.format(
                    preset=preset,
                    preset_subset=pipeline_preset.solid_subset,
                    solid_subset=solid_subset,
                ),
            )
            solid_subset = pipeline_preset.solid_subset

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            'Mode {mode} does not agree with the mode set in preset \'{preset}\': '
            '(\'{preset_mode}\')'.format(preset=preset,
                                         preset_mode=pipeline_preset.mode,
                                         mode=mode),
        )

        mode = pipeline_preset.mode

    if mode is not None:
        if not pipeline_def.has_mode_definition(mode):
            raise DagsterInvariantViolationError((
                'You have attempted to execute pipeline {name} with mode {mode}. '
                'Available modes: {modes}').format(
                    name=pipeline_def.name,
                    mode=mode,
                    modes=pipeline_def.available_modes,
                ))
    else:
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError((
                'Pipeline {name} has multiple modes (Available modes: {modes}) and you have '
                'attempted to execute it without specifying a mode. Set '
                'mode property on the PipelineRun object.').format(
                    name=pipeline_def.name,
                    modes=pipeline_def.available_modes))
        mode = pipeline_def.get_default_mode_name()

    tags = merge_dicts(pipeline_def.tags, tags)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    if solid_subset:
        pipeline = pipeline.subset_for_execution(solid_subset)
        pipeline_def = pipeline.get_definition()
    else:
        solid_subset = pipeline_def.solid_subset

    return (pipeline, pipeline_def, environment_dict, instance, mode, tags,
            solid_subset)
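A hypothetical caller might unpack the normalized tuple like this (a minimal sketch; `my_pipeline`, the `'dev'` preset, and the keyword values are illustrative and not taken from the example above):

# Hypothetical caller sketch: `my_pipeline` is assumed to be a pipeline object of the
# kind that _check_pipeline accepts. Passing both `mode` and `preset` would trip the
# invariant above, so only `preset` is supplied.
(pipeline, pipeline_def, environment_dict, instance, mode, tags,
 solid_subset) = _check_execute_pipeline_args(
     my_pipeline,
     environment_dict=None,
     mode=None,
     preset='dev',
     tags=None,
     solid_subset=None,
     instance=None,
 )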
Example #2
def define_context_for_repository_yaml(path, instance=None):
    return DagsterGraphQLContext(
        handle=ExecutionTargetHandle.for_repo_yaml(path),
        instance=instance or DagsterInstance.ephemeral(),
        execution_manager=SynchronousExecutionManager(),
    )
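A minimal usage sketch for the helper above; the 'repository.yaml' path is an assumed example, not taken from the source:

# Builds a GraphQL context from a repository yaml, falling back to an ephemeral
# DagsterInstance when none is passed (the path below is illustrative).
context = define_context_for_repository_yaml('repository.yaml')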
Example #3
    def StartRun(self, request, _context):
        try:
            execute_run_args = check.inst(
                deserialize_json_to_dagster_namedtuple(
                    request.serialized_execute_run_args),
                ExecuteRunArgs,
            )

            run_id = execute_run_args.pipeline_run_id

            recon_pipeline = self._recon_pipeline_from_origin(
                execute_run_args.pipeline_origin)

        except:  # pylint: disable=bare-except
            return api_pb2.StartRunReply(
                serialized_start_run_result=serialize_dagster_namedtuple(
                    StartRunResult(
                        success=False,
                        message=None,
                        serializable_error_info=
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    )))

        event_queue = multiprocessing.Queue()
        termination_event = multiprocessing.Event()
        execution_process = multiprocessing.Process(
            target=start_run_in_subprocess,
            args=[
                request.serialized_execute_run_args,
                recon_pipeline,
                event_queue,
                termination_event,
            ],
        )

        with self._execution_lock:
            execution_process.start()
            self._executions[run_id] = (
                execution_process,
                DagsterInstance.from_ref(execute_run_args.instance_ref),
            )
            self._termination_events[run_id] = termination_event

        success = None
        message = None
        serializable_error_info = None

        while success is None:
            time.sleep(EVENT_QUEUE_POLL_INTERVAL)
            # We use `get_nowait()` instead of `get()` so that we can handle the case where the
            # execution process has died unexpectedly -- `get()` would hang forever in that case
            try:
                dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait(
                )
            except queue.Empty:
                if not execution_process.is_alive():
                    # subprocess died unexpectedly
                    success = False
                    message = (
                        "GRPC server: Subprocess for {run_id} terminated unexpectedly with "
                        "exit code {exit_code}".format(
                            run_id=run_id,
                            exit_code=execution_process.exitcode,
                        ))
                    serializable_error_info = serializable_error_info_from_exc_info(
                        sys.exc_info())
            else:
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              StartRunInSubprocessSuccessful):
                    success = True
                elif isinstance(dagster_event_or_ipc_error_message_or_done,
                                RunInSubprocessComplete):
                    continue
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              IPCErrorMessage):
                    success = False
                    message = dagster_event_or_ipc_error_message_or_done.message
                    serializable_error_info = (
                        dagster_event_or_ipc_error_message_or_done.
                        serializable_error_info)

        # Ensure that if the run failed, we remove it from the executions map before
        # returning so that CanCancel will never return True
        if not success:
            self._check_for_orphaned_runs()

        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=success,
                    message=message,
                    serializable_error_info=serializable_error_info,
                )))
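The loop above polls with `get_nowait()` so it can notice a dead subprocess instead of blocking forever on `get()`. A standalone sketch of that pattern outside of Dagster (function and variable names are illustrative):

import queue
import time

def poll_queue_until_result(event_queue, process, poll_interval=0.1):
    # Drain a multiprocessing.Queue without blocking forever if the producing
    # process dies before it puts anything on the queue.
    while True:
        try:
            return event_queue.get_nowait()
        except queue.Empty:
            if not process.is_alive():
                return None  # producer died without reporting a result
            time.sleep(poll_interval)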
Example #4
def execute_backfill_command(cli_args, print_fn, instance=None):
    pipeline_name = cli_args.pop('pipeline_name')
    repo_args = {k: v for k, v in cli_args.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = instance or DagsterInstance.get()
    handle = handle_for_repo_cli_args(repo_args)
    repository = handle.build_repository_definition()
    noprompt = cli_args.get('noprompt')

    # check run launcher
    if not instance.run_launcher:
        raise click.UsageError(
            'A run launcher must be configured before running a backfill. You can configure a run '
            'launcher (e.g. dagster_graphql.launcher.RemoteDagitRunLauncher) in your instance '
            '`dagster.yaml` settings. See '
            'https://docs.dagster.io/latest/deploying/instance/ for more '
            'information.')

    # Resolve pipeline
    if not pipeline_name and noprompt:
        raise click.UsageError('No pipeline specified')
    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(
                repository.pipeline_names)))
    repository = handle.build_repository_definition()
    if not repository.has_pipeline(pipeline_name):
        raise click.UsageError(
            'No pipeline found named `{}`'.format(pipeline_name))

    pipeline = repository.get_pipeline(pipeline_name)

    # Resolve partition set
    all_partition_sets = get_partition_sets_for_handle(handle)
    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline.name
    ]
    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError(
                'No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)))
    partition_set = next(
        (x for x in pipeline_partition_sets if x.name == partition_set_name),
        None)
    if not partition_set:
        raise click.UsageError(
            'No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Resolve priority
    celery_priority = get_backfill_priority_from_args(cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(
        print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
            'Do you want to proceed with the backfill ({} partitions)?'.format(
                len(partitions))):

        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        # for backwards compatibility - remove once prezi switched over to using tags argument
        if celery_priority is not None:
            run_tags['dagster-celery/run_priority'] = celery_priority

        for partition in partitions:
            # execution_plan = create_execution_plan()
            run = instance.create_run_for_pipeline(
                pipeline=pipeline,
                # execution_plan=execution_plan,
                environment_dict=partition_set.environment_dict_for_partition(
                    partition),
                mode=cli_args.get('mode') or 'default',
                tags=merge_dicts(partition_set.tags_for_partition(partition),
                                 run_tags),
            )
            instance.launch_run(run)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')
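Note on the tag handling above: per-partition tags are merged with the backfill/CLI tags via `merge_dicts`. A rough stand-in for that helper, assuming the second argument wins on key collisions (an assumption about `merge_dicts`, not taken from its source):

def merge_dicts_sketch(onto_dict, from_dict):
    # Keys present in both dicts take the value from `from_dict` under this assumption.
    result = dict(onto_dict)
    result.update(from_dict)
    return result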
Example #5
def test_interrupt_multiproc():
    with seven.TemporaryDirectory() as tempdir:
        file_1 = os.path.join(tempdir, "file_1")
        file_2 = os.path.join(tempdir, "file_2")
        file_3 = os.path.join(tempdir, "file_3")
        file_4 = os.path.join(tempdir, "file_4")

        # launch a thread that waits until the file is written to launch an interrupt
        Thread(target=_send_kbd_int, args=([file_1, file_2, file_3,
                                            file_4], )).start()

        results = []
        try:
            # launch a pipeline that writes a file and loops infinitely
            # next time the launched thread wakes up it will send a keyboard
            # interrupt
            for result in execute_pipeline_iterator(
                    reconstructable(write_files_pipeline),
                    run_config={
                        "solids": {
                            "write_1": {
                                "config": {
                                    "tempfile": file_1
                                }
                            },
                            "write_2": {
                                "config": {
                                    "tempfile": file_2
                                }
                            },
                            "write_3": {
                                "config": {
                                    "tempfile": file_3
                                }
                            },
                            "write_4": {
                                "config": {
                                    "tempfile": file_4
                                }
                            },
                        },
                        "execution": {
                            "multiprocess": {
                                "config": {
                                    "max_concurrent": 4
                                }
                            }
                        },
                        "storage": {
                            "filesystem": {}
                        },
                    },
                    instance=DagsterInstance.local_temp(tempdir=tempdir),
            ):
                results.append(result)
            assert False  # should never reach
        except (DagsterSubprocessError, KeyboardInterrupt):
            pass

        assert [result.event_type for result in results
                ].count(DagsterEventType.STEP_FAILURE) == 4
        assert DagsterEventType.PIPELINE_FAILURE in [
            result.event_type for result in results
        ]
Example #6
def pipeline_execute_command(**kwargs):
    with delay_interrupts():
        with DagsterInstance.get() as instance:
            execute_execute_command(instance, kwargs)
Example #7
def pipeline_backfill_command(**kwargs):
    with DagsterInstance.get() as instance:
        execute_backfill_command(kwargs, click.echo, instance)
Example #8
    def get_context(self, solid_config=None, mode_def=None, run_config=None):
        """Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            run_config(Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :py:class:`~dagstermill.DagstermillExecutionContext`
        """
        check.opt_inst_param(mode_def, "mode_def", ModeDefinition)
        run_config = check.opt_dict_param(run_config, "run_config", key_type=str)

        # If we are running non-interactively, and there is already a context reconstituted, return
        # that context rather than overwriting it.
        if self.context is not None and isinstance(
            self.context, DagstermillRuntimeExecutionContext
        ):
            return self.context

        if not mode_def:
            mode_def = ModeDefinition(logger_defs={"dagstermill": colored_console_logger})
            run_config["loggers"] = {"dagstermill": {}}

        solid_def = SolidDefinition(
            name="this_solid",
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description="Ephemeral solid constructed by dagstermill.get_context()",
            required_resource_keys=mode_def.resource_key_set,
        )

        pipeline_def = PipelineDefinition(
            [solid_def], mode_defs=[mode_def], name="ephemeral_dagstermill_pipeline"
        )

        run_id = make_new_run_id()

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=mode_def.name,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline = pipeline_def

        execution_plan = create_execution_plan(self.pipeline, run_config, mode=mode_def.name)
        with scoped_pipeline_context(
            execution_plan,
            run_config,
            pipeline_run,
            DagsterInstance.ephemeral(),
            scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:

            self.context = DagstermillExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=solid_config,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_context.system_storage_def,
                    pipeline_context.intermediate_storage_def,
                ),
            )

        return self.context
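A hedged sketch of how this context is typically obtained inside a notebook (assumes the dagstermill package exposes a module-level `get_context`; the `solid_config` value is illustrative):

import dagstermill

context = dagstermill.get_context(solid_config={"date": "2020-01-01"})
context.log.info("solid_config is {}".format(context.solid_config))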
Example #9
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        run_config=None,
        executable_dict=None,
        pipeline_run_dict=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        """Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        """
        check.opt_str_param(output_log_path, "output_log_path")
        check.opt_str_param(marshal_dir, "marshal_dir")
        run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
        check.dict_param(pipeline_run_dict, "pipeline_run_dict")
        check.dict_param(executable_dict, "executable_dict")
        check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")
        check.dict_param(instance_ref_dict, "instance_ref_dict")

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        pipeline_def = pipeline.get_definition()

        try:
            instance_ref = unpack_value(instance_ref_dict)
            instance = DagsterInstance.from_ref(instance_ref)
        except Exception as err:  # pylint: disable=broad-except
            six.raise_from(
                DagstermillError(
                    "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
                ),
                err,
            )

        pipeline_run = unpack_value(pipeline_run_dict)

        solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(solid_handle).definition

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline = pipeline

        execution_plan = create_execution_plan(
            self.pipeline,
            run_config,
            mode=pipeline_run.mode,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        )

        with scoped_pipeline_context(
            execution_plan,
            run_config,
            pipeline_run,
            instance,
            scoped_resources_builder_cm=self._setup_resources,
            # Set this flag even though we're not in test for clearer error reporting
            raise_on_error=True,
        ) as pipeline_context:
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=None,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_context.system_storage_def,
                    pipeline_context.intermediate_storage_def,
                ),
            )

        return self.context
Example #10
def test_max_concurrency_one():
    handle = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'infinite_loop_pipeline')

    run_id_one = make_new_run_id()
    run_id_two = make_new_run_id()

    with safe_tempfile_path() as file_one, safe_tempfile_path() as file_two:
        instance = DagsterInstance.local_temp()
        execution_manager = QueueingSubprocessExecutionManager(
            instance, max_concurrent_runs=1)

        run_one = instance.create_run(
            PipelineRun.create_empty_run(
                pipeline_name=infinite_loop_pipeline.name,
                run_id=run_id_one,
                environment_dict={
                    'solids': {
                        'loop': {
                            'config': {
                                'file': file_one
                            }
                        }
                    }
                },
            ))
        run_two = instance.create_run(
            PipelineRun.create_empty_run(
                pipeline_name=infinite_loop_pipeline.name,
                run_id=run_id_two,
                environment_dict={
                    'solids': {
                        'loop': {
                            'config': {
                                'file': file_two
                            }
                        }
                    }
                },
            ))

        execution_manager.execute_pipeline(handle, infinite_loop_pipeline,
                                           run_one, instance)
        execution_manager.execute_pipeline(handle, infinite_loop_pipeline,
                                           run_two, instance)

        while not os.path.exists(file_one):
            execution_manager.check()
            time.sleep(0.1)

        assert execution_manager.is_active(run_id_one)
        assert not execution_manager.is_active(run_id_two)
        assert not os.path.exists(file_two)

        assert execution_manager.terminate(run_id_one)

        while not os.path.exists(file_two):
            execution_manager.check()
            time.sleep(0.1)

        assert not execution_manager.is_active(run_id_one)
        assert execution_manager.is_active(run_id_two)
        assert execution_manager.terminate(run_id_two)
Example #11
def sqlite_instance_with_manager_disabled():
    with seven.TemporaryDirectory() as temp_dir:
        yield DagsterInstance.local_temp(
            tempdir=temp_dir, overrides={'dagit': {'execution_manager': {'disabled': True}}}
        )
Example #12
def test_two_runs_running():
    run_id_one = make_new_run_id()
    run_id_two = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'infinite_loop_pipeline')

    with safe_tempfile_path() as file_one, safe_tempfile_path() as file_two:
        instance = DagsterInstance.local_temp()

        execution_manager = SubprocessExecutionManager(instance)

        pipeline_run_one = instance.create_run(
            PipelineRun.create_empty_run(
                pipeline_name=infinite_loop_pipeline.name,
                run_id=run_id_one,
                environment_dict={
                    'solids': {
                        'loop': {
                            'config': {
                                'file': file_one
                            }
                        }
                    }
                },
            ))
        execution_manager.execute_pipeline(handle, infinite_loop_pipeline,
                                           pipeline_run_one, instance)

        pipeline_run_two = instance.create_run(
            PipelineRun.create_empty_run(
                pipeline_name=infinite_loop_pipeline.name,
                run_id=run_id_two,
                environment_dict={
                    'solids': {
                        'loop': {
                            'config': {
                                'file': file_two
                            }
                        }
                    }
                },
            ))

        execution_manager.execute_pipeline(handle, infinite_loop_pipeline,
                                           pipeline_run_two, instance)

        # ensure both runs have begun execution
        while not os.path.exists(file_one) and not os.path.exists(file_two):
            time.sleep(0.1)

        assert execution_manager.is_process_running(run_id_one)
        assert execution_manager.is_process_running(run_id_two)

        assert execution_manager.terminate(run_id_one)

        assert not execution_manager.is_process_running(run_id_one)
        assert execution_manager.is_process_running(run_id_two)

        assert execution_manager.terminate(run_id_two)

        assert not execution_manager.is_process_running(run_id_one)
        assert not execution_manager.is_process_running(run_id_two)
Example #13
def _check_execute_pipeline_args(pipeline,
                                 run_config,
                                 mode,
                                 preset,
                                 tags,
                                 instance,
                                 solid_selection=None):
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)

    run_config = check.opt_dict_param(run_config, 'run_config')
    check.opt_str_param(mode, 'mode')
    check.opt_str_param(preset, 'preset')
    check.invariant(
        not (mode is not None and preset is not None),
        'You may set only one of `mode` (got {mode}) or `preset` (got {preset}).'
        .format(mode=mode, preset=preset),
    )

    tags = check.opt_dict_param(tags, 'tags', key_type=str)
    check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        if pipeline_preset.run_config is not None:
            check.invariant(
                (not run_config) or (pipeline_preset.run_config == run_config),
                'The environment set in preset \'{preset}\' does not agree with the environment '
                'passed in the `run_config` argument.'.format(preset=preset),
            )

            run_config = pipeline_preset.run_config

        # load solid_selection from preset
        if pipeline_preset.solid_selection is not None:
            check.invariant(
                solid_selection is None
                or solid_selection == pipeline_preset.solid_selection,
                'The solid_selection set in preset \'{preset}\', {preset_subset}, does not agree with '
                'the `solid_selection` argument: {solid_selection}'.format(
                    preset=preset,
                    preset_subset=pipeline_preset.solid_selection,
                    solid_selection=solid_selection,
                ),
            )
            solid_selection = pipeline_preset.solid_selection

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            'Mode {mode} does not agree with the mode set in preset \'{preset}\': '
            '(\'{preset_mode}\')'.format(preset=preset,
                                         preset_mode=pipeline_preset.mode,
                                         mode=mode),
        )

        mode = pipeline_preset.mode

        tags = merge_dicts(pipeline_preset.tags, tags)

    if mode is not None:
        if not pipeline_def.has_mode_definition(mode):
            raise DagsterInvariantViolationError((
                'You have attempted to execute pipeline {name} with mode {mode}. '
                'Available modes: {modes}').format(
                    name=pipeline_def.name,
                    mode=mode,
                    modes=pipeline_def.available_modes,
                ))
    else:
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError((
                'Pipeline {name} has multiple modes (Available modes: {modes}) and you have '
                'attempted to execute it without specifying a mode. Set '
                'mode property on the PipelineRun object.').format(
                    name=pipeline_def.name,
                    modes=pipeline_def.available_modes))
        mode = pipeline_def.get_default_mode_name()

    tags = merge_dicts(pipeline_def.tags, tags)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    return (
        pipeline,
        run_config,
        instance,
        mode,
        tags,
        pipeline.solids_to_execute,
        solid_selection,
    )
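Compared with Example #1, this variant takes `run_config`/`solid_selection` and also returns `pipeline.solids_to_execute`. A hypothetical caller sketch (argument values and `my_pipeline` are illustrative, not from the source):

(pipeline, run_config, instance, mode, tags, solids_to_execute,
 solid_selection) = _check_execute_pipeline_args(
     my_pipeline,
     run_config={'solids': {}},
     mode=None,
     preset=None,
     tags=None,
     instance=None,
     solid_selection=['my_solid'],
 )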
Example #14
def test_in_memory_persist_one_run():
    with DagsterInstance.ephemeral() as instance:
        do_test_single_write_read(instance)
Example #15
def pipeline_print_command(verbose, **cli_args):
    with DagsterInstance.get() as instance:
        return execute_print_command(verbose, cli_args, click.echo, instance)
Example #16
def test_using_s3_for_subplan(s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {'storage': {'s3': {'config': {'s3_bucket': s3_bucket}}}}

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, 'return_one.compute')
    with scoped_pipeline_context(
            execution_plan.build_subset_plan(['return_one.compute']),
            run_config,
            pipeline_run,
            instance,
    ) as context:

        intermediates_manager = S3IntermediateStorage(
            s3_bucket,
            run_id,
            s3_session=context.scoped_resources_builder.build(
                required_resource_keys={'s3'}, ).s3,
        )
        step_output_handle = StepOutputHandle('return_one.compute')
        assert intermediates_manager.has_intermediate(context,
                                                      step_output_handle)
        assert intermediates_manager.get_intermediate(
            context, Int, step_output_handle).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    with scoped_pipeline_context(
            execution_plan.build_subset_plan(['add_one.compute']),
            run_config,
            pipeline_run,
            instance,
    ) as context:
        step_output_handle = StepOutputHandle('add_one.compute')
        assert intermediates_manager.has_intermediate(context,
                                                      step_output_handle)
        assert intermediates_manager.get_intermediate(
            context, Int, step_output_handle).obj == 2
Example #17
def pipeline_list_versions_command(**kwargs):
    with DagsterInstance.get() as instance:
        execute_list_versions_command(instance, kwargs)
Example #18
def test_all_step_events():  # pylint: disable=too-many-locals
    workspace = workspace_from_load_target(
        PythonFileTarget(__file__, define_test_events_pipeline.__name__))
    pipeline_def = define_test_events_pipeline()
    mode = pipeline_def.get_default_mode_name()
    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(pipeline_def, mode=mode)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def, execution_plan=execution_plan, mode=mode)
    step_levels = execution_plan.topological_step_levels()

    unhandled_events = STEP_EVENTS.copy()

    # Exclude types that are not step events
    ignored_events = {
        'LogMessageEvent',
        'PipelineStartEvent',
        'PipelineSuccessEvent',
        'PipelineInitFailureEvent',
        'PipelineFailureEvent',
    }

    event_counts = defaultdict(int)

    for step_level in step_levels:
        for step in step_level:

            variables = {
                'executionParams': {
                    'selector': {
                        'repositoryLocationName': 'test_events',
                        'repositoryName': '<<unnamed>>',
                        'pipelineName': pipeline_def.name,
                    },
                    'runConfigData': {
                        'storage': {
                            'filesystem': {}
                        }
                    },
                    'mode': mode,
                    'executionMetadata': {
                        'runId': pipeline_run.run_id
                    },
                    'stepKeys': [step.key],
                }
            }
            res = execute_query(
                workspace,
                EXECUTE_PLAN_MUTATION,
                variables,
                instance=instance,
            )

            # go through the same dict, decrement all the event records we've seen from the GraphQL
            # response
            if not res.get('errors'):
                assert 'data' in res, res
                assert 'executePlan' in res['data'], res
                assert 'stepEvents' in res['data']['executePlan'], res
                step_events = res['data']['executePlan']['stepEvents']

                events = [
                    dagster_event_from_dict(e, pipeline_def.name)
                    for e in step_events
                    if e['__typename'] not in ignored_events
                ]

                for event in events:
                    if event.step_key:
                        key = event.step_key + '.' + event.event_type_value
                    else:
                        key = event.event_type_value
                    event_counts[key] -= 1
                unhandled_events -= {
                    DagsterEventType(e.event_type_value)
                    for e in events
                }
            else:
                raise Exception(res['errors'])

    # build up a dict, incrementing all the event records we've produced in the run storage
    logs = instance.all_logs(pipeline_run.run_id)
    for log in logs:
        if not log.dagster_event or (DagsterEventType(
                log.dagster_event.event_type_value) not in STEP_EVENTS.union(
                    set([DagsterEventType.ENGINE_EVENT]))):
            continue
        if log.dagster_event.step_key:
            key = log.dagster_event.step_key + '.' + log.dagster_event.event_type_value
        else:
            key = log.dagster_event.event_type_value
        event_counts[key] += 1

    # Ensure we've processed all the events that were generated in the run storage
    assert sum(event_counts.values()) == 0

    # Ensure we've handled the universe of event types
    # Why are these retry events not handled? Because right now there is no way to configure retries
    # on executePlan -- this needs to change, and we should separate the ExecutionParams that get
    # sent to executePlan from those that get sent to startPipelineExecution and friends
    assert unhandled_events == {
        DagsterEventType.STEP_UP_FOR_RETRY, DagsterEventType.STEP_RESTARTED
    }
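The balance check above increments a counter for every event record found in the run storage and decrements it for every event returned over GraphQL, expecting the totals to cancel out. A standalone sketch of that bookkeeping (the key lists are illustrative):

from collections import defaultdict

produced_keys = ['return_one.compute.STEP_START', 'return_one.compute.STEP_SUCCESS']
handled_keys = ['return_one.compute.STEP_START', 'return_one.compute.STEP_SUCCESS']

event_counts = defaultdict(int)
for key in produced_keys:
    event_counts[key] += 1
for key in handled_keys:
    event_counts[key] -= 1

# Every produced event was handled exactly once.
assert sum(event_counts.values()) == 0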
Example #19
def pipeline_launch_command(**kwargs):
    with DagsterInstance.get() as instance:
        return execute_launch_command(instance, kwargs)
Example #20
def launch_scheduled_execution(output_file, schedule_name,
                               override_system_timezone, **kwargs):
    with (mock_system_timezone(override_system_timezone)
          if override_system_timezone else nullcontext()):
        with ipc_write_stream(output_file) as stream:
            with DagsterInstance.get() as instance:
                repository_origin = get_repository_origin_from_kwargs(kwargs)
                job_origin = repository_origin.get_job_origin(schedule_name)

                # open the tick scope before we load any external artifacts so that
                # load errors are stored in DB
                with _schedule_tick_context(
                        instance,
                        stream,
                        JobTickData(
                            job_origin_id=job_origin.get_id(),
                            job_name=schedule_name,
                            job_type=JobType.SCHEDULE,
                            status=JobTickStatus.STARTED,
                            timestamp=time.time(),
                        ),
                ) as tick_context:
                    with get_repository_location_from_kwargs(
                            kwargs) as repo_location:
                        repo_dict = repo_location.get_repositories()
                        check.invariant(
                            repo_dict and len(repo_dict) == 1,
                            "Passed in arguments should reference exactly one repository, instead there are {num_repos}"
                            .format(num_repos=len(repo_dict)),
                        )
                        external_repo = next(iter(repo_dict.values()))
                        if schedule_name not in [
                                schedule.name for schedule in
                                external_repo.get_external_schedules()
                        ]:
                            raise DagsterInvariantViolationError(
                                "Could not find schedule named {schedule_name}"
                                .format(schedule_name=schedule_name), )

                        external_schedule = external_repo.get_external_schedule(
                            schedule_name)

                        # Validate that either the schedule has no timezone or it matches
                        # the system timezone
                        schedule_timezone = external_schedule.execution_timezone
                        if schedule_timezone:
                            system_timezone = pendulum.now().timezone.name

                            if system_timezone != external_schedule.execution_timezone:
                                raise DagsterInvariantViolationError(
                                    "Schedule {schedule_name} is set to execute in {schedule_timezone}, "
                                    "but this scheduler can only run in the system timezone, "
                                    "{system_timezone}. Use DagsterDaemonScheduler if you want to be able "
                                    "to execute schedules in arbitrary timezones."
                                    .format(
                                        schedule_name=external_schedule.name,
                                        schedule_timezone=schedule_timezone,
                                        system_timezone=system_timezone,
                                    ), )

                        _launch_scheduled_executions(instance, repo_location,
                                                     external_repo,
                                                     external_schedule,
                                                     tick_context)
Example #21
def pipeline_list_command(**kwargs):
    with DagsterInstance.get() as instance:
        return execute_list_command(kwargs, click.echo, instance)
Example #22
def test_execute_hammer_through_dagit():
    handle = ExecutionTargetHandle.for_pipeline_python_file(
        script_relative_path(
            '../../../examples/dagster_examples/toys/hammer.py'),
        'hammer_pipeline')
    instance = DagsterInstance.local_temp()

    execution_manager = SubprocessExecutionManager(instance)

    context = DagsterGraphQLContext(handle=handle,
                                    execution_manager=execution_manager,
                                    instance=instance)

    executor = SyncExecutor()

    variables = {
        'executionParams': {
            'environmentConfigData': {
                'storage': {
                    'filesystem': {}
                },
                'execution': {
                    'dask': {}
                }
            },
            'selector': {
                'name': handle.build_pipeline_definition().name
            },
            'mode': 'default',
        }
    }

    start_pipeline_result = graphql(
        request_string=START_PIPELINE_EXECUTION_MUTATION,
        schema=create_schema(),
        context=context,
        variables=variables,
        executor=executor,
    )

    run_id = start_pipeline_result.data['startPipelineExecution']['run'][
        'runId']

    context.execution_manager.join()

    subscription = execute_dagster_graphql(context,
                                           SUBSCRIPTION_QUERY,
                                           variables={'runId': run_id})

    subscribe_results = []
    subscription.subscribe(subscribe_results.append)

    messages = [
        x['__typename']
        for x in subscribe_results[0].data['pipelineRunLogs']['messages']
    ]

    assert 'PipelineProcessStartEvent' in messages
    assert 'PipelineProcessStartedEvent' in messages
    assert 'PipelineStartEvent' in messages
    assert 'PipelineSuccessEvent' in messages
    assert 'PipelineProcessExitedEvent' in messages
Example #23
def test_run_groups_over_time():
    with seven.TemporaryDirectory() as tempdir:
        instance = DagsterInstance.local_temp(tempdir=tempdir)

        repo_1 = get_repo_at_time_1()

        full_evolve_run_id = execute_pipeline(
            repo_1.get_pipeline('evolving_pipeline'), instance=instance).run_id
        foo_run_id = execute_pipeline(repo_1.get_pipeline('foo_pipeline'),
                                      instance=instance).run_id
        evolve_a_run_id = execute_pipeline(
            repo_1.get_pipeline('evolving_pipeline').get_pipeline_subset_def(
                {'solid_A'}),
            instance=instance,
        ).run_id
        evolve_b_run_id = execute_pipeline(
            repo_1.get_pipeline('evolving_pipeline').get_pipeline_subset_def(
                {'solid_B'}),
            instance=instance,
        ).run_id

        context_at_time_1 = define_context_for_file(__file__,
                                                    'get_repo_at_time_1',
                                                    instance)

        result = execute_dagster_graphql(context_at_time_1,
                                         ALL_RUN_GROUPS_QUERY)
        assert result.data
        assert 'runGroupsOrError' in result.data
        assert 'results' in result.data['runGroupsOrError']
        assert len(result.data['runGroupsOrError']['results']) == 4

        t1_runs = {
            run['runId']: run
            for group in result.data['runGroupsOrError']['results']
            for run in group['runs']
        }

        # test full_evolve_run_id
        assert t1_runs[full_evolve_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSelection': None,
        }

        # test foo_run_id
        assert t1_runs[foo_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'foo_pipeline',
            'solidSelection': None,
        }

        # test evolve_a_run_id
        assert t1_runs[evolve_a_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSelection': ['solid_A'],
        }
        assert t1_runs[evolve_a_run_id]['pipelineSnapshotId']

        # test evolve_b_run_id
        assert t1_runs[evolve_b_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSelection': ['solid_B'],
        }

        context_at_time_2 = define_context_for_file(__file__,
                                                    'get_repo_at_time_2',
                                                    instance)

        result = execute_dagster_graphql(context_at_time_2,
                                         ALL_RUN_GROUPS_QUERY)
        assert 'runGroupsOrError' in result.data
        assert 'results' in result.data['runGroupsOrError']
        assert len(result.data['runGroupsOrError']['results']) == 4

        t2_runs = {
            run['runId']: run
            for group in result.data['runGroupsOrError']['results']
            for run in group['runs']
        }

        # test full_evolve_run_id
        assert t2_runs[full_evolve_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSelection': None,
        }

        # test evolve_a_run_id
        assert t2_runs[evolve_a_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSelection': ['solid_A'],
        }
        assert t2_runs[evolve_a_run_id]['pipelineSnapshotId']

        # names same
        assert (t1_runs[full_evolve_run_id]['pipeline']['name'] ==
                t2_runs[evolve_a_run_id]['pipeline']['name'])

        # snapshots differ
        assert (t1_runs[full_evolve_run_id]['pipelineSnapshotId'] !=
                t2_runs[evolve_a_run_id]['pipelineSnapshotId'])

        # pipeline name changed
        assert t2_runs[foo_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'foo_pipeline',
            'solidSelection': None,
        }
        # subset no longer valid - b renamed
        assert t2_runs[evolve_b_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSelection': ['solid_B'],
        }
Example #24
def execute_inner(step_key, pipeline_run, instance_ref):
    instance = DagsterInstance.from_ref(instance_ref)
    inner_step(instance, pipeline_run, step_key)
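`DagsterInstance.from_ref` lets a worker process rehydrate the parent's instance from a serializable reference. A minimal sketch of the round trip (assumes DAGSTER_HOME points at a persistent instance, since an ephemeral instance cannot produce a ref):

from dagster import DagsterInstance

instance = DagsterInstance.get()       # loads the instance configured under DAGSTER_HOME
instance_ref = instance.get_ref()      # serializable reference suitable for passing to a subprocess
rehydrated = DagsterInstance.from_ref(instance_ref)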
Example #25
def test_using_gcs_for_subplan(gcs_bucket):
    pipeline_def = define_inty_pipeline()

    environment_dict = {'storage': {'gcs': {'config': {'gcs_bucket': gcs_bucket}}}}

    run_id = str(uuid.uuid4())

    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=RunConfig(run_id=run_id)
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun.create_empty_run(
        pipeline_def.name, run_id=run_id, environment_dict=environment_dict
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, 'return_one.compute')
    with scoped_pipeline_context(
        pipeline_def,
        environment_dict,
        pipeline_run,
        instance,
        execution_plan.build_subset_plan(['return_one.compute']),
    ) as context:
        store = GCSIntermediateStore(
            gcs_bucket,
            run_id,
            client=context.scoped_resources_builder.build(
                mapper_fn=SolidInvocation.default_resource_mapper_fn,
                required_resource_keys={'gcs'},
            ).gcs.client,
        )
        assert store.has_intermediate(context, 'return_one.compute')
        assert store.get_intermediate(context, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    with scoped_pipeline_context(
        pipeline_def,
        environment_dict,
        pipeline_run,
        instance,
        execution_plan.build_subset_plan(['return_one.compute']),
    ) as context:
        assert store.has_intermediate(context, 'add_one.compute')
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
Example #26
def test_create_app_with_multiple_workspace_files():
    workspace = load_workspace_from_yaml_paths([
        file_relative_path(__file__, './workspace.yaml'),
        file_relative_path(__file__, './override.yaml'),
    ])
    assert create_app_from_workspace(workspace, DagsterInstance.ephemeral())
Example #27
    def ExecuteRun(self, request, _context):
        try:
            execute_run_args = deserialize_json_to_dagster_namedtuple(
                request.serialized_execute_run_args)
            check.inst_param(execute_run_args, "execute_run_args",
                             ExecuteRunArgs)

            run_id = execute_run_args.pipeline_run_id

            recon_pipeline = self._recon_pipeline_from_origin(
                execute_run_args.pipeline_origin)

        except:  # pylint: disable=bare-except
            yield api_pb2.ExecuteRunEvent(
                serialized_dagster_event_or_ipc_error_message=
                serialize_dagster_namedtuple(
                    IPCErrorMessage(
                        serializable_error_info=
                        serializable_error_info_from_exc_info(sys.exc_info()),
                        message="Error during RPC setup for ExecuteRun",
                    )))
            return

        event_queue = multiprocessing.Queue()
        termination_event = multiprocessing.Event()
        execution_process = multiprocessing.Process(
            target=execute_run_in_subprocess,
            args=[
                request.serialized_execute_run_args,
                recon_pipeline,
                event_queue,
                termination_event,
            ],
        )
        with self._execution_lock:
            execution_process.start()
            self._executions[run_id] = (
                execution_process,
                DagsterInstance.from_ref(execute_run_args.instance_ref),
            )
            self._termination_events[run_id] = termination_event

        done = False
        while not done:
            try:
                # We use `get_nowait()` instead of `get()` so that we can handle the case where the
                # execution process has died unexpectedly -- `get()` would hang forever in that case
                dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait(
                )
            except queue.Empty:
                if not execution_process.is_alive():
                    # subprocess died unexpectedly
                    yield api_pb2.ExecuteRunEvent(
                        serialized_dagster_event_or_ipc_error_message=
                        serialize_dagster_namedtuple(
                            IPCErrorMessage(
                                serializable_error_info=
                                serializable_error_info_from_exc_info(
                                    sys.exc_info()),
                                message=
                                ("GRPC server: Subprocess for {run_id} terminated unexpectedly"
                                 ).format(run_id=run_id),
                            )))
                    done = True
                time.sleep(EVENT_QUEUE_POLL_INTERVAL)
            else:
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              RunInSubprocessComplete):
                    done = True
                elif isinstance(dagster_event_or_ipc_error_message_or_done,
                                StartRunInSubprocessSuccessful):
                    continue
                else:
                    yield api_pb2.ExecuteRunEvent(
                        serialized_dagster_event_or_ipc_error_message=
                        serialize_dagster_namedtuple(
                            dagster_event_or_ipc_error_message_or_done))

        with self._execution_lock:
            if run_id in self._executions:
                del self._executions[run_id]
            if run_id in self._termination_events:
                del self._termination_events[run_id]
Example #28
def test_multiprocessing_execution_for_composite_solid_with_config_mapping():
    environment_dict = {
        'solids': {
            'composite_with_nested_config_solid_and_config_mapping': {
                'config': {'foo': 'baz', 'bar': 3}
            }
        }
    }

    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'composite_pipeline_with_config_mapping'
    )

    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=composite_pipeline_with_config_mapping.name,
            run_id=run_id,
            selector=ExecutionSelector('nonce'),
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(
        handle, composite_pipeline_with_config_mapping, pipeline_run, instance
    )
    execution_manager.join()
    assert instance.get_run_by_id(run_id).status == PipelineRunStatus.SUCCESS

    environment_dict = {
        'solids': {
            'composite_with_nested_config_solid_and_config_mapping': {
                'config': {'foo': 'baz', 'bar': 3}
            }
        },
        'execution': {'multiprocess': {}},
        'storage': {'filesystem': {}},
    }

    run_id = make_new_run_id()

    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=composite_pipeline.name,
            run_id=run_id,
            selector=ExecutionSelector('nonce'),
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(
        handle, composite_pipeline_with_config_mapping, pipeline_run, instance
    )

    execution_manager.join()
    assert instance.get_run_by_id(run_id).status == PipelineRunStatus.SUCCESS
Example #29
def test_compute_log_manager_with_envvar(gcs_bucket):
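    # Verifies that GCSComputeLogManager can authenticate from JSON credentials supplied
    # through an environment variable (json_credentials_envvar) rather than a credentials
    # file path, and that a step's stdout/stderr round-trip through GCS and can be
    # re-downloaded after the locally cached copies are deleted.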
    @job
    def simple():
        @op
        def easy(context):
            context.log.info("easy")
            print(HELLO_WORLD)  # pylint: disable=print-call
            return "easy"

        easy()

    with open(os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")) as f:
        with tempfile.TemporaryDirectory() as temp_dir:
            with environ({"ENV_VAR": f.read(), "DAGSTER_HOME": temp_dir}):
                run_store = SqliteRunStorage.from_local(temp_dir)
                event_store = SqliteEventLogStorage(temp_dir)
                manager = GCSComputeLogManager(
                    bucket=gcs_bucket,
                    prefix="my_prefix",
                    local_dir=temp_dir,
                    json_credentials_envvar="ENV_VAR",
                )
                instance = DagsterInstance(
                    instance_type=InstanceType.PERSISTENT,
                    local_artifact_storage=LocalArtifactStorage(temp_dir),
                    run_storage=run_store,
                    event_storage=event_store,
                    compute_log_manager=manager,
                    run_coordinator=DefaultRunCoordinator(),
                    run_launcher=DefaultRunLauncher(),
                    ref=InstanceRef.from_dir(temp_dir),
                )
                result = simple.execute_in_process(instance=instance)
                compute_steps = [
                    event.step_key for event in result.all_node_events
                    if event.event_type == DagsterEventType.STEP_START
                ]
                assert len(compute_steps) == 1
                step_key = compute_steps[0]

                stdout = manager.read_logs_file(result.run_id, step_key,
                                                ComputeIOType.STDOUT)
                assert stdout.data == HELLO_WORLD + SEPARATOR

                stderr = manager.read_logs_file(result.run_id, step_key,
                                                ComputeIOType.STDERR)
                for expected in EXPECTED_LOGS:
                    assert expected in stderr.data

                # Check GCS directly
                stderr_gcs = (
                    storage.Client()
                    .bucket(gcs_bucket)
                    .blob(f"my_prefix/storage/{result.run_id}/compute_logs/easy.err")
                    .download_as_bytes()
                    .decode("utf-8")
                )

                for expected in EXPECTED_LOGS:
                    assert expected in stderr_gcs

                # Check download behavior by deleting locally cached logs
                compute_logs_dir = os.path.join(temp_dir, result.run_id,
                                                "compute_logs")
                for filename in os.listdir(compute_logs_dir):
                    os.unlink(os.path.join(compute_logs_dir, filename))

                stdout = manager.read_logs_file(result.run_id, step_key,
                                                ComputeIOType.STDOUT)
                assert stdout.data == HELLO_WORLD + SEPARATOR

                stderr = manager.read_logs_file(result.run_id, step_key,
                                                ComputeIOType.STDERR)
                for expected in EXPECTED_LOGS:
                    assert expected in stderr.data
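
The test above relies on an `environ` helper to set `ENV_VAR` and `DAGSTER_HOME` only for the duration of the `with` block. A minimal standard-library sketch of such a helper is shown below; the real helper ships with Dagster's test utilities, so this is only an illustration of its contract.

import os
from contextlib import contextmanager


@contextmanager
def environ(env):
    # Temporarily apply the given environment variables, restoring the previous
    # values (or removing the keys entirely) when the block exits.
    previous = {key: os.environ.get(key) for key in env}
    os.environ.update(env)
    try:
        yield
    finally:
        for key, value in previous.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value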
Example #30
def test_runs_over_time():
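    # Executes several runs against the repository as it exists at "time 1", then reloads
    # the repository at "time 2" (where definitions have changed) and checks that each
    # historical run still reports the pipeline name and solid subset that were
    # snapshotted when the run was executed.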
    with seven.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance.local_temp(temp_dir)

        repo_1 = get_repo_at_time_1()

        full_evolve_run_id = execute_pipeline(
            repo_1.get_pipeline('evolving_pipeline'), instance=instance).run_id
        foo_run_id = execute_pipeline(repo_1.get_pipeline('foo_pipeline'),
                                      instance=instance).run_id
        evolve_a_run_id = execute_pipeline(
            repo_1.get_pipeline('evolving_pipeline').subset_for_execution(
                ['solid_A']),
            instance=instance,
        ).run_id
        evolve_b_run_id = execute_pipeline(
            repo_1.get_pipeline('evolving_pipeline').subset_for_execution(
                ['solid_B']),
            instance=instance,
        ).run_id

        context_at_time_1 = define_context_for_file(__file__,
                                                    'get_repo_at_time_1',
                                                    instance)

        result = execute_dagster_graphql(context_at_time_1, ALL_RUNS_QUERY)
        assert result.data

        t1_runs = {
            run['runId']: run
            for run in result.data['pipelineRunsOrError']['results']
        }

        assert t1_runs[full_evolve_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSubset': None,
        }

        assert t1_runs[foo_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'foo_pipeline',
            'solidSubset': None,
        }

        assert t1_runs[evolve_a_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSubset': ['solid_A'],
        }

        assert t1_runs[evolve_b_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSubset': ['solid_B'],
        }

        context_at_time_2 = define_context_for_file(__file__,
                                                    'get_repo_at_time_2',
                                                    instance)

        result = execute_dagster_graphql(context_at_time_2, ALL_RUNS_QUERY)
        assert result.data

        t2_runs = {
            run['runId']: run
            for run in result.data['pipelineRunsOrError']['results']
        }

        assert t2_runs[full_evolve_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSubset': None,
        }

        assert t2_runs[evolve_a_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSubset': ['solid_A'],
        }
        # foo_pipeline changed in the time-2 repo; the run record still reports the name snapshotted at execution time
        assert t2_runs[foo_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'foo_pipeline',
            'solidSubset': None,
        }
        # solid_B was renamed in the time-2 repo, so this subset is no longer valid there;
        # the run record still reports the subset snapshotted at execution time
        assert t2_runs[evolve_b_run_id]['pipeline'] == {
            '__typename': 'PipelineSnapshot',
            'name': 'evolving_pipeline',
            'solidSubset': ['solid_B'],
        }