Example #1
def test_execute_on_celery_k8s_with_hard_failure(  # pylint: disable=redefined-outer-name
        dagster_docker_image, dagster_instance,
        set_dagster_k8s_pipeline_run_namespace_env, dagit_url):
    run_config = merge_dicts(
        merge_dicts(
            merge_yamls([
                os.path.join(get_test_project_environments_path(),
                             "env_s3.yaml"),
            ]),
            get_celery_engine_config(
                dagster_docker_image=dagster_docker_image,
                job_namespace={"env": "DAGSTER_K8S_PIPELINE_RUN_NAMESPACE"},
            ),
        ),
        {"solids": {
            "hard_fail_or_0": {
                "config": {
                    "fail": True
                }
            }
        }},
    )

    run_id = launch_run_over_graphql(dagit_url,
                                     run_config=run_config,
                                     pipeline_name="hard_failer")

    # Check that pipeline run is marked as failed
    pipeline_run_status_failure = False
    start_time = datetime.datetime.now()
    timeout = datetime.timedelta(0, 120)

    while datetime.datetime.now() < start_time + timeout:
        pipeline_run = dagster_instance.get_run_by_id(run_id)
        if pipeline_run.status == PipelineRunStatus.FAILURE:
            pipeline_run_status_failure = True
            break
        time.sleep(5)
    assert pipeline_run_status_failure

    # Check for a step failure for the hard_fail_or_0 step
    start_time = datetime.datetime.now()
    step_failure_found = False
    while datetime.datetime.now() < start_time + timeout:
        event_records = dagster_instance.all_logs(run_id)
        for event_record in event_records:
            if event_record.dagster_event:
                if (event_record.dagster_event.event_type
                        == DagsterEventType.STEP_FAILURE
                        and event_record.dagster_event.step_key
                        == "hard_fail_or_0"):
                    step_failure_found = True
                    break
        if step_failure_found:
            # Stop polling as soon as the step failure event is observed.
            break
        time.sleep(5)
    assert step_failure_found
Example #2
def test_execute_on_celery_k8s_default(  # pylint: disable=redefined-outer-name
    dagster_docker_image,
    dagster_instance,
    helm_namespace,
    dagit_url,
):
    run_config = merge_dicts(
        merge_yamls([
            os.path.join(get_test_project_environments_path(), "env.yaml"),
            os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
        ]),
        get_celery_engine_config(dagster_docker_image=dagster_docker_image,
                                 job_namespace=helm_namespace),
    )

    run_id = launch_run_over_graphql(dagit_url,
                                     run_config=run_config,
                                     pipeline_name="demo_pipeline_celery")

    result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id,
                                           namespace=helm_namespace)

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    updated_run = dagster_instance.get_run_by_id(run_id)
    assert updated_run.tags[DOCKER_IMAGE_TAG] == dagster_docker_image
Example #3
    def _root_manager(input_context: InputContext) -> Any:
        source_asset_key = cast(AssetKey, input_context.asset_key)
        source_asset = source_assets_by_key[source_asset_key]

        @op(out={source_asset_key.path[-1]: Out(asset_key=source_asset_key)})
        def _op():
            pass

        output_context = build_output_context(
            name=source_asset_key.path[-1],
            step_key="none",
            solid_def=_op,
            metadata=merge_dicts(source_asset.metadata or {},
                                 {"logical_asset_key": source_asset_key}),
        )
        input_context_with_upstream = build_input_context(
            name=input_context.name,
            metadata=input_context.metadata,
            config=input_context.config,
            dagster_type=input_context.dagster_type,
            upstream_output=output_context,
            op_def=input_context.op_def,
        )

        io_manager = getattr(cast(Any, input_context.resources),
                             source_asset.io_manager_key)
        return io_manager.load_input(input_context_with_upstream)
Example #4
def make_run_config(scratch_dir, mode):
    if mode in ["external", "request_retry"]:
        step_launcher_resource_keys = [
            "first_step_launcher", "second_step_launcher"
        ]
    else:
        step_launcher_resource_keys = ["second_step_launcher"]
    return deep_merge_dicts(
        RUN_CONFIG_BASE,
        {
            "resources":
            merge_dicts(
                {"io_manager": {
                    "config": {
                        "base_dir": scratch_dir
                    }
                }},
                {
                    step_launcher_resource_key: {
                        "config": {
                            "scratch_dir": scratch_dir
                        }
                    }
                    for step_launcher_resource_key in
                    step_launcher_resource_keys
                },
            ),
        },
    )
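As an illustrative aside, the helper above combines two merge utilities. The sketch below shows the assumed distinction (simplified stand-ins, not the library implementations): a shallow merge overwrites whole top-level values, while a deep merge combines nested dicts key by key.

def _shallow_merge_sketch(*dicts):
    # Later dicts win per top-level key; nested values are replaced wholesale.
    out = {}
    for d in dicts:
        out.update(d)
    return out


def _deep_merge_sketch(onto, values):
    # Recursively merge nested dicts instead of replacing them.
    result = dict(onto)
    for key, value in values.items():
        if isinstance(value, dict) and isinstance(result.get(key), dict):
            result[key] = _deep_merge_sketch(result[key], value)
        else:
            result[key] = value
    return result


_base = {"resources": {"io_manager": {"config": {"base_dir": "/tmp"}}}}
_extra = {"resources": {"second_step_launcher": {"config": {"scratch_dir": "/tmp"}}}}

# The shallow merge drops io_manager; the deep merge keeps both resource entries.
assert "io_manager" not in _shallow_merge_sketch(_base, _extra)["resources"]
assert set(_deep_merge_sketch(_base, _extra)["resources"]) == {
    "io_manager",
    "second_step_launcher",
}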
Example #5
def test_execute_on_celery_k8s_job_api_with_legacy_configmap_set(  # pylint: disable=redefined-outer-name
        dagster_docker_image, dagster_instance, helm_namespace, dagit_url):
    # Originally, jobs needed to include the "dagster-pipeline-env" configmap to pick up needed
    # config when using the Helm chart. It is no longer needed, but verify that nothing breaks
    # if it is included.
    run_config = merge_dicts(
        merge_yamls([
            os.path.join(get_test_project_environments_path(), "env.yaml"),
            os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
        ]),
        get_celery_job_engine_config(
            dagster_docker_image=dagster_docker_image,
            job_namespace=helm_namespace,
            include_dagster_pipeline_env=True,
        ),
    )

    run_id = launch_run_over_graphql(dagit_url,
                                     run_config=run_config,
                                     pipeline_name="demo_job_celery")

    result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id,
                                           namespace=helm_namespace)

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    updated_run = dagster_instance.get_run_by_id(run_id)
    assert updated_run.tags[DOCKER_IMAGE_TAG] == dagster_docker_image
Example #6
def _composite_descent(parent_stack, solids_config_dict, resource_defs):
    """
    The core implementation of composite_descent. This yields a stream of
    SolidConfigEntry. This is used by composite_descent to construct a
    dictionary.

    It descends over the entire solid hierarchy, constructing an entry
    for every handle. If it encounters a composite solid instance
    with a config mapping, it will invoke that config mapping fn,
    producing the config that is necessary to configure the child solids.

    This process unrolls recursively as you descend down the tree.
    """

    for solid in parent_stack.current_container.solids:

        current_stack = parent_stack.descend(solid)
        current_handle = current_stack.handle

        current_solid_config = solids_config_dict.get(solid.name, {})

        # the base case
        if isinstance(solid.definition, SolidDefinition):
            config_mapped_solid_config = solid.definition.apply_config_mapping(
                {"config": current_solid_config.get("config")})
            if not config_mapped_solid_config.success:
                raise DagsterInvalidConfigError(
                    "Error in config for solid {}".format(solid.name),
                    config_mapped_solid_config.errors,
                    config_mapped_solid_config,
                )

            complete_config_object = merge_dicts(
                current_solid_config, config_mapped_solid_config.value)
            yield SolidConfigEntry(
                current_handle, SolidConfig.from_dict(complete_config_object))
            continue

        graph_def = check.inst(solid.definition, GraphDefinition)

        yield SolidConfigEntry(
            current_handle,
            SolidConfig.from_dict({
                "inputs":
                current_solid_config.get("inputs"),
                "outputs":
                current_solid_config.get("outputs"),
            }),
        )

        # If there is a config mapping, invoke it to get the descendant solids'
        # config; otherwise just grab the "solids" entry of the current config.
        solids_dict = (
            _get_mapped_solids_dict(
                solid, graph_def, current_stack, current_solid_config, resource_defs
            )
            if graph_def.config_mapping
            else current_solid_config.get("solids", {})
        )

        yield from _composite_descent(current_stack, solids_dict,
                                      resource_defs)
Example #7
    def __new__(
        cls,
        name: Optional[str] = None,
        resource_defs: Optional[Dict[str, ResourceDefinition]] = None,
        logger_defs: Optional[Dict[str, LoggerDefinition]] = None,
        executor_defs: Optional[List[ExecutorDefinition]] = None,
        description: Optional[str] = None,
        _config_mapping: Optional[ConfigMapping] = None,
        _partitioned_config: Optional["PartitionedConfig"] = None,
    ):

        from .partition import PartitionedConfig

        resource_defs = check.opt_dict_param(resource_defs,
                                             "resource_defs",
                                             key_type=str,
                                             value_type=ResourceDefinition)

        for key in resource_defs:
            if not key.isidentifier():
                check.failed(
                    f"Resource key '{key}' must be a valid Python identifier.")

        if resource_defs and "io_manager" in resource_defs:
            resource_defs_with_defaults = resource_defs
        else:
            from dagster.core.storage.mem_io_manager import mem_io_manager

            resource_defs_with_defaults = merge_dicts(
                {"io_manager": mem_io_manager}, resource_defs or {})

        return super(ModeDefinition, cls).__new__(
            cls,
            name=check_valid_name(name) if name else DEFAULT_MODE_NAME,
            resource_defs=resource_defs_with_defaults,
            loggers=(check.opt_dict_param(logger_defs,
                                          "logger_defs",
                                          key_type=str,
                                          value_type=LoggerDefinition)
                     or default_loggers()),
            executor_defs=check.list_param(
                executor_defs if executor_defs else default_executors,
                "executor_defs",
                of_type=ExecutorDefinition,
            ),
            description=check.opt_str_param(description, "description"),
            config_mapping=check.opt_inst_param(_config_mapping,
                                                "_config_mapping",
                                                ConfigMapping),
            partitioned_config=check.opt_inst_param(_partitioned_config,
                                                    "_partitioned_config",
                                                    PartitionedConfig),
        )
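As an aside, the io_manager branch above uses a common pattern: put the default entry first in the merge so that a user-supplied "io_manager" wins. A minimal, hypothetical sketch of that pattern with plain dicts (not Dagster's ModeDefinition):

def _with_default_resource(resource_defs, default_key, default_value):
    # Hypothetical helper: because the default comes first, any user-supplied
    # entry for default_key overrides it in the merged result.
    merged = {default_key: default_value}
    merged.update(resource_defs or {})
    return merged


assert _with_default_resource(None, "io_manager", "mem") == {"io_manager": "mem"}
assert _with_default_resource({"io_manager": "fs"}, "io_manager", "mem") == {"io_manager": "fs"}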
Example #8
def test_merge():
    # two element merge
    assert merge_dicts({}, {}) == {}
    assert merge_dicts({1: 2}, {}) == {1: 2}
    assert merge_dicts({}, {1: 2}) == {1: 2}
    assert merge_dicts({1: 1}, {1: 2}) == {1: 2}

    # three element merge
    assert merge_dicts({}, {}, {}) == {}
    assert merge_dicts({1: 2}, {2: 3}, {3: 4}) == {1: 2, 2: 3, 3: 4}
    assert merge_dicts({1: 2}, {1: 3}, {1: 4}) == {1: 4}
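
The test above pins down the semantics the other call sites rely on: a flat merge in which later arguments take precedence. A minimal sketch consistent with those assertions (not the library's actual implementation):

def _merge_dicts_sketch(*dicts):
    # Flat (non-recursive) merge: keys from later dicts overwrite earlier ones.
    result = {}
    for d in dicts:
        result.update(d)
    return result


assert _merge_dicts_sketch({1: 1}, {1: 2}) == {1: 2}
assert _merge_dicts_sketch({1: 2}, {1: 3}, {1: 4}) == {1: 4}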
Example #9
def test_execute_on_celery_k8s_with_env_var_and_termination(  # pylint: disable=redefined-outer-name
        dagster_docker_image, dagster_instance,
        set_dagster_k8s_pipeline_run_namespace_env, dagit_url):
    run_config = merge_dicts(
        merge_yamls([
            os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
        ]),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image,
            job_namespace={"env": "DAGSTER_K8S_PIPELINE_RUN_NAMESPACE"},
        ),
    )

    _test_termination(dagit_url, dagster_instance, run_config)
Example #10
def test_execute_on_celery_k8s_with_termination(  # pylint: disable=redefined-outer-name
    dagster_docker_image,
    dagster_instance,
    helm_namespace,
    dagit_url,
):
    run_config = merge_dicts(
        merge_yamls([
            os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
        ]),
        get_celery_engine_config(dagster_docker_image=dagster_docker_image,
                                 job_namespace=helm_namespace),
    )

    _test_termination(dagit_url, dagster_instance, run_config)
Example #11
    def __new__(
        cls,
        name=None,
        resource_defs=None,
        logger_defs=None,
        executor_defs=None,
        description=None,
        intermediate_storage_defs=None,
    ):
        from dagster.core.storage.system_storage import default_intermediate_storage_defs

        from .intermediate_storage import IntermediateStorageDefinition

        check.opt_dict_param(resource_defs,
                             "resource_defs",
                             key_type=str,
                             value_type=ResourceDefinition)
        if resource_defs and "io_manager" in resource_defs:
            resource_defs_with_defaults = resource_defs
        else:
            from dagster.core.storage.mem_io_manager import mem_io_manager

            resource_defs_with_defaults = merge_dicts(
                {"io_manager": mem_io_manager}, resource_defs or {})

        return super(ModeDefinition, cls).__new__(
            cls,
            name=check_valid_name(name) if name else DEFAULT_MODE_NAME,
            resource_defs=resource_defs_with_defaults,
            loggers=(check.opt_dict_param(logger_defs,
                                          "logger_defs",
                                          key_type=str,
                                          value_type=LoggerDefinition)
                     or default_loggers()),
            intermediate_storage_defs=check.list_param(
                intermediate_storage_defs if intermediate_storage_defs else
                default_intermediate_storage_defs,
                "intermediate_storage_defs",
                of_type=IntermediateStorageDefinition,
            ),
            executor_defs=check.list_param(
                executor_defs if executor_defs else default_executors,
                "executor_defs",
                of_type=ExecutorDefinition,
            ),
            description=check.opt_str_param(description, "description"),
        )
Example #12
def get_celery_engine_config(dagster_docker_image, job_namespace):
    return {
        "execution": {
            "celery-k8s": {
                "config":
                merge_dicts(
                    ({
                        "job_image": dagster_docker_image,
                    } if dagster_docker_image else {}),
                    {
                        "job_namespace": job_namespace,
                        "image_pull_policy": image_pull_policy(),
                    },
                )
            }
        },
    }
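For illustration, a usage sketch of the helper above: the conditional `({...} if dagster_docker_image else {})` keeps `job_image` out of the merged config entirely when no image is supplied. The image name and namespace below are placeholders, and the call assumes `image_pull_policy()` from the surrounding module is available.

# Hypothetical values for illustration only.
with_image = get_celery_engine_config("example-repo/example-image:latest",
                                      job_namespace="test-ns")
celery_config = with_image["execution"]["celery-k8s"]["config"]
assert celery_config["job_image"] == "example-repo/example-image:latest"

without_image = get_celery_engine_config(None, job_namespace="test-ns")
assert "job_image" not in without_image["execution"]["celery-k8s"]["config"]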
Example #13
def test_execute_on_celery_k8s_retry_pipeline(  # pylint: disable=redefined-outer-name
        dagster_docker_image, dagster_instance, helm_namespace, dagit_url):
    run_config = merge_dicts(
        merge_yamls([
            os.path.join(get_test_project_environments_path(), "env_s3.yaml")
        ]),
        get_celery_engine_config(dagster_docker_image=dagster_docker_image,
                                 job_namespace=helm_namespace),
    )

    run_id = launch_run_over_graphql(dagit_url,
                                     run_config=run_config,
                                     pipeline_name="retry_pipeline")

    result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id,
                                           namespace=helm_namespace)

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    stats = dagster_instance.get_run_stats(run_id)
    assert stats.steps_succeeded == 1

    assert DagsterEventType.STEP_START in [
        event.dagster_event.event_type
        for event in dagster_instance.all_logs(run_id)
        if event.is_dagster_event
    ]

    assert DagsterEventType.STEP_UP_FOR_RETRY in [
        event.dagster_event.event_type
        for event in dagster_instance.all_logs(run_id)
        if event.is_dagster_event
    ]

    assert DagsterEventType.STEP_RESTARTED in [
        event.dagster_event.event_type
        for event in dagster_instance.all_logs(run_id)
        if event.is_dagster_event
    ]

    assert DagsterEventType.STEP_SUCCESS in [
        event.dagster_event.event_type
        for event in dagster_instance.all_logs(run_id)
        if event.is_dagster_event
    ]
Example #14
def test_execute_on_celery_k8s_with_resource_requirements(  # pylint: disable=redefined-outer-name
        dagster_docker_image, dagster_instance, helm_namespace, dagit_url):
    run_config = merge_dicts(
        merge_yamls([
            os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
        ]),
        get_celery_engine_config(dagster_docker_image=dagster_docker_image,
                                 job_namespace=helm_namespace),
    )

    run_id = launch_run_over_graphql(dagit_url,
                                     run_config=run_config,
                                     pipeline_name="resources_limit_pipeline")

    result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id,
                                           namespace=helm_namespace)

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)
Example #15
def test_docker_executor():
    """
    Note that this test relies on having AWS credentials in the environment.
    """

    executor_config = {
        "execution": {
            "docker": {
                "config": {
                    "networks": ["container:test-postgres-db-docker"],
                    "env_vars": [
                        "AWS_ACCESS_KEY_ID",
                        "AWS_SECRET_ACCESS_KEY",
                    ],
                }
            }
        }
    }

    docker_image = get_test_project_docker_image()
    if IS_BUILDKITE:
        executor_config["execution"]["docker"]["config"][
            "registry"
        ] = get_buildkite_registry_config()
    else:
        find_local_test_image(docker_image)

    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env.yaml"),
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        executor_config,
    )

    with environ({"DOCKER_LAUNCHER_NETWORK": "container:test-postgres-db-docker"}):
        with docker_postgres_instance() as instance:
            recon_pipeline = get_test_project_recon_pipeline("demo_pipeline_docker", docker_image)
            assert execute_pipeline(
                recon_pipeline, run_config=run_config, instance=instance
            ).success
Example #16
def get_celery_job_engine_config(dagster_docker_image,
                                 job_namespace,
                                 include_dagster_pipeline_env=False):
    return {
        "execution": {
            "config":
            merge_dicts(
                ({
                    "job_image": dagster_docker_image,
                } if dagster_docker_image else {}),
                {
                    "job_namespace": job_namespace,
                    "image_pull_policy": image_pull_policy(),
                },
                ({
                    "env_config_maps": ["dagster-pipeline-env"]
                } if include_dagster_pipeline_env else {}),
            )
        },
    }
Example #17
def test_execute_subset_on_celery_k8s(  # pylint: disable=redefined-outer-name
        dagster_docker_image, helm_namespace, dagit_url):
    run_config = merge_dicts(
        merge_yamls([
            os.path.join(get_test_project_environments_path(),
                         "env_subset.yaml"),
            os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
        ]),
        get_celery_engine_config(dagster_docker_image=dagster_docker_image,
                                 job_namespace=helm_namespace),
    )

    run_id = launch_run_over_graphql(
        dagit_url,
        run_config=run_config,
        pipeline_name="demo_pipeline_celery",
        solid_selection=["count_letters"],
    )

    result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id,
                                           namespace=helm_namespace)

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)
Example #18
    def cli(self, command: str, **kwargs) -> DbtCliOutput:
        """
        Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the
        default flags that were configured on resource initialization (if any), overriding the
        default values where necessary.

        Args:
            command (str): The command you wish to run (e.g. 'run', 'test', 'docs generate', etc.)

        Returns:
            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing
                parsed log output as well as the contents of run_results.json (if applicable).
        """
        command = check.str_param(command, "command")
        extra_flags = {} if kwargs is None else kwargs

        # remove default flags that are declared as "strict" and not explicitly passed in
        default_flags = {
            k: v
            for k, v in self.default_flags.items()
            if not (k in self.strict_flags and k not in extra_flags)
        }

        flags = merge_dicts(
            default_flags,
            self._format_params(extra_flags, replace_underscores=True))

        return execute_cli(
            executable=self._executable,
            command=command,
            flags_dict=flags,
            log=self.logger,
            warn_error=self._warn_error,
            ignore_handled_error=self._ignore_handled_error,
            target_path=self._target_path,
        )
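The flag resolution in `cli` can be illustrated in isolation: defaults declared as strict are dropped unless explicitly passed, and any remaining defaults are overridden by the keyword arguments. The sketch below uses plain dicts rather than the dagster_dbt resource and ignores the underscore-to-dash formatting step.

def _resolve_flags_sketch(default_flags, strict_flags, extra_flags):
    # Drop "strict" defaults that were not explicitly passed, then let the
    # explicitly passed flags override whatever defaults remain.
    kept_defaults = {
        k: v
        for k, v in default_flags.items()
        if not (k in strict_flags and k not in extra_flags)
    }
    merged = dict(kept_defaults)
    merged.update(extra_flags)
    return merged


_flags = _resolve_flags_sketch(
    default_flags={"profiles_dir": "/etc/dbt", "full_refresh": True},
    strict_flags={"full_refresh"},
    extra_flags={"profiles_dir": "/override/dbt"},
)
# full_refresh is strict and was not passed explicitly, so it is dropped;
# profiles_dir comes from the explicitly passed value.
assert _flags == {"profiles_dir": "/override/dbt"}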
Example #19
def _launch_scheduled_execution(instance, schedule_def, pipeline, tick,
                                stream):
    pipeline_def = pipeline.get_definition()

    # Run should_execute and halt if it returns False
    schedule_context = ScheduleExecutionContext(instance)
    with user_code_error_boundary(
            ScheduleExecutionError,
            lambda:
            'Error occurred during the execution of should_execute for schedule '
            '{schedule_name}'.format(schedule_name=schedule_def.name),
    ):
        should_execute = schedule_def.should_execute(schedule_context)

    if not should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return

    errors = []

    run_config = {}
    schedule_tags = {}
    try:
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda:
                'Error occurred during the execution of run_config_fn for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            run_config = schedule_def.get_run_config(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    try:
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda:
                'Error occurred during the execution of tags_fn for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            schedule_tags = schedule_def.get_tags(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    pipeline_tags = pipeline_def.tags or {}
    check_tags(pipeline_tags, 'pipeline_tags')
    tags = merge_dicts(pipeline_tags, schedule_tags)

    mode = schedule_def.mode

    execution_plan_snapshot = None
    try:
        execution_plan = create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )
        execution_plan_snapshot = snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id())
    except DagsterInvalidConfigError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=schedule_def.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=mode,
        solids_to_execute=pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
    )

    tick.update_with_status(ScheduleTickStatus.SUCCESS,
                            run_id=possibly_invalid_pipeline_run.run_id)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=errors))
        return

    # Otherwise the run should be valid so lets launch it

    # Need an ExternalPipeline to launch so make one here
    recon_repo = pipeline.get_reconstructable_repository()
    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(
        recon_repo.get_definition().name).get_full_external_pipeline(
            pipeline_def.name)

    try:
        launched_run = instance.launch_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline)
    except DagsterLaunchFailedError:
        error = serializable_error_info_from_exc_info(sys.exc_info())
        instance.report_engine_event(
            error.message,
            possibly_invalid_pipeline_run,
            EngineEventData.engine_error(error),
        )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=[error]))
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
Example #20
def secretsmanager_secrets_resource(context):
    """Resource that provides a dict which maps selected SecretsManager secrets to
    their string values. Also optionally sets chosen secrets as environment variables.

    Example:

        .. code-block:: python

            import os
            from dagster import build_op_context, job, op
            from dagster_aws.secretsmanager import secretsmanager_secrets_resource

            @op(required_resource_keys={'secrets'})
            def example_secretsmanager_secrets_op(context):
                return context.resources.secrets.get("my-secret-name")

            @op(required_resource_keys={'secrets'})
            def example_secretsmanager_secrets_op_2(context):
                return os.getenv("my-other-secret-name")

            @job(resource_defs={'secrets': secretsmanager_secrets_resource})
            def example_job(context):
                example_secretsmanager_secrets_op()
                example_secretsmanager_secrets_op_2()

            example_job.execute_in_process(
                run_config={
                    'resources': {
                        'secrets': {
                            'config': {
                                'region_name': 'us-west-1',
                                'secrets_tag': 'dagster',
                                'add_to_environment': True,
                            }
                        }
                    }
                }
            )

    Note that your ops must also declare that they require this resource with
    `required_resource_keys`, or it will not be initialized for the execution of their compute
    functions.

    You may configure this resource as follows:

    .. code-block:: YAML

        resources:
          secretsmanager:
            config:
              region_name: "us-west-1"
              # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen
              # through the ordinary boto credential chain.
              profile_name: "dev"
              # Optional[str]: Specifies a custom profile for the SecretsManager session. Defaults to
              # the default profile as specified in the ~/.aws/credentials file.
              secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]
              # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.
              secrets_tag: "dagster"
              # Optional[str]: Specifies a tag; all secrets with this tag set will be pulled
              # from SecretsManager.
              add_to_environment: true
              # Optional[bool]: Whether to set the selected secrets as environment variables. Defaults
              # to false.

    """
    add_to_environment = check.bool_param(
        context.resource_config["add_to_environment"], "add_to_environment")
    secrets_tag = check.opt_str_param(context.resource_config["secrets_tag"],
                                      "secrets_tag")
    secrets = check.list_param(context.resource_config["secrets"],
                               "secrets",
                               of_type=str)

    secrets_manager = construct_secretsmanager_client(
        max_attempts=context.resource_config["max_attempts"],
        region_name=context.resource_config.get("region_name"),
        profile_name=context.resource_config.get("profile_name"),
    )

    secret_arns = merge_dicts(
        (get_tagged_secrets(secrets_manager, [secrets_tag])
         if secrets_tag else {}),
        get_secrets_from_arns(secrets_manager, secrets),
    )

    secrets_map = {
        name: secrets_manager.get_secret_value(SecretId=arn).get("SecretString")
        for name, arn in secret_arns.items()
    }
    with environ(secrets_map if add_to_environment else {}):
        yield secrets_map
Example #21
@resource(
    merge_dicts(
        SECRETSMANAGER_SESSION_CONFIG,
        {
            "secrets":
            Field(
                Array(str),
                is_required=False,
                default_value=[],
                description=(
                    "An array of AWS Secrets Manager secrets arns to fetch."),
            ),
            "secrets_tag":
            Field(
                Noneable(str),
                is_required=False,
                default_value=None,
                description=(
                    "AWS Secrets Manager secrets with this tag will be fetched and made available."
                ),
            ),
            "add_to_environment":
            Field(
                bool,
                is_required=False,
                default_value=False,
                description=(
                    "Whether to mount the secrets as environment variables."),
            ),
        },
    ))
@contextmanager
Example #22
              credential:
                sas: my_sas_token
                # str: the SAS token for the account.
                key:
                  env: AZURE_DATA_LAKE_STORAGE_KEY
                # str: The shared access key for the account.
    '''
    return _adls2_resource_from_config(context.resource_config)


@resource(
    merge_dicts(
        ADLS2_CLIENT_CONFIG,
        {
            'adls2_file_system':
            Field(StringSource, description='ADLS Gen2 file system name'),
            'adls2_prefix':
            Field(StringSource, is_required=False, default_value='dagster'),
        },
    ))
def adls2_file_manager(context):
    adls2_client = _adls2_resource_from_config(
        context.resource_config).adls2_client

    return ADLS2FileManager(
        adls2_client=adls2_client,
        file_system=context.resource_config['adls2_file_system'],
        prefix=context.resource_config['adls2_prefix'],
    )

Example #23
def test_container_context_on_pipeline():
    docker_image = get_test_project_docker_image()

    launcher_config = {}

    if IS_BUILDKITE:
        launcher_config["registry"] = get_buildkite_registry_config()
    else:
        find_local_test_image(docker_image)

    executor_config = {
        "execution": {
            "docker": {
                "config": {}
            }
        },
    }

    run_config = merge_dicts(
        merge_yamls([
            os.path.join(get_test_project_environments_path(), "env.yaml"),
            os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
        ]),
        executor_config,
    )

    with docker_postgres_instance(
            overrides={
                "run_launcher": {
                    "class": "DockerRunLauncher",
                    "module": "dagster_docker",
                    "config": launcher_config,
                }
            }) as instance:
        recon_pipeline = get_test_project_recon_pipeline(
            "demo_pipeline_docker",
            docker_image,
            container_context={
                "docker": {
                    "env_vars": [
                        "AWS_ACCESS_KEY_ID",
                        "AWS_SECRET_ACCESS_KEY",
                    ],
                    "networks": ["container:test-postgres-db-docker"],
                    "container_kwargs": {
                        "auto_remove": True,
                        "volumes":
                        ["/var/run/docker.sock:/var/run/docker.sock"],
                    },
                }
            },
        )
        with get_test_project_workspace_and_external_pipeline(
                instance, "demo_pipeline_docker",
                container_image=docker_image) as (
                    workspace,
                    orig_pipeline,
                ):
            external_pipeline = ReOriginatedExternalPipelineForTest(
                orig_pipeline, container_image=docker_image)

            run = instance.create_run_for_pipeline(
                pipeline_def=recon_pipeline.get_definition(),
                run_config=run_config,
                external_pipeline_origin=external_pipeline.get_external_origin(),
                pipeline_code_origin=recon_pipeline.get_python_origin(),
            )

            instance.launch_run(run.run_id, workspace)

            poll_for_finished_run(instance, run.run_id, timeout=60)

            for log in instance.all_logs(run.run_id):
                print(log)  # pylint: disable=print-call

            assert instance.get_run_by_id(
                run.run_id).status == PipelineRunStatus.SUCCESS
Example #24
}


@resource(
    GCS_CLIENT_CONFIG,
    description="This resource provides a GCS client",
)
def gcs_resource(init_context):
    return _gcs_client_from_config(init_context.resource_config)


@resource(
    merge_dicts(
        GCS_CLIENT_CONFIG,
        {
            "gcs_bucket": Field(StringSource),
            "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"),
        },
    )
)
def gcs_file_manager(context):
    """FileManager that provides abstract access to GCS.

    Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API.
    """
    gcs_client = _gcs_client_from_config(context.resource_config)
    return GCSFileManager(
        client=gcs_client,
        gcs_bucket=context.resource_config["gcs_bucket"],
        gcs_base_key=context.resource_config["gcs_prefix"],
    )
Example #25
              credential:
                sas: my_sas_token
                # str: the SAS token for the account.
                key:
                  env: AZURE_DATA_LAKE_STORAGE_KEY
                # str: The shared access key for the account.
    """
    return _adls2_resource_from_config(context.resource_config)


@resource(
    merge_dicts(
        ADLS2_CLIENT_CONFIG,
        {
            "adls2_file_system":
            Field(StringSource, description="ADLS Gen2 file system name"),
            "adls2_prefix":
            Field(StringSource, is_required=False, default_value="dagster"),
        },
    ))
def adls2_file_manager(context):
    adls2_client = _adls2_resource_from_config(
        context.resource_config).adls2_client

    return ADLS2FileManager(
        adls2_client=adls2_client,
        file_system=context.resource_config["adls2_file_system"],
        prefix=context.resource_config["adls2_prefix"],
    )

Example #26
def test_docker_monitoring():
    docker_image = get_test_project_docker_image()

    launcher_config = {
        "env_vars": [
            "AWS_ACCESS_KEY_ID",
            "AWS_SECRET_ACCESS_KEY",
        ],
        "networks": ["container:test-postgres-db-docker"],
        "container_kwargs": {
            # "auto_remove": True,
            "volumes": ["/var/run/docker.sock:/var/run/docker.sock"],
        },
    }

    if IS_BUILDKITE:
        launcher_config["registry"] = get_buildkite_registry_config()
    else:
        find_local_test_image(docker_image)

    run_config = merge_dicts(
        load_yaml_from_path(
            os.path.join(get_test_project_environments_path(), "env_s3.yaml")),
        {
            "solids": {
                "multiply_the_word_slow": {
                    "inputs": {
                        "word": "bar"
                    },
                    "config": {
                        "factor": 2,
                        "sleep_time": 20
                    },
                }
            },
            "execution": {
                "docker": {
                    "config": {}
                }
            },
        },
    )

    with docker_postgres_instance({
            "run_monitoring": {
                "enabled": True
            },
            "run_launcher": {
                "class": "DockerRunLauncher",
                "module": "dagster_docker",
                "config": launcher_config,
            },
    }) as instance:
        recon_pipeline = get_test_project_recon_pipeline(
            "demo_pipeline_docker_slow", docker_image)
        with get_test_project_workspace_and_external_pipeline(
                instance,
                "demo_pipeline_docker_slow",
                container_image=docker_image) as (
                    workspace,
                    orig_pipeline,
                ):
            with start_daemon():
                external_pipeline = ReOriginatedExternalPipelineForTest(
                    orig_pipeline, container_image=docker_image)

                run = instance.create_run_for_pipeline(
                    pipeline_def=recon_pipeline.get_definition(),
                    run_config=run_config,
                    external_pipeline_origin=external_pipeline.get_external_origin(),
                    pipeline_code_origin=external_pipeline.get_python_origin(),
                )

                with log_run_events(instance, run.run_id):

                    instance.launch_run(run.run_id, workspace)

                    start_time = time.time()
                    while time.time() - start_time < 60:
                        run = instance.get_run_by_id(run.run_id)
                        if run.status == PipelineRunStatus.STARTED:
                            break
                        assert run.status == PipelineRunStatus.STARTING
                        time.sleep(1)

                    time.sleep(3)

                    instance.run_launcher._get_container(  # pylint:disable=protected-access
                        instance.get_run_by_id(run.run_id)).stop()

                    # daemon resumes the run
                    poll_for_finished_run(instance, run.run_id, timeout=90)
                    assert instance.get_run_by_id(
                        run.run_id).status == PipelineRunStatus.SUCCESS
Example #27
def _launch_scheduled_execution(instance, repo_location, external_repo,
                                external_schedule, tick, stream):
    pipeline_selector = PipelineSelector(
        location_name=repo_location.name,
        repository_name=external_repo.name,
        pipeline_name=external_schedule.pipeline_name,
        solid_selection=external_schedule.solid_selection,
    )

    subset_pipeline_result = repo_location.get_subset_external_pipeline_result(
        pipeline_selector)
    external_pipeline = ExternalPipeline(
        subset_pipeline_result.external_pipeline_data,
        external_repo.handle,
    )

    schedule_execution_data = repo_location.get_external_schedule_execution_data(
        instance=instance,
        repository_handle=external_repo.handle,
        schedule_name=external_schedule.name,
        schedule_execution_data_mode=ScheduleExecutionDataMode.LAUNCH_SCHEDULED_EXECUTION,
        scheduled_execution_time=None,  # No way to know this in general for this scheduler
    )

    run_config = {}
    schedule_tags = {}
    execution_plan_snapshot = None
    errors = []

    if isinstance(schedule_execution_data, ExternalScheduleExecutionErrorData):
        error = schedule_execution_data.error
        tick.update_with_status(ScheduleTickStatus.FAILURE, error=error)
        stream.send(ScheduledExecutionFailed(run_id=None, errors=[error]))
        return
    elif not schedule_execution_data.should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return
    else:
        run_config = schedule_execution_data.run_config
        schedule_tags = schedule_execution_data.tags
        try:
            external_execution_plan = repo_location.get_external_execution_plan(
                external_pipeline,
                run_config,
                external_schedule.mode,
                step_keys_to_execute=None,
            )
            execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
        except DagsterSubprocessError as e:
            errors.extend(e.subprocess_error_infos)
        except Exception as e:  # pylint: disable=broad-except
            errors.append(serializable_error_info_from_exc_info(
                sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
    )

    tick.update_with_status(ScheduleTickStatus.SUCCESS,
                            run_id=possibly_invalid_pipeline_run.run_id)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=errors))
        return

    try:
        launched_run = instance.launch_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline)
    except Exception:  # pylint: disable=broad-except
        # `error` from the earlier branch is not in scope here; capture the current exception.
        error = serializable_error_info_from_exc_info(sys.exc_info())
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=[error]))
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
Example #28
def _launch_run(instance, repo_location, external_schedule, external_pipeline,
                tick_context, run_request):
    run_config = run_request.run_config
    schedule_tags = run_request.tags

    execution_plan_snapshot = None
    errors = []
    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        errors.extend(e.subprocess_error_infos)
    except Exception as e:  # pylint: disable=broad-except
        errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

    tick_context.add_run(run_id=possibly_invalid_pipeline_run.run_id,
                         run_key=run_request.run_key)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        tick_context.stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=errors))
        return

    try:
        launched_run = instance.submit_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline)
    except Exception:  # pylint: disable=broad-except
        tick_context.stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id,
                errors=[serializable_error_info_from_exc_info(sys.exc_info())],
            ))
        return

    tick_context.stream.send(
        ScheduledExecutionSuccess(run_id=launched_run.run_id))
Example #29
def start_scheduled_execution(graphene_info, schedule_name):
    '''
    When a scheduler ticks and needs to run for a given schedule, it issues a
    START_SCHEDULED_EXECUTION mutation with just the schedule name. The mutation is
    resolved entirely by this method.
    '''

    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.str_param(schedule_name, 'schedule_name')

    tick = None
    try:
        # We first load the repository and schedule definition to create
        # and store a ScheduleTick.
        # If this fails, this error should be sent to the file based scheduler logs.
        external_repository = graphene_info.context.get_external_repository()
        repository_name = external_repository.name
        schedule_def = get_dagster_schedule_def(graphene_info, schedule_name)
        cron_schedule = "Unknown" if not schedule_def else schedule_def.cron_schedule
        tick = graphene_info.context.instance.create_schedule_tick(
            repository_name,
            ScheduleTickData(
                schedule_name=schedule_name,
                cron_schedule=cron_schedule,
                timestamp=time.time(),
                status=ScheduleTickStatus.STARTED,
            ),
        )

        # Run should_execute and halt if it returns False
        schedule_context = ScheduleExecutionContext(
            graphene_info.context.instance)
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda:
                'Error occurred during the execution of should_execute for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            should_execute = schedule_def.should_execute(schedule_context)

        if not should_execute:
            # Update tick to skipped state and return
            tick = tick.with_status(ScheduleTickStatus.SKIPPED)
            graphene_info.context.instance.update_schedule_tick(
                repository_name, tick)
            # Return skipped specific gql response
            return graphene_info.schema.type_named('ScheduledExecutionBlocked')(
                message='Schedule {schedule_name} did not run because the should_execute '
                'did not return True'.format(schedule_name=schedule_name))

        errors = []

        environment_dict = {}
        schedule_tags = {}
        try:
            with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda:
                    'Error occurred during the execution of environment_dict_fn for schedule '
                    '{schedule_name}'.format(schedule_name=schedule_def.name),
            ):
                environment_dict = schedule_def.get_environment_dict(
                    schedule_context)
        except DagsterUserCodeExecutionError as exc:
            error_data = serializable_error_info_from_exc_info(sys.exc_info())
            errors.append(error_data)

        try:
            with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda:
                    'Error occurred during the execution of tags_fn for schedule '
                    '{schedule_name}'.format(schedule_name=schedule_def.name),
            ):
                schedule_tags = schedule_def.get_tags(schedule_context)
        except DagsterUserCodeExecutionError:
            error_data = serializable_error_info_from_exc_info(sys.exc_info())
            errors.append(error_data)

        external_pipeline = get_external_pipeline_or_raise(
            graphene_info, schedule_def.selector.name,
            schedule_def.selector.solid_subset)
        pipeline_tags = external_pipeline.tags or {}
        check_tags(pipeline_tags, 'pipeline_tags')
        tags = merge_dicts(pipeline_tags, schedule_tags)

        selector = schedule_def.selector
        mode = schedule_def.mode

        execution_params = ExecutionParams(
            selector=selector,
            environment_dict=environment_dict,
            mode=mode,
            execution_metadata=ExecutionMetadata(tags=tags, run_id=None),
            step_keys=None,
        )

        run, result = _execute_schedule(graphene_info, external_pipeline,
                                        execution_params, errors)
        graphene_info.context.instance.update_schedule_tick(
            repository_name,
            tick.with_status(ScheduleTickStatus.SUCCESS, run_id=run.run_id),
        )

        return result

    except Exception as exc:  # pylint: disable=broad-except
        error_data = serializable_error_info_from_exc_info(sys.exc_info())

        if tick:
            graphene_info.context.instance.update_schedule_tick(
                repository_name,
                tick.with_status(ScheduleTickStatus.FAILURE, error=error_data),
            )

        raise exc
Example #30
    def __new__(
        cls,
        name=None,
        resource_defs=None,
        logger_defs=None,
        system_storage_defs=None,
        executor_defs=None,
        description=None,
        intermediate_storage_defs=None,
    ):
        from dagster.core.storage.system_storage import (
            default_system_storage_defs,
            default_intermediate_storage_defs,
        )

        from .system_storage import SystemStorageDefinition
        from .intermediate_storage import IntermediateStorageDefinition

        if system_storage_defs is not None and intermediate_storage_defs is None:
            warnings.warn(
                "system_storage_defs are deprecated and will be removed in 0.10.0 "
                "and should be replaced with "
                "intermediate_storage_defs for intermediates and resource_defs for files"
            )

        check.opt_dict_param(resource_defs,
                             "resource_defs",
                             key_type=str,
                             value_type=ResourceDefinition)
        if resource_defs and "asset_store" in resource_defs:
            resource_defs_with_defaults = resource_defs
        else:
            from dagster.core.storage.asset_store import mem_asset_store

            resource_defs_with_defaults = merge_dicts(
                {"asset_store": mem_asset_store}, resource_defs or {})

        return super(ModeDefinition, cls).__new__(
            cls,
            name=check_valid_name(name) if name else DEFAULT_MODE_NAME,
            resource_defs=resource_defs_with_defaults,
            loggers=(check.opt_dict_param(logger_defs,
                                          "logger_defs",
                                          key_type=str,
                                          value_type=LoggerDefinition)
                     or default_loggers()),
            system_storage_defs=check.list_param(
                system_storage_defs
                if system_storage_defs else default_system_storage_defs,
                "system_storage_defs",
                of_type=SystemStorageDefinition,
            ),
            intermediate_storage_defs=check.list_param(
                intermediate_storage_defs if intermediate_storage_defs else
                default_intermediate_storage_defs,
                "intermediate_storage_defs",
                of_type=IntermediateStorageDefinition,
            ),
            executor_defs=check.list_param(
                executor_defs if executor_defs else default_executors,
                "executor_defs",
                of_type=ExecutorDefinition,
            ),
            description=check.opt_str_param(description, "description"),
        )