Example #1
0
 def __new__(cls, reg_set, frozen_set):
     """Construct the record after checking that each field has the expected set type."""
     set_param(reg_set, "reg_set")  # any set-like value is accepted here
     inst_param(frozen_set, "frozen_set", frozenset)  # must specifically be a frozenset
     parent_new = super(HasSets, cls).__new__
     return parent_new(cls, reg_set, frozen_set)
Example #2
0
def _get_pipeline_subset_def(pipeline_def, solids_to_execute):
    """
    Build a pipeline which is a subset of another pipeline.
    Only includes the solids which are in solids_to_execute.

    Args:
        pipeline_def (PipelineDefinition): the parent pipeline.
        solids_to_execute (Set[str]): names of the solids to retain.

    Returns:
        PipelineSubsetDefinition: the subsetted pipeline.

    Raises:
        DagsterInvalidSubsetError: if a requested solid name does not exist in
            pipeline_def, or if the resulting subset is not a valid pipeline.
    """

    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
    check.set_param(solids_to_execute, "solids_to_execute", of_type=str)

    # Fail fast if any requested name is not a solid of the parent pipeline.
    for solid_name in solids_to_execute:
        if not pipeline_def.has_solid_named(solid_name):
            raise DagsterInvalidSubsetError(
                "Pipeline {pipeline_name} has no solid named {name}.".format(
                    pipeline_name=pipeline_def.name, name=solid_name), )

    solids = list(map(pipeline_def.solid_named, solids_to_execute))
    # Each retained solid starts with an empty dependency dict; filled below.
    deps = {_dep_key_of(solid): {} for solid in solids}

    # Rebuild the dependency graph, keeping only edges whose upstream solid is
    # also part of the subset.
    for solid in solids:
        for input_handle in solid.input_handles():
            if pipeline_def.dependency_structure.has_singular_dep(
                    input_handle):
                output_handle = pipeline_def.dependency_structure.get_singular_dep(
                    input_handle)
                # Single upstream output: keep the edge only if its solid survives.
                if output_handle.solid.name in solids_to_execute:
                    deps[_dep_key_of(solid)][
                        input_handle.input_def.name] = DependencyDefinition(
                            solid=output_handle.solid.name,
                            output=output_handle.output_def.name)
            elif pipeline_def.dependency_structure.has_multi_deps(
                    input_handle):
                output_handles = pipeline_def.dependency_structure.get_multi_deps(
                    input_handle)
                # Fan-in input: filter the upstream list down to surviving solids.
                deps[_dep_key_of(solid)][
                    input_handle.input_def.name] = MultiDependencyDefinition([
                        DependencyDefinition(
                            solid=output_handle.solid.name,
                            output=output_handle.output_def.name)
                        for output_handle in output_handles
                        if output_handle.solid.name in solids_to_execute
                    ])

    try:
        sub_pipeline_def = PipelineSubsetDefinition(
            name=pipeline_def.
            name,  # should we change the name for subsetted pipeline?
            solid_defs=list({solid.definition
                             for solid in solids}),
            mode_defs=pipeline_def.mode_definitions,
            dependencies=deps,
            _parent_pipeline_def=pipeline_def,
            tags=pipeline_def.tags,
            hook_defs=pipeline_def.hook_defs,
        )

        return sub_pipeline_def
    except DagsterInvalidDefinitionError as exc:
        # This handles the case when you construct a subset such that an unsatisfied
        # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,
        # we re-raise a DagsterInvalidSubsetError.
        raise DagsterInvalidSubsetError(
            f"The attempted subset {str_format_set(solids_to_execute)} for pipeline "
            f"{pipeline_def.name} results in an invalid pipeline") from exc
Example #3
0
def _check_hook_resources(hook_defs, mode_resources, mode_name):
    """Raise DagsterInvalidDefinitionError if any hook in hook_defs requires a
    resource key that mode_resources does not provide."""
    for hook_def in hook_defs:
        for required_resource in hook_def.required_resource_keys:
            if required_resource not in mode_resources:
                raise DagsterInvalidDefinitionError((
                    'Resource "{resource}" is required by hook "{hook_name}", but is not '
                    'provided by mode "{mode_name}".').format(
                        resource=required_resource,
                        hook_name=hook_def.name,
                        mode_name=mode_name,
                    ))


def _validate_resource_dependencies(mode_definitions, node_defs,
                                    dagster_type_dict, solid_dict,
                                    pipeline_hook_defs):
    """This validation ensures that each pipeline context provides the resources that are required
    by each solid.

    For every mode, checks node defs, dagster types, intermediate storage defs,
    solid-level hooks, and pipeline-level hooks against the mode's
    resource_defs.

    Raises:
        DagsterInvalidDefinitionError: if any required resource key is not
            provided by a mode.
    """
    check.list_param(mode_definitions,
                     "mode_definitions",
                     of_type=ModeDefinition)
    check.list_param(node_defs, "node_defs", of_type=NodeDefinition)
    check.dict_param(dagster_type_dict, "dagster_type_dict")
    check.dict_param(solid_dict, "solid_dict")
    check.set_param(pipeline_hook_defs,
                    "pipeline_hook_defs",
                    of_type=HookDefinition)

    for mode_def in mode_definitions:
        mode_resources = set(mode_def.resource_defs.keys())
        for node_def in node_defs:
            for required_resource in node_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError((
                        'Resource "{resource}" is required by solid def {node_def_name}, but is not '
                        'provided by mode "{mode_name}".').format(
                            resource=required_resource,
                            node_def_name=node_def.name,
                            mode_name=mode_def.name,
                        ))

        _validate_type_resource_deps_for_mode(mode_def, mode_resources,
                                              dagster_type_dict)

        for intermediate_storage in mode_def.intermediate_storage_defs or []:
            for required_resource in intermediate_storage.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError((
                        "Resource '{resource}' is required by intermediate storage "
                        "'{storage_name}', but is not provided by mode '{mode_name}'."
                    ).format(
                        resource=required_resource,
                        storage_name=intermediate_storage.name,
                        mode_name=mode_def.name,
                    ))

        # Hook requirements are validated identically whether the hook is
        # attached to an individual solid or to the whole pipeline, so the
        # duplicated loops are factored into _check_hook_resources.
        for solid in solid_dict.values():
            _check_hook_resources(solid.hook_defs, mode_resources,
                                  mode_def.name)
        _check_hook_resources(pipeline_hook_defs, mode_resources,
                              mode_def.name)
Example #4
0
    def subset_for_execution_from_existing_pipeline(self, solids_to_execute):
        """Subset this pipeline for an already-resolved set of solid names.

        The frozenset of names comes from an existing pipeline run, so no
        selection parsing is needed -- only a type check on the set itself.
        """
        check.set_param(solids_to_execute, "solids_to_execute", of_type=str)
        return self._subset_for_execution(solids_to_execute)
Example #5
0
def resource_initialization_event_generator(
    execution_plan, environment_config, pipeline_run, log_manager, resource_keys_to_init
):
    """Initialize the resources named in resource_keys_to_init for a run.

    Yields DagsterEvents marking resource init start/success/failure, then a
    ScopedResourcesBuilder wrapping the initialized resource instances.
    Teardown events are yielded from the ``finally`` block, in reverse
    initialization order.
    """
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.inst_param(environment_config, 'environment_config', EnvironmentConfig)
    check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    check.inst_param(log_manager, 'log_manager', DagsterLogManager)
    check.set_param(resource_keys_to_init, 'resource_keys_to_init', of_type=str)

    # For single-step plans, tag resource log output with that step's logging tags.
    if execution_plan.step_key_for_single_step_plans():
        step = execution_plan.get_step_by_key(execution_plan.step_key_for_single_step_plans())
        resource_log_manager = DagsterLogManager(
            pipeline_run.run_id,
            merge_dicts(log_manager.logging_tags, step.logging_tags),
            log_manager.loggers,
        )
    else:
        resource_log_manager = log_manager

    resource_instances = {}
    pipeline_def = execution_plan.pipeline_def
    mode_definition = pipeline_def.get_mode_definition(pipeline_run.mode)
    # Used as a LIFO stack (append/pop) so teardown runs in reverse init order.
    resource_managers = deque()
    generator_closed = False
    resource_init_times = {}

    try:
        if resource_keys_to_init:
            yield DagsterEvent.resource_init_start(
                execution_plan, resource_log_manager, resource_keys_to_init,
            )

        # Sorted iteration makes initialization order deterministic.
        for resource_name, resource_def in sorted(mode_definition.resource_defs.items()):
            # Idiom fix: `x not in y` rather than `not x in y`.
            if resource_name not in resource_keys_to_init:
                continue
            resource_context = InitResourceContext(
                pipeline_def=pipeline_def,
                resource_def=resource_def,
                resource_config=environment_config.resources.get(resource_name, {}).get('config'),
                run_id=pipeline_run.run_id,
                log_manager=resource_log_manager,
            )
            manager = single_resource_generation_manager(
                resource_context, resource_name, resource_def
            )
            for event in manager.generate_setup_events():
                if event:
                    yield event
            initialized_resource = check.inst(manager.get_object(), InitializedResource)
            resource_instances[resource_name] = initialized_resource.resource
            resource_init_times[resource_name] = initialized_resource.duration
            resource_managers.append(manager)

        if resource_keys_to_init:
            yield DagsterEvent.resource_init_success(
                execution_plan, resource_log_manager, resource_instances, resource_init_times
            )
        yield ScopedResourcesBuilder(resource_instances)
    except GeneratorExit:
        # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed
        # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).
        generator_closed = True
        raise
    except DagsterUserCodeExecutionError as dagster_user_error:
        yield DagsterEvent.resource_init_failure(
            execution_plan,
            resource_log_manager,
            resource_keys_to_init,
            serializable_error_info_from_exc_info(dagster_user_error.original_exc_info),
        )
        raise dagster_user_error
    finally:
        # Skip teardown when the generator was GC-closed: yielding from a
        # closed generator would raise.
        if not generator_closed:
            error = None
            while len(resource_managers) > 0:
                manager = resource_managers.pop()
                try:
                    for event in manager.generate_teardown_events():
                        yield event
                except DagsterUserCodeExecutionError as dagster_user_error:
                    # NOTE(review): only the most recent teardown error is
                    # surfaced; earlier failures are overwritten.
                    error = dagster_user_error
            if error:
                yield DagsterEvent.resource_teardown_failure(
                    execution_plan,
                    resource_log_manager,
                    resource_keys_to_init,
                    serializable_error_info_from_exc_info(error.original_exc_info),
                )
Example #6
0
def create_databricks_job_solid(
        name="databricks_job",
        num_inputs=1,
        description=None,
        required_resource_keys=frozenset(["databricks_client"]),
):
    """
    Creates a solid that launches a databricks job.

    As config, the solid accepts a blob of the form described in Databricks' job API:
    https://docs.databricks.com/dev-tools/api/latest/jobs.html.

    Returns:
        SolidDefinition: A solid definition.
    """
    check.str_param(name, "name")
    check.opt_str_param(description, "description")
    check.int_param(num_inputs, "num_inputs")
    check.set_param(required_resource_keys, "required_resource_keys", of_type=str)

    # One Nothing-typed input per requested dependency slot.
    input_defs = [
        InputDefinition("input_" + str(i), Nothing) for i in range(num_inputs)
    ]

    job_config_schema = {
        "job": Field(
            Permissive(),
            description=(
                "Databricks job run configuration, in the form described in "
                "Databricks' job API: https://docs.databricks.com/dev-tools/api/latest/jobs.html"
            ),
        ),
        "poll_interval_sec": Field(
            float,
            description="Check whether the job is done at this interval.",
            default_value=10,
        ),
        "max_wait_time_sec": Field(
            float,
            description="If the job is not complete after this length of time, raise an error.",
            default_value=(24 * 60 * 60),
        ),
    }

    @solid(
        name=name,
        description=description,
        config_schema=job_config_schema,
        input_defs=input_defs,
        output_defs=[OutputDefinition(Nothing)],
        required_resource_keys=required_resource_keys,
        tags={"kind": "databricks"},
    )
    def databricks_solid(context):
        # Submit the configured job, then block until it finishes (or times out).
        client = context.resources.databricks_client
        run_id = client.submit_run(**context.solid_config["job"])

        context.log.info(
            "Launched databricks job with run id {run_id}. UI: {url}. Waiting to run to completion...".format(
                run_id=run_id, url=create_ui_url(client, context.solid_config)
            )
        )
        wait_for_run_to_complete(
            client,
            context.log,
            run_id,
            context.solid_config["poll_interval_sec"],
            context.solid_config["max_wait_time_sec"],
        )

    return databricks_solid
Example #7
0
def test_set_param():
    """check.set_param accepts sets/frozensets, dedupes, and enforces of_type."""
    # Empty sets (including frozensets) pass and compare equal to set().
    assert check.set_param(set(), 'set_param') == set()
    assert check.set_param(frozenset(), 'set_param') == set()

    # Non-set values are rejected outright.
    for bad_value in (None, '3u4'):
        with pytest.raises(ParameterCheckError):
            check.set_param(bad_value, 'set_param')

    singleton = {1}
    assert check.set_param(singleton, 'set_param') == singleton

    duplicated = {1, 1, 2}
    deduped = {1, 2}
    assert check.set_param(duplicated, 'set_param') == deduped
    assert check.set_param(duplicated, 'set_param', of_type=int) == deduped

    # Passing a class object as a member triggers a targeted error message.
    with pytest.raises(CheckError, match='Did you pass a class'):
        check.set_param({str}, 'set_param', of_type=int)

    # Members whose type mismatches of_type are rejected.
    with pytest.raises(CheckError, match='Member of set mismatches type'):
        check.set_param({'foo'}, 'set_param', of_type=int)
Example #8
0
 def with_hooks(self, hook_defs):
     """Return a new CallableSolidNode with hook_defs merged into the existing hooks."""
     checked_hooks = check.set_param(hook_defs,
                                     "hook_defs",
                                     of_type=HookDefinition)
     combined_hooks = checked_hooks.union(self.hook_defs)
     return CallableSolidNode(self.solid_def, self.given_alias, self.tags,
                              combined_hooks)
Example #9
0
def test_set_param():
    """Exercise check.set_param: acceptance, rejection, dedup, and of_type."""
    assert check.set_param(set(), "set_param") == set()
    assert check.set_param(frozenset(), "set_param") == set()

    with pytest.raises(ParameterCheckError):
        check.set_param(None, "set_param")
    with pytest.raises(ParameterCheckError):
        check.set_param("3u4", "set_param")  # a str is iterable but not a set

    one_member = {1}
    assert check.set_param(one_member, "set_param") == one_member

    expected = {1, 2}
    literal_with_dupe = {1, 1, 2}
    assert check.set_param(literal_with_dupe, "set_param") == expected
    assert check.set_param(literal_with_dupe, "set_param", of_type=int) == expected

    with pytest.raises(CheckError, match="Did you pass a class"):
        check.set_param({str}, "set_param", of_type=int)
    with pytest.raises(CheckError, match="Member of set mismatches type"):
        check.set_param({"foo"}, "set_param", of_type=int)
Example #10
0
def _validate_hook_resources(hook_defs, mode_resources, mode_name):
    """Raise DagsterInvalidDefinitionError if any hook requires a resource key
    that the mode does not provide."""
    for hook_def in hook_defs:
        for required_resource in hook_def.required_resource_keys:
            if required_resource not in mode_resources:
                raise DagsterInvalidDefinitionError(
                    (
                        'Resource "{resource}" is required by hook "{hook_name}", but is not '
                        'provided by mode "{mode_name}".'
                    ).format(
                        resource=required_resource,
                        hook_name=hook_def.name,
                        mode_name=mode_name,
                    )
                )


def _validate_resource_dependencies(mode_definitions, solid_defs, solid_dict, pipeline_hook_defs):
    """This validation ensures that each pipeline context provides the resources that are required
    by each solid.

    For every mode, checks solid defs, system storage defs, solid-level hooks,
    and pipeline-level hooks against the mode's resource_defs.

    Raises:
        DagsterInvalidDefinitionError: if any required resource key is not
            provided by a mode.
    """
    check.list_param(mode_definitions, "mode_definitions", of_type=ModeDefinition)
    check.list_param(solid_defs, "solid_defs", of_type=ISolidDefinition)
    check.set_param(pipeline_hook_defs, "pipeline_hook_defs", of_type=HookDefinition)

    for mode_def in mode_definitions:
        mode_resources = set(mode_def.resource_defs.keys())
        for solid_def in solid_defs:
            for required_resource in solid_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            'Resource "{resource}" is required by solid def {solid_def_name}, but is not '
                            'provided by mode "{mode_name}".'
                        ).format(
                            resource=required_resource,
                            solid_def_name=solid_def.name,
                            mode_name=mode_def.name,
                        )
                    )
        for system_storage_def in mode_def.system_storage_defs:
            for required_resource in system_storage_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            "Resource '{resource}' is required by system storage "
                            "'{storage_name}', but is not provided by mode '{mode_name}'."
                        ).format(
                            resource=required_resource,
                            storage_name=system_storage_def.name,
                            mode_name=mode_def.name,
                        )
                    )
        # Hook requirements are validated identically for solid-level and
        # pipeline-level hooks, so the duplicated loops are factored into
        # _validate_hook_resources.
        for solid in solid_dict.values():
            _validate_hook_resources(solid.hook_defs, mode_resources, mode_def.name)
        _validate_hook_resources(pipeline_hook_defs, mode_resources, mode_def.name)
Example #11
0
    def with_hooks(self, hook_defs):
        """Attach the given hooks to this solid invocation, returning a new node."""
        from .composition import CallableSolidNode

        checked_hooks = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)
        return CallableSolidNode(self, hook_defs=frozenset(checked_hooks))
Example #12
0
 def __new__(cls, reg_set, frozen_set):
     """Type-check both set fields, then build the underlying tuple."""
     # reg_set may be any set; frozen_set must specifically be a frozenset.
     inst_param(frozen_set, 'frozen_set', frozenset)
     set_param(reg_set, 'reg_set')
     return super(HasSets, cls).__new__(cls, reg_set, frozen_set)