Example #1
    def __new__(
        cls,
        handle: Union[StepHandle, ResolvedFromDynamicStepHandle],
        pipeline_name: str,
        step_inputs: List[StepInput],
        step_outputs: List[StepOutput],
        tags: Optional[Dict[str, str]],
        logging_tags: Optional[Dict[str, str]] = None,
    ):
        return super(ExecutionStep, cls).__new__(
            cls,
            handle=check.inst_param(
                handle, "handle", (StepHandle, ResolvedFromDynamicStepHandle)),
            pipeline_name=check.str_param(pipeline_name, "pipeline_name"),
            step_input_dict={
                si.name: si
                for si in check.list_param(
                    step_inputs, "step_inputs", of_type=StepInput)
            },
            step_output_dict={
                so.name: so
                for so in check.list_param(
                    step_outputs, "step_outputs", of_type=StepOutput)
            },
            tags=validate_tags(check.opt_dict_param(tags, "tags",
                                                    key_type=str)),
            logging_tags=merge_dicts(
                {
                    "step_key": handle.to_key(),
                    "pipeline": pipeline_name,
                    "solid": handle.solid_handle.name,
                },
                check.opt_dict_param(logging_tags, "logging_tags"),
            ),
        )
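All of these snippets route user-supplied tags through `validate_tags` before storing them. As a rough mental model, here is a minimal sketch of the contract it appears to enforce (string keys; non-string values must survive a JSON round trip, as the k8s tests below rely on). This is an illustration under those assumptions, not Dagster's actual implementation:

import json
from typing import Any, Dict, Optional

def validate_tags_sketch(tags: Optional[Dict[str, Any]]) -> Dict[str, str]:
    # Illustration only; Dagster's real validate_tags raises its own error types.
    validated = {}
    for key, value in (tags or {}).items():
        if not isinstance(key, str):
            raise ValueError(f"tag key {key!r} must be a string")
        if isinstance(value, str):
            validated[key] = value
        else:
            # Non-string values are kept as JSON strings (assumption based on
            # the dict-valued k8s tags used in the tests below).
            serialized = json.dumps(value)
            if json.loads(serialized) != value:
                raise ValueError(f"tag value for key {key!r} must be JSON-serializable")
            validated[key] = serialized
    return validated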
Example #2
def make_dagster_pipeline_from_airflow_dag(dag, tags=None):
    '''Construct a Dagster pipeline corresponding to a given Airflow DAG.

    Tasks in the resulting pipeline will execute the execute() method on the corresponding Airflow
    Operator. Dagster, any dependencies required by Airflow Operators, and the module
    containing your DAG definition must be available in the Python environment within which
    your Dagster solids execute.

    To set Airflow's `execution_date` for use with Airflow Operators' execute() methods, either
        (1) (Best for ad hoc runs) Run the pipeline with the 'default' preset, which sets
        execution_date to the time (in UTC) of pipeline invocation

        ```
        execute_pipeline(
            pipeline=make_dagster_pipeline_from_airflow_dag(dag),
            preset='default')
        ```

        (2) Add {'airflow_execution_date': utc_date_string} to the PipelineDefinition tags. This
        will override behavior from (1).

        ```
        execute_pipeline(
            make_dagster_pipeline_from_airflow_dag(
                dag,
                {'airflow_execution_date': utc_execution_date_str}
            )
        )
        ```

        (3) (Recommended) Add {'airflow_execution_date': utc_date_string} to the PipelineRun tags,
        such as in the Dagit UI. This will override behavior from (1) and (2).

    Args:
        dag (DAG): The Airflow DAG to compile into a Dagster pipeline
        tags (Dict[str, Field]): Pipeline tags. Optionally include
            `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within
            execution of Airflow Operators.

    Returns:
        pipeline_def (PipelineDefinition): The generated Dagster pipeline

    '''
    check.inst_param(dag, 'dag', DAG)
    tags = check.opt_dict_param(tags, 'tags')

    if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:
        tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = 'true'

    tags = validate_tags(tags)

    pipeline_dependencies, solid_defs = _get_pipeline_definition_args(dag)
    pipeline_def = PipelineDefinition(
        name='airflow_' + dag.dag_id,
        solid_defs=solid_defs,
        dependencies=pipeline_dependencies,
        tags=tags,
    )
    return pipeline_def
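A usage sketch for the function above: build a trivial Airflow DAG and execute the generated pipeline with the 'default' preset, as the docstring recommends for ad hoc runs (the DAG contents and the Airflow 1.x import paths are assumptions):

from datetime import datetime

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from dagster import execute_pipeline

dag = DAG(dag_id='demo_dag', start_date=datetime(2021, 1, 1), schedule_interval=None)
hello = BashOperator(task_id='hello', bash_command='echo hello', dag=dag)  # placeholder task

pipeline = make_dagster_pipeline_from_airflow_dag(dag)
# The 'default' preset sets airflow_execution_date to the invocation time (UTC).
result = execute_pipeline(pipeline, preset='default')
assert result.success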
Example #3
def test_valid_job_format_with_resources(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(
        os.path.join(test_project_environments_path(), 'env.yaml'))
    pipeline_name = 'demo_pipeline'
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    tags = validate_tags({
        K8S_RESOURCE_REQUIREMENTS_KEY: ({
            'requests': {
                'cpu': '250m',
                'memory': '64Mi'
            },
            'limits': {
                'cpu': '500m',
                'memory': '2560Mi'
            },
        })
    })
    resources = get_k8s_resource_requirements(tags)
    job_name = 'dagster-run-%s' % run.run_id
    pod_name = 'dagster-run-%s' % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=['dagster-graphql'],
        args=[
            '-p',
            'executeRunInProcess',
            '-v',
            seven.json.dumps({'runId': run.run_id}),
        ],
        job_name=job_name,
        resources=resources,
        pod_name=pod_name,
        component='runmaster',
    )

    assert (yaml.dump(
        remove_none_recursively(job.to_dict()),
        default_flow_style=False).strip() == EXPECTED_JOB_SPEC.format(
            run_id=run.run_id,
            job_image=docker_image,
            image_pull_policy=image_pull_policy(),
            dagster_version=dagster_version,
            resources='''
        resources:
          limits:
            cpu: 500m
            memory: 2560Mi
          requests:
            cpu: 250m
            memory: 64Mi''',
        ).strip())
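This test depends on `validate_tags` accepting a dict-valued tag; presumably the value is stored as a JSON string and parsed back by `get_k8s_resource_requirements`. A short sketch of that round trip, under that assumption:

import json

raw = {
    'requests': {'cpu': '250m', 'memory': '64Mi'},
    'limits': {'cpu': '500m', 'memory': '2560Mi'},
}
tags = validate_tags({K8S_RESOURCE_REQUIREMENTS_KEY: raw})
# Assumption: validation serializes the dict, so the tag value round-trips via JSON.
assert json.loads(tags[K8S_RESOURCE_REQUIREMENTS_KEY]) == raw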
Example #4
def test_valid_job_format_with_backcompat_resources(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(
        os.path.join(test_project_environments_path(), "env.yaml"))
    pipeline_name = "demo_pipeline"
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    tags = validate_tags({
        K8S_RESOURCE_REQUIREMENTS_KEY: ({
            "requests": {
                "cpu": "250m",
                "memory": "64Mi"
            },
            "limits": {
                "cpu": "500m",
                "memory": "2560Mi"
            },
        })
    })
    user_defined_k8s_config = get_user_defined_k8s_config(tags)
    job_name = "dagster-run-%s" % run.run_id
    pod_name = "dagster-run-%s" % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=["dagster-graphql"],
        args=[
            "-p",
            "executeRunInProcess",
            "-v",
            seven.json.dumps({"runId": run.run_id}),
        ],
        job_name=job_name,
        user_defined_k8s_config=user_defined_k8s_config,
        pod_name=pod_name,
        component="run_coordinator",
    )

    assert (yaml.dump(
        remove_none_recursively(job.to_dict()),
        default_flow_style=False).strip() == EXPECTED_JOB_SPEC.format(
            run_id=run.run_id,
            job_image=docker_image,
            image_pull_policy=image_pull_policy(),
            dagster_version=dagster_version,
            resources="""
        resources:
          limits:
            cpu: 500m
            memory: 2560Mi
          requests:
            cpu: 250m
            memory: 64Mi""",
        ).strip())
Example #5
def test_valid_job_format_with_user_defined_k8s_config(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(
        os.path.join(test_project_environments_path(), "env.yaml"))
    pipeline_name = "demo_pipeline"
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    tags = validate_tags({
        USER_DEFINED_K8S_CONFIG_KEY: ({
            "container_config": {
                "resources": {
                    "requests": {
                        "cpu": "250m",
                        "memory": "64Mi"
                    },
                    "limits": {
                        "cpu": "500m",
                        "memory": "2560Mi"
                    },
                }
            },
            "pod_template_spec_metadata": {
                "annotations": {
                    "cluster-autoscaler.kubernetes.io/safe-to-evict": "true"
                },
                "labels": {
                    "spotinst.io/restrict-scale-down": "true"
                },
            },
            "pod_spec_config": {
                "affinity": {
                    "nodeAffinity": {
                        "requiredDuringSchedulingIgnoredDuringExecution": {
                            "nodeSelectorTerms": [{
                                "matchExpressions": [{
                                    "key":
                                    "kubernetes.io/e2e-az-name",
                                    "operator":
                                    "In",
                                    "values": ["e2e-az1", "e2e-az2"],
                                }]
                            }]
                        }
                    }
                }
            },
        })
    })
    user_defined_k8s_config = get_user_defined_k8s_config(tags)
    job_name = "dagster-run-%s" % run.run_id
    pod_name = "dagster-run-%s" % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=["dagster"],
        args=["api", "execute_run_with_structured_logs"],
        job_name=job_name,
        user_defined_k8s_config=user_defined_k8s_config,
        pod_name=pod_name,
        component="run_coordinator",
    )

    assert (yaml.dump(remove_none_recursively(job.to_dict()),
                      default_flow_style=False).strip() ==
            EXPECTED_CONFIGURED_JOB_SPEC.format(
                run_id=run.run_id,
                job_image=docker_image,
                image_pull_policy=image_pull_policy(),
                dagster_version=dagster_version,
                labels="spotinst.io/restrict-scale-down: 'true'",
                resources="""
        resources:
          limits:
            cpu: 500m
            memory: 2560Mi
          requests:
            cpu: 250m
            memory: 64Mi""",
                annotations="""annotations:
        cluster-autoscaler.kubernetes.io/safe-to-evict: \'true\'""",
                affinity="""affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.io/e2e-az-name
                operator: In
                values:
                - e2e-az1
                - e2e-az2""",
            ).strip())
Example #6
    def submit_pipeline_execution(
        self,
        pipeline_name: str,
        repository_location_name: Optional[str] = None,
        repository_name: Optional[str] = None,
        run_config: Optional[Any] = None,
        mode: Optional[str] = None,
        preset: Optional[str] = None,
        tags: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Submits a Pipeline with attached configuration for execution.

        Args:
            pipeline_name (str): The pipeline's name
            repository_location_name (Optional[str], optional): The name of the repository location where
                the pipeline is located. If omitted, the client will try to infer the repository location
                from the available options on the Dagster deployment. Defaults to None.
            repository_name (Optional[str], optional): The name of the repository where the pipeline is located.
                If omitted, the client will try to infer the repository from the available options
                on the Dagster deployment. Defaults to None.
            run_config (Optional[Any], optional): This is the run config to execute the pipeline with.
                Note that runConfigData is any-typed in the GraphQL type system. This type is used when passing in
                an arbitrary object for run config. However, it must conform to the constraints of the config
                schema for this pipeline. If it does not, the client will throw a DagsterGraphQLClientError with a message of
                PipelineConfigValidationInvalid. Defaults to None.
            mode (Optional[str], optional): The mode to run the pipeline with. If you have not
                defined any custom modes for your pipeline, the default mode is "default". Defaults to None.
            preset (Optional[str], optional): The name of a pre-defined preset to use instead of a
                run config. Defaults to None.
            tags (Optional[Dict[str, Any]], optional): A set of tags to add to the pipeline execution.

        Raises:
            DagsterGraphQLClientError("InvalidStepError", invalid_step_key): the pipeline has an invalid step
            DagsterGraphQLClientError("InvalidOutputError", body=error_object): some solid has an invalid output within the pipeline.
                The error_object is of type dagster_graphql.InvalidOutputErrorInfo.
            DagsterGraphQLClientError("ConflictingExecutionParamsError", invalid_step_key): a preset and a run_config & mode are present
                that conflict with one another
            DagsterGraphQLClientError("PresetNotFoundError", message): if the provided preset name is not found
            DagsterGraphQLClientError("PipelineRunConflict", message): a `DagsterRunConflict` occured during execution.
                This indicates that a conflicting pipeline run already exists in run storage.
            DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key): the run_config is not in the expected format
                for the pipeline
            DagsterGraphQLClientError("PipelineNotFoundError", message): the requested pipeline does not exist
            DagsterGraphQLClientError("PythonError", message): an internal framework error occurred

        Returns:
            str: run id of the submitted pipeline run
        """
        check.opt_str_param(repository_location_name,
                            "repository_location_name")
        check.opt_str_param(repository_name, "repository_name")
        check.str_param(pipeline_name, "pipeline_name")
        check.opt_str_param(mode, "mode")
        check.opt_str_param(preset, "preset")
        check.invariant(
            (mode is not None and run_config is not None)
            or preset is not None,
            "Either a mode and run_config or a preset must be specified in order to "
            f"submit the pipeline {pipeline_name} for execution",
        )
        tags = validate_tags(tags)

        if not repository_location_name or not repository_name:
            pipeline_info_lst = self._get_repo_locations_and_names_with_pipeline(
                pipeline_name)
            if len(pipeline_info_lst) == 0:
                raise DagsterGraphQLClientError(
                    "PipelineNotFoundError",
                    f"No pipelines with the name `{pipeline_name}` exist")
            elif len(pipeline_info_lst) == 1:
                pipeline_info = pipeline_info_lst[0]
                repository_location_name = pipeline_info.repository_location_name
                repository_name = pipeline_info.repository_name
            else:
                raise DagsterGraphQLClientError(
                    "Must specify repository_location_name and repository_name"
                    f" since there are multiple pipelines with the name {pipeline_name}."
                    f"\n\tchoose one of: {pipeline_info_lst}")

        variables = {
            "executionParams": {
                "selector": {
                    "repositoryLocationName": repository_location_name,
                    "repositoryName": repository_name,
                    "pipelineName": pipeline_name,
                }
            }
        }
        if preset is not None:
            variables["executionParams"]["preset"] = preset
        if mode is not None and run_config is not None:
            variables["executionParams"] = {
                **variables["executionParams"],
                "runConfigData": run_config,
                "mode": mode,
                "executionMetadata": {
                    "tags": [{
                        "key": k,
                        "value": v
                    } for k, v in tags.items()]
                } if tags else {},
            }

        res_data: Dict[str, Any] = self._execute(
            CLIENT_SUBMIT_PIPELINE_RUN_MUTATION, variables)
        query_result = res_data["launchPipelineExecution"]
        query_result_type = query_result["__typename"]
        if query_result_type == "LaunchPipelineRunSuccess":
            return query_result["run"]["runId"]
        elif query_result_type == "InvalidStepError":
            raise DagsterGraphQLClientError(query_result_type,
                                            query_result["invalidStepKey"])
        elif query_result_type == "InvalidOutputError":
            error_info = InvalidOutputErrorInfo(
                step_key=query_result["stepKey"],
                invalid_output_name=query_result["invalidOutputName"],
            )
            raise DagsterGraphQLClientError(query_result_type, body=error_info)
        elif query_result_type == "PipelineConfigValidationInvalid":
            raise DagsterGraphQLClientError(query_result_type,
                                            query_result["errors"])
        else:
            # query_result_type is a ConflictingExecutionParamsError, a PresetNotFoundError
            # a PipelineNotFoundError, a PipelineRunConflict, or a PythonError
            raise DagsterGraphQLClientError(query_result_type,
                                            query_result["message"])
Example #7
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config_schema=None,
    required_resource_keys=None,
    output_notebook=None,
    asset_key_prefix=None,
    description=None,
    tags=None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline resources via the "file_manager" resource key, so, e.g.,
            if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a
            :py:class:`~dagster_aws.s3.S3FileHandle`.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.
        description (Optional[str]): If set, description used for solid.
        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid.
            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be
            overwritten by the user.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs,
                                      "input_defs",
                                      of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs,
                                       "output_defs",
                                       of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 "required_resource_keys",
                                                 of_type=str)
    if output_notebook is not None:
        required_resource_keys.add("file_manager")
    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]

    asset_key_prefix = check.opt_list_param(asset_key_prefix,
                                            "asset_key_prefix",
                                            of_type=str)

    default_description = f"This solid is backed by the notebook at {notebook_path}"
    description = check.opt_str_param(description,
                                      "description",
                                      default=default_description)

    user_tags = validate_tags(tags)
    if tags is not None:
        check.invariant(
            "notebook_path" not in tags,
            "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",
        )
        check.invariant(
            "kind" not in tags,
            "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster",
        )
    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(name,
                                     notebook_path,
                                     output_notebook,
                                     asset_key_prefix=asset_key_prefix),
        output_defs=output_defs +
        ([OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
         if output_notebook else []),
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description=description,
        tags={
            **user_tags,
            **default_tags
        },
    )
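A usage sketch: wrapping a hypothetical notebook as a solid with typed inputs and an injected output notebook (the path and names are placeholders):

from dagster import InputDefinition, Int, OutputDefinition

add_two_numbers = define_dagstermill_solid(
    name="add_two_numbers",
    notebook_path="notebooks/add_two_numbers.ipynb",  # hypothetical path
    input_defs=[InputDefinition("a", Int), InputDefinition("b", Int)],
    output_defs=[OutputDefinition(Int, name="result")],
    output_notebook="executed_notebook",  # requires the "file_manager" resource
    tags={"team": "analytics"},  # "notebook_path" and "kind" are reserved by Dagster
)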
Example #8
def test_valid_job_format_with_user_defined_k8s_config(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(
        os.path.join(test_project_environments_path(), 'env.yaml'))
    pipeline_name = 'demo_pipeline'
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    tags = validate_tags({
        USER_DEFINED_K8S_CONFIG_KEY: ({
            'container_config': {
                'resources': {
                    'requests': {
                        'cpu': '250m',
                        'memory': '64Mi'
                    },
                    'limits': {
                        'cpu': '500m',
                        'memory': '2560Mi'
                    },
                }
            },
            'pod_template_spec_metadata': {
                'annotations': {
                    "cluster-autoscaler.kubernetes.io/safe-to-evict": "true"
                }
            },
            'pod_spec_config': {
                'affinity': {
                    'nodeAffinity': {
                        'requiredDuringSchedulingIgnoredDuringExecution': {
                            'nodeSelectorTerms': [{
                                'matchExpressions': [{
                                    'key':
                                    'kubernetes.io/e2e-az-name',
                                    'operator':
                                    'In',
                                    'values': ['e2e-az1', 'e2e-az2'],
                                }]
                            }]
                        }
                    }
                }
            },
        })
    })
    user_defined_k8s_config = get_user_defined_k8s_config(tags)
    job_name = 'dagster-run-%s' % run.run_id
    pod_name = 'dagster-run-%s' % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=['dagster-graphql'],
        args=[
            '-p',
            'executeRunInProcess',
            '-v',
            seven.json.dumps({'runId': run.run_id}),
        ],
        job_name=job_name,
        user_defined_k8s_config=user_defined_k8s_config,
        pod_name=pod_name,
        component='run_coordinator',
    )

    assert (yaml.dump(remove_none_recursively(job.to_dict()),
                      default_flow_style=False).strip() ==
            EXPECTED_CONFIGURED_JOB_SPEC.format(
                run_id=run.run_id,
                job_image=docker_image,
                image_pull_policy=image_pull_policy(),
                dagster_version=dagster_version,
                resources='''
        resources:
          limits:
            cpu: 500m
            memory: 2560Mi
          requests:
            cpu: 250m
            memory: 64Mi''',
                annotations='''annotations:
        cluster-autoscaler.kubernetes.io/safe-to-evict: \'true\'''',
                affinity='''affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.io/e2e-az-name
                operator: In
                values:
                - e2e-az1
                - e2e-az2''',
            ).strip())
Example #9
def make_dagster_pipeline_from_airflow_dag(dag,
                                           tags=None,
                                           use_airflow_template_context=False,
                                           unique_id=None):
    '''Construct a Dagster pipeline corresponding to a given Airflow DAG.

    Tasks in the resulting pipeline will execute the execute() method on the corresponding Airflow
    Operator. Dagster, any dependencies required by Airflow Operators, and the module
    containing your DAG definition must be available in the Python environment within which
    your Dagster solids execute.

    To set Airflow's `execution_date` for use with Airflow Operators' execute() methods, either
        (1) (Best for ad hoc runs) Run the pipeline with the 'default' preset, which sets
        execution_date to the time (in UTC) of pipeline invocation

        ```
        execute_pipeline(
            pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),
            preset='default')
        ```

        (2) Add {'airflow_execution_date': utc_date_string} to the PipelineDefinition tags. This
        will override behavior from (1).

        ```
        execute_pipeline(
            make_dagster_pipeline_from_airflow_dag(
                dag=dag,
                tags={'airflow_execution_date': utc_execution_date_str}
            )
        )
        ```

        (3) (Recommended) Add {'airflow_execution_date': utc_date_string} to the PipelineRun tags,
        such as in the Dagit UI. This will override behavior from (1) and (2).

    We apply normalized_name() to the dag id and task ids when generating pipeline name and solid
    names to ensure that names conform to Dagster's naming conventions.

    Args:
        dag (DAG): The Airflow DAG to compile into a Dagster pipeline
        tags (Dict[str, Field]): Pipeline tags. Optionally include
            `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within
            execution of Airflow Operators.
        use_airflow_template_context (bool): If True, will call get_template_context() on the
            Airflow TaskInstance model which requires and modifies the DagRun table.
            (default: False)
        unique_id (int): If not None, this id will be appended to generated solid names. Used by
            framework authors to enforce unique solid names within a repo.

    Returns:
        pipeline_def (PipelineDefinition): The generated Dagster pipeline

    '''
    check.inst_param(dag, 'dag', DAG)
    tags = check.opt_dict_param(tags, 'tags')
    check.bool_param(use_airflow_template_context,
                     'use_airflow_template_context')
    unique_id = check.opt_int_param(unique_id, 'unique_id')

    if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:
        tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = 'true'

    tags = validate_tags(tags)

    pipeline_dependencies, solid_defs = _get_pipeline_definition_args(
        dag, use_airflow_template_context, unique_id)
    pipeline_def = PipelineDefinition(
        name=normalized_name(dag.dag_id, None),
        solid_defs=solid_defs,
        dependencies=pipeline_dependencies,
        tags=tags,
    )
    return pipeline_def
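A short sketch of the `unique_id` parameter described above: load the same DAG twice in one repository without solid-name collisions (the ids are arbitrary):

# Each call appends the given id to generated solid names, so both
# pipelines can coexist in a single repository.
pipeline_a = make_dagster_pipeline_from_airflow_dag(dag, unique_id=1)
pipeline_b = make_dagster_pipeline_from_airflow_dag(dag, unique_id=2)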
Example #10
def define_dagstermill_op(
    name: str,
    notebook_path: str,
    input_defs: Optional[Sequence[InputDefinition]] = None,
    output_defs: Optional[Sequence[OutputDefinition]] = None,
    config_schema: Optional[Union[Any, Dict[str, Any]]] = None,
    required_resource_keys: Optional[Set[str]] = None,
    output_notebook_name: Optional[str] = None,
    asset_key_prefix: Optional[Union[List[str], str]] = None,
    description: Optional[str] = None,
    tags: Optional[Dict[str, Any]] = None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook_name: (Optional[str]): If set, will be used as the name of an injected output
            of type of :py:class:`~dagster.BufferedIOBase` that is the file object of the executed
            notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always
            created). It allows the downstream solids to access the executed notebook via a file
            object.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.
        description (Optional[str]): If set, description used for solid.
        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid.
            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be
            overwritten by the user.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs,
                                      "input_defs",
                                      of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs,
                                       "output_defs",
                                       of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 "required_resource_keys",
                                                 of_type=str)

    extra_output_defs = []
    if output_notebook_name is not None:
        required_resource_keys.add("output_notebook_io_manager")
        extra_output_defs.append(
            OutputDefinition(name=output_notebook_name,
                             io_manager_key="output_notebook_io_manager"))

    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]

    asset_key_prefix = check.opt_list_param(asset_key_prefix,
                                            "asset_key_prefix",
                                            of_type=str)

    default_description = f"This op is backed by the notebook at {notebook_path}"
    description = check.opt_str_param(description,
                                      "description",
                                      default=default_description)

    user_tags = validate_tags(tags)
    if tags is not None:
        check.invariant(
            "notebook_path" not in tags,
            "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",
        )
        check.invariant(
            "kind" not in tags,
            "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster",
        )
    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}

    return OpDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_compute(
            "define_dagstermill_op",
            name,
            notebook_path,
            output_notebook_name,
            asset_key_prefix=asset_key_prefix,
        ),
        output_defs=output_defs + extra_output_defs,
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description=description,
        tags={
            **user_tags,
            **default_tags
        },
    )
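A usage sketch for the op variant: capture the executed notebook so a downstream op can read it as a file object (the name and path are placeholders; job wiring is omitted):

clean_data = define_dagstermill_op(
    name="clean_data",
    notebook_path="notebooks/clean_data.ipynb",  # hypothetical path
    output_notebook_name="executed_notebook",  # adds the "output_notebook_io_manager" requirement
)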
Example #11
    def _core_submit_execution(
        self,
        pipeline_name: str,
        repository_location_name: Optional[str] = None,
        repository_name: Optional[str] = None,
        run_config: Optional[Any] = None,
        mode: Optional[str] = None,
        preset: Optional[str] = None,
        tags: Optional[Dict[str, Any]] = None,
        solid_selection: Optional[List[str]] = None,
        is_using_job_op_graph_apis: Optional[bool] = False,
    ):
        check.opt_str_param(repository_location_name,
                            "repository_location_name")
        check.opt_str_param(repository_name, "repository_name")
        check.str_param(pipeline_name, "pipeline_name")
        check.opt_str_param(mode, "mode")
        check.opt_str_param(preset, "preset")
        run_config = check.opt_dict_param(run_config, "run_config")

        # The following invariant will never fail when a job is executed
        check.invariant(
            (mode is not None and run_config is not None)
            or preset is not None,
            "Either a mode and run_config or a preset must be specified in order to "
            f"submit the pipeline {pipeline_name} for execution",
        )
        tags = validate_tags(tags)

        pipeline_or_job = "Job" if is_using_job_op_graph_apis else "Pipeline"

        if not repository_location_name or not repository_name:
            pipeline_info_lst = self._get_repo_locations_and_names_with_pipeline(
                pipeline_name)
            if len(pipeline_info_lst) == 0:
                raise DagsterGraphQLClientError(
                    f"{pipeline_or_job}NotFoundError",
                    f"No {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name `{pipeline_name}` exist",
                )
            elif len(pipeline_info_lst) == 1:
                pipeline_info = pipeline_info_lst[0]
                repository_location_name = pipeline_info.repository_location_name
                repository_name = pipeline_info.repository_name
            else:
                raise DagsterGraphQLClientError(
                    "Must specify repository_location_name and repository_name"
                    f" since there are multiple {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name {pipeline_name}."
                    f"\n\tchoose one of: {pipeline_info_lst}")

        variables: Dict[str, Any] = {
            "executionParams": {
                "selector": {
                    "repositoryLocationName": repository_location_name,
                    "repositoryName": repository_name,
                    "pipelineName": pipeline_name,
                    "solidSelection": solid_selection,
                }
            }
        }
        if preset is not None:
            variables["executionParams"]["preset"] = preset
        if mode is not None and run_config is not None:
            variables["executionParams"] = {
                **variables["executionParams"],
                "runConfigData": run_config,
                "mode": mode,
                "executionMetadata": {
                    "tags": [{
                        "key": k,
                        "value": v
                    } for k, v in tags.items()]
                } if tags else {},
            }

        res_data: Dict[str, Any] = self._execute(
            CLIENT_SUBMIT_PIPELINE_RUN_MUTATION, variables)
        query_result = res_data["launchPipelineExecution"]
        query_result_type = query_result["__typename"]
        if (query_result_type == "LaunchRunSuccess"
                or query_result_type == "LaunchPipelineRunSuccess"):
            return query_result["run"]["runId"]
        elif query_result_type == "InvalidStepError":
            raise DagsterGraphQLClientError(query_result_type,
                                            query_result["invalidStepKey"])
        elif query_result_type == "InvalidOutputError":
            error_info = InvalidOutputErrorInfo(
                step_key=query_result["stepKey"],
                invalid_output_name=query_result["invalidOutputName"],
            )
            raise DagsterGraphQLClientError(query_result_type, body=error_info)
        elif (query_result_type == "RunConfigValidationInvalid"
              or query_result_type == "PipelineConfigValidationInvalid"):
            raise DagsterGraphQLClientError(query_result_type,
                                            query_result["errors"])
        else:
            # query_result_type is a ConflictingExecutionParamsError, a PresetNotFoundError
            # a PipelineNotFoundError, a RunConflict, or a PythonError
            raise DagsterGraphQLClientError(query_result_type,
                                            query_result["message"])