Example #1
def recon_repo_for_cli_args(kwargs: Dict[str, str]):
    """Builds a ReconstructableRepository for CLI arguments, which can be any of the combinations
    for repo loading above.
    """
    check.dict_param(kwargs, "kwargs")
    _cli_load_invariant(kwargs.get("pipeline_name") is None)

    if kwargs.get("workspace"):
        check.not_implemented(
            "Workspace not supported yet in this cli command")

    elif kwargs.get("module_name") and kwargs.get("fn_name"):
        _cli_load_invariant(kwargs.get("repository_yaml") is None)
        _cli_load_invariant(kwargs.get("python_file") is None)
        return ReconstructableRepository.for_module(
            kwargs["module_name"],
            kwargs["fn_name"],
            get_working_directory_from_kwargs(kwargs),
        )

    elif kwargs.get("python_file") and kwargs.get("fn_name"):
        _cli_load_invariant(kwargs.get("repository_yaml") is None)
        _cli_load_invariant(kwargs.get("module_name") is None)
        return ReconstructableRepository.for_file(
            os.path.abspath(cast(str, kwargs["python_file"])),
            kwargs["fn_name"],
            get_working_directory_from_kwargs(kwargs),
        )
    else:
        _cli_load_invariant(False)
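As a rough sketch of the two loading paths this helper accepts (the kwarg values below are illustrative, not taken from the source), the same call resolves either a module-based or a file-based repository target:

# Hypothetical kwargs for illustration only -- the keys mirror the ones checked above.
module_target = {"module_name": "my_pkg.repo", "fn_name": "define_repo"}
file_target = {"python_file": "repo.py", "fn_name": "define_repo"}

recon_repo_for_cli_args(module_target)  # -> ReconstructableRepository.for_module(...)
recon_repo_for_cli_args(file_target)    # -> ReconstructableRepository.for_file(...)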
Example #2
def test_my_custom_operator(
    dagster_airflow_custom_operator_pipeline,
    caplog,
):  # pylint: disable=redefined-outer-name
    caplog.set_level(logging.INFO, logger="CustomOperatorLogger")
    pipeline_name = "demo_pipeline_s3"
    operator = CustomOperator

    environments_path = get_test_project_environments_path()

    results = dagster_airflow_custom_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo", pipeline_name),
        operator=operator,
        environment_yaml=[
            os.path.join(environments_path, "env.yaml"),
            os.path.join(environments_path, "env_s3.yaml"),
        ],
    )
    validate_pipeline_execution(results)

    log_lines = 0
    for record in caplog.records:
        if record.name == "CustomOperatorLogger":
            log_lines += 1
            assert record.message == "CustomOperator is called"

    assert log_lines == 2
Example #3
def test_error_dag_containerized(dagster_docker_image):  # pylint: disable=redefined-outer-name
    pipeline_name = "demo_error_pipeline_s3"
    recon_repo = ReconstructableRepository.for_module(
        "dagster_test.test_project.test_pipelines.repo",
        "define_demo_execution_repo")
    environments_path = get_test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, "env_s3.yaml"),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)

    run_id = make_new_run_id()
    execution_date = timezone.utcnow()

    with postgres_instance() as instance:

        dag, tasks = make_airflow_dag_containerized_for_recon_repo(
            recon_repo,
            pipeline_name,
            dagster_docker_image,
            run_config,
            instance=instance,
            op_kwargs={"network_mode": "container:test-postgres-db-airflow"},
        )

        with pytest.raises(AirflowException) as exc_info:
            execute_tasks_in_dag(dag, tasks, run_id, execution_date)

        assert "Exception: Unusual error" in str(exc_info.value)
Example #4
def test_airflow_execution_date_tags_job():
    job_name = "demo_airflow_execution_date_job"
    recon_repo = ReconstructableRepository.for_module(
        "dagster_test.test_project.test_pipelines.repo", job_name
    )
    environments_path = get_test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, "env_filesystem.yaml"),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)
    execution_date = timezone.utcnow()

    dag, tasks = make_airflow_dag_for_recon_repo(recon_repo, job_name, run_config)

    results = execute_tasks_in_dag(
        dag, tasks, run_id=make_new_run_id(), execution_date=execution_date
    )

    materialized_airflow_execution_date = None
    for result in results.values():
        for event in result:
            if event.event_type_value == "ASSET_MATERIALIZATION":
                materialization = event.event_specific_data.materialization
                materialization_entry = materialization.metadata_entries[0]
                materialized_airflow_execution_date = materialization_entry.entry_data.text

    assert execution_date.isoformat() == materialized_airflow_execution_date
Example #5
def recon_repository_from_origin(origin):
    check.inst_param(origin, "origin", RepositoryPythonOrigin)
    return ReconstructableRepository(
        origin.code_pointer,
        origin.container_image,
        origin.executable_path,
        origin.entry_point,
        origin.container_context,
    )
Example #6
def test_skip_operator(
    dagster_airflow_python_operator_pipeline,
):  # pylint: disable=redefined-outer-name
    pipeline_name = "optional_outputs"
    environments_path = get_test_project_environments_path()
    results = dagster_airflow_python_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo", pipeline_name
        ),
        environment_yaml=[os.path.join(environments_path, "env_filesystem.yaml")],
    )
    validate_skip_pipeline_execution(results)
Example #7
    def __init__(self, loadable_target_origin, entry_point):
        self._loadable_target_origin = loadable_target_origin

        self._code_pointers_by_repo_name = {}
        self._recon_repos_by_name = {}
        self._loadable_repository_symbols = []

        if not loadable_target_origin:
            return

        loadable_targets = get_loadable_targets(
            loadable_target_origin.python_file,
            loadable_target_origin.module_name,
            loadable_target_origin.package_name,
            loadable_target_origin.working_directory,
            loadable_target_origin.attribute,
        )
        for loadable_target in loadable_targets:
            pointer = _get_code_pointer(loadable_target_origin,
                                        loadable_target)
            recon_repo = ReconstructableRepository(
                pointer,
                _get_current_image(),
                sys.executable,
                entry_point=entry_point,
            )
            repo_def = recon_repo.get_definition()
            # force load of all lazy constructed jobs/pipelines
            repo_def.get_all_pipelines()

            self._code_pointers_by_repo_name[repo_def.name] = pointer
            self._recon_repos_by_name[repo_def.name] = recon_repo
            self._loadable_repository_symbols.append(
                LoadableRepositorySymbol(
                    attribute=loadable_target.attribute,
                    repository_name=repo_def.name,
                ))
Example #8
def test_fs_storage_no_explicit_base_dir(
    dagster_airflow_python_operator_pipeline,
):  # pylint: disable=redefined-outer-name
    pipeline_name = "demo_pipeline"
    environments_path = get_test_project_environments_path()
    results = dagster_airflow_python_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo", pipeline_name
        ),
        environment_yaml=[
            os.path.join(environments_path, "env.yaml"),
        ],
    )
    validate_pipeline_execution(results)
Example #9
def test_skip_operator(dagster_airflow_docker_operator_pipeline,
                       dagster_docker_image):  # pylint: disable=redefined-outer-name
    pipeline_name = "optional_outputs"
    environments_path = get_test_project_environments_path()
    results = dagster_airflow_docker_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo",
            "define_demo_execution_repo",
        ),
        environment_yaml=[
            os.path.join(environments_path, "env_filesystem.yaml")
        ],
        op_kwargs={"host_tmp_dir": "/tmp"},
        image=dagster_docker_image,
    )
    validate_skip_pipeline_execution(results)
Example #10
def test_s3_storage(dagster_airflow_docker_operator_pipeline,
                    dagster_docker_image):  # pylint: disable=redefined-outer-name
    pipeline_name = "demo_pipeline_s3"
    environments_path = get_test_project_environments_path()
    results = dagster_airflow_docker_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo",
            "define_demo_execution_repo",
        ),
        environment_yaml=[
            os.path.join(environments_path, "env.yaml"),
            os.path.join(environments_path, "env_s3.yaml"),
        ],
        image=dagster_docker_image,
    )
    validate_pipeline_execution(results)
Example #11
def test_error_dag_python_job():
    job_name = "demo_error_job"
    recon_repo = ReconstructableRepository.for_module(
        "dagster_test.test_project.test_pipelines.repo", job_name
    )
    environments_path = get_test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, "env_filesystem.yaml"),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)
    execution_date = timezone.utcnow()

    dag, tasks = make_airflow_dag_for_recon_repo(recon_repo, job_name, run_config)

    with pytest.raises(AirflowException) as exc_info:
        execute_tasks_in_dag(dag, tasks, run_id=make_new_run_id(), execution_date=execution_date)

    assert "Exception: Unusual error" in str(exc_info.value)
Example #12
def test_airflow_execution_date_tags_containerized(dagster_docker_image, ):  # pylint: disable=redefined-outer-name, unused-argument
    pipeline_name = "demo_airflow_execution_date_pipeline_s3"
    recon_repo = ReconstructableRepository.for_module(
        "dagster_test.test_project.test_pipelines.repo",
        "define_demo_execution_repo")
    environments_path = get_test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, "env_s3.yaml"),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)

    execution_date = timezone.utcnow()

    with postgres_instance() as instance:
        dag, tasks = make_airflow_dag_containerized_for_recon_repo(
            recon_repo,
            pipeline_name,
            dagster_docker_image,
            run_config,
            instance=instance,
            op_kwargs={"network_mode": "container:test-postgres-db-airflow"},
        )

        results = execute_tasks_in_dag(dag,
                                       tasks,
                                       run_id=make_new_run_id(),
                                       execution_date=execution_date)

        materialized_airflow_execution_date = None
        for result in results.values():
            for event in result:
                if event.event_type_value == "ASSET_MATERIALIZATION":
                    materialization = event.event_specific_data.materialization
                    materialization_entry = materialization.metadata_entries[0]
                    materialized_airflow_execution_date = materialization_entry.entry_data.text

        assert execution_date.isoformat(
        ) == materialized_airflow_execution_date
Example #13
    def test_run_finished(self, graphql_context):
        instance = graphql_context.instance

        pipeline = ReconstructableRepository.for_file(
            file_relative_path(__file__, "setup.py"),
            "test_repo",
        ).get_reconstructable_pipeline("noop_pipeline")

        pipeline_result = execute_pipeline(pipeline, instance=instance)
        assert pipeline_result.success
        assert pipeline_result.run_id

        time.sleep(0.05)  # guarantee execution finish

        result = execute_dagster_graphql(
            graphql_context,
            RUN_CANCELLATION_QUERY,
            variables={"runId": pipeline_result.run_id})

        assert result.data["terminatePipelineExecution"][
            "__typename"] == "TerminateRunFailure"
        assert ("could not be terminated due to having status SUCCESS."
                in result.data["terminatePipelineExecution"]["message"])

        # Still fails even if you change the terminate policy to fail immediately
        result = execute_dagster_graphql(
            graphql_context,
            RUN_CANCELLATION_QUERY,
            variables={
                "runId": pipeline_result.run_id,
                "terminatePolicy": "MARK_AS_CANCELED_IMMEDIATELY",
            },
        )

        assert result.data["terminatePipelineExecution"][
            "__typename"] == "TerminateRunFailure"
        assert ("could not be terminated due to having status SUCCESS."
                in result.data["terminatePipelineExecution"]["message"])
Example #14
def step_context_to_step_run_ref(
    step_context: StepExecutionContext,
    prior_attempts_count: int,
    package_dir: Optional[str] = None,
) -> StepRunRef:
    """
    Args:
        step_context (StepExecutionContext): The step context.
        prior_attempts_count (int): The number of times this step has been tried before in the same
            pipeline run.
        package_dir (Optional[str]): If set, the reconstruction file code pointer will be converted
            to a module pointer relative to the package root. This enables executing steps in
            remote setups where the package containing the pipeline resides at a different location
            on the filesystem in the remote environment than in the environment executing the plan
            process.

    Returns (StepRunRef):
        A reference to the step.
    """

    check.inst_param(step_context, "step_context", StepExecutionContext)
    check.int_param(prior_attempts_count, "prior_attempts_count")

    retry_mode = step_context.retry_mode

    recon_pipeline = step_context.pipeline
    if package_dir:
        if isinstance(recon_pipeline, ReconstructablePipeline) and isinstance(
            recon_pipeline.repository.pointer, FileCodePointer
        ):
            recon_pipeline = ReconstructablePipeline(
                repository=ReconstructableRepository(
                    pointer=ModuleCodePointer(
                        _module_in_package_dir(
                            recon_pipeline.repository.pointer.python_file, package_dir
                        ),
                        recon_pipeline.repository.pointer.fn_name,
                        working_directory=os.getcwd(),
                    ),
                    container_image=recon_pipeline.repository.container_image,
                    executable_path=recon_pipeline.repository.executable_path,
                    entry_point=recon_pipeline.repository.entry_point,
                    container_context=recon_pipeline.repository.container_context,
                ),
                pipeline_name=recon_pipeline.pipeline_name,
                solids_to_execute=recon_pipeline.solids_to_execute,
            )

    upstream_output_events, run_group = _upstream_events_and_runs(step_context)
    return StepRunRef(
        run_config=step_context.run_config,
        pipeline_run=step_context.pipeline_run,
        run_id=step_context.pipeline_run.run_id,
        step_key=step_context.step.key,
        retry_mode=retry_mode,
        recon_pipeline=recon_pipeline,  # type: ignore
        prior_attempts_count=prior_attempts_count,
        known_state=step_context.execution_plan.known_state,
        run_group=run_group,
        upstream_output_events=upstream_output_events,
    )
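The package_dir behavior described in the docstring can be pictured with a small stand-in helper (module_for_file below is a hypothetical illustration, not dagster's own _module_in_package_dir): the pointer's Python file path is re-expressed as a dotted module path relative to the package root, so the remote environment can import it wherever the package happens to live on its filesystem.

import os

def module_for_file(python_file, package_dir):
    # Illustration only: map a file path to a module name relative to package_dir.
    relative = os.path.relpath(os.path.splitext(python_file)[0], package_dir)
    return relative.replace(os.sep, ".")

# A pipeline defined in /work/my_pkg/defs/repo.py with package_dir="/work" becomes
# importable as the module "my_pkg.defs.repo" in the remote environment.
assert module_for_file("/work/my_pkg/defs/repo.py", "/work") == "my_pkg.defs.repo"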
Example #15
def create_main_recon_repo():
    return ReconstructableRepository.for_file(__file__, main_repo_name())
Example #16
def make_airflow_dag(
    module_name,
    job_name,
    run_config=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    pipeline_name=None,
):
    """Construct an Airflow DAG corresponding to a given Dagster job/pipeline.

    Tasks in the resulting DAG will execute the Dagster logic they encapsulate as a Python
    callable, run by an underlying :py:class:`PythonOperator <airflow:PythonOperator>`. As a
    consequence, dagster itself, any Python dependencies required by your solid logic, and the
    module containing your pipeline definition must all be available in the Python environment
    within which your Airflow tasks execute. If you cannot install requirements into this
    environment, or you are looking for a containerized solution to provide better isolation, see
    instead :py:func:`make_airflow_dag_containerized`.

    This function should be invoked in an Airflow DAG definition file, such as that created by an
    invocation of the dagster-airflow scaffold CLI tool.

    Args:
        module_name (str): The name of the importable module in which the pipeline/job definition can be
            found.
        job_name (str): The name of the job definition.
        run_config (Optional[dict]): The config, if any, with which to compile
            the pipeline/job to an execution plan, as a Python dict.
        mode (Optional[str]): The mode in which to execute the pipeline.
        instance (Optional[DagsterInstance]): The Dagster instance to use to execute the pipeline/job.
        dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to
            :py:class:`DAG <airflow:airflow.models.DAG>`).
        dag_description (Optional[str]): The description to use for the compiled Airflow DAG
            (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)
        dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow
            :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.
        op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow
            operator (a subclass of
            :py:class:`PythonOperator <airflow:airflow.operators.python_operator.PythonOperator>`).
        pipeline_name (str): (legacy) The name of the pipeline definition.

    Returns:
        (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a
        list of its constituent tasks.

    """
    check.str_param(module_name, "module_name")
    job_name = canonicalize_backcompat_args(
        new_val=job_name,
        new_arg="job_name",
        old_val=pipeline_name,
        old_arg="pipeline_name",
        breaking_version="future versions",
        coerce_old_to_new=lambda val: val,
    )

    recon_repo = ReconstructableRepository.for_module(module_name, job_name,
                                                      os.getcwd())
    return _make_airflow_dag(
        recon_repo=recon_repo,
        job_name=job_name,
        run_config=run_config,
        mode=mode,
        instance=instance,
        dag_id=dag_id,
        dag_description=dag_description,
        dag_kwargs=dag_kwargs,
        op_kwargs=op_kwargs,
    )
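A minimal usage sketch, meant to live in an Airflow DAG definition file (the module name, job name, and config values below are assumptions, not taken from the source):

from datetime import datetime

from dagster_airflow import make_airflow_dag

dag, tasks = make_airflow_dag(
    module_name="my_company.jobs.repo",  # assumed importable module holding the job definition
    job_name="my_job",                   # assumed job name
    run_config={},                       # optional run config dict
    dag_kwargs={"default_args": {"owner": "airflow", "start_date": datetime(2021, 1, 1)}},
)
# Keeping `dag` at module level lets the Airflow scheduler discover the generated DAG.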
Example #17
def make_airflow_dag_containerized(
    module_name,
    job_name,
    image,
    run_config=None,
    mode=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    pipeline_name=None,
):
    """Construct a containerized Airflow DAG corresponding to a given Dagster job/pipeline.

    Tasks in the resulting DAG will execute the Dagster logic they encapsulate using a subclass of
    :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`. As a
    consequence, dagster itself, any Python dependencies required by your solid logic, and the
    module containing your pipeline definition must all be available in the container spun up by
    this operator. Typically you'll want to install these requirements onto the image you're using.

    This function should be invoked in an Airflow DAG definition file, such as that created by an
    invocation of the dagster-airflow scaffold CLI tool.

    Args:
        module_name (str): The name of the importable module in which the pipeline/job definition can be
            found.
        job_name (str): The name of the job definition.
        image (str): The name of the Docker image to use for execution (passed through to
            :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).
        run_config (Optional[dict]): The config, if any, with which to compile
            the pipeline/job to an execution plan, as a Python dict.
        mode (Optional[str]): The mode in which to execute the pipeline.
        dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to
            :py:class:`DAG <airflow:airflow.models.DAG>`).
        dag_description (Optional[str]): The description to use for the compiled Airflow DAG
            (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)
        dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow
            :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.
        op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow
            operator (a subclass of
            :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).
        pipeline_name (str): (legacy) The name of the pipeline definition.

    Returns:
        (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a
        list of its constituent tasks.
    """
    check.str_param(module_name, "module_name")
    check.str_param(job_name, "job_name")
    check.str_param(image, "image")
    check.opt_dict_param(run_config, "run_config")
    check.opt_str_param(mode, "mode")
    check.opt_str_param(dag_id, "dag_id")
    check.opt_str_param(dag_description, "dag_description")
    check.opt_dict_param(dag_kwargs, "dag_kwargs")
    check.opt_dict_param(op_kwargs, "op_kwargs")

    job_name = canonicalize_backcompat_args(
        new_val=job_name,
        new_arg="job_name",
        old_val=pipeline_name,
        old_arg="pipeline_name",
        breaking_version="future versions",
        coerce_old_to_new=lambda val: val,
    )
    recon_repo = ReconstructableRepository.for_module(module_name, job_name,
                                                      os.getcwd())

    op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)
    op_kwargs["image"] = image

    return _make_airflow_dag(
        recon_repo=recon_repo,
        job_name=job_name,
        run_config=run_config,
        mode=mode,
        dag_id=dag_id,
        dag_description=dag_description,
        dag_kwargs=dag_kwargs,
        op_kwargs=op_kwargs,
        operator=DagsterDockerOperator,
    )
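A corresponding sketch for the containerized variant (module, job, and image names are placeholders): here the job's dependencies must be installed in the image rather than in the Airflow workers' Python environment.

from dagster_airflow import make_airflow_dag_containerized

dag, tasks = make_airflow_dag_containerized(
    module_name="my_company.jobs.repo",      # assumed module with the job definition
    job_name="my_job",                       # assumed job name
    image="my-registry/dagster-job:latest",  # image with dagster and the job's dependencies installed
    op_kwargs={"network_mode": "bridge"},    # extra kwargs forwarded to the underlying DockerOperator
)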
Example #18
@pipeline
def nonce_pipeline():
    return nonce_solid()


@repository
def my_repository():
    return [nonce_pipeline]


nonce_pipeline_snapshot = nonce_pipeline.get_pipeline_snapshot()

nonce_execution_plan_snapshot = snapshot_from_execution_plan(
    create_execution_plan(nonce_pipeline), nonce_pipeline.get_pipeline_snapshot_id()
)

recon_repo_for_tests = ReconstructableRepository.for_file(
    file_relative_path(__file__, "test_dagster_docker_operator.py"),
    "my_repository",
)


def test_init_modified_docker_operator(dagster_docker_image):
    with instance_for_test() as instance:
        dagster_operator_parameters = DagsterOperatorParameters(
            task_id="nonce",
            pipeline_name="nonce_pipeline",
            mode="default",
            op_kwargs={
                "image": dagster_docker_image,
                "api_version": "auto",
            },
            pipeline_snapshot=nonce_pipeline_snapshot,
            execution_plan_snapshot=nonce_execution_plan_snapshot,