Example #1
@pipeline
def dynamic_pipeline():
    @solid
    def multiply_by_two(context, y):
        context.log.info("multiply_by_two is returning " + str(y * 2))
        return y * 2

    @solid
    def multiply_inputs(context, y, ten, should_fail):
        current_run = context.instance.get_run_by_id(context.run_id)
        if should_fail:
            if y == 2 and current_run.parent_run_id is None:
                raise Exception()
        context.log.info("multiply_inputs is returning " + str(y * ten))
        return y * ten

    @solid
    def emit_ten(_):
        return 10

    @solid(output_defs=[DynamicOutputDefinition()])
    def emit(_):
        for i in range(3):
            yield DynamicOutput(value=i, mapping_key=str(i))

    @solid
    def sum_numbers(_, nums):
        return sum(nums)

    # pylint: disable=no-member
    multiply_by_two.alias("double_total")(
        sum_numbers(
            emit()
            .map(lambda n: multiply_by_two(multiply_inputs(n, emit_ten())))
            .collect()
        )
    )
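
A usage sketch, not from the original test: `should_fail` has no upstream solid, so it must arrive as a root input through run config (the `{"value": ...}` shape below is an assumption about the 0.x input loader). And since `multiply_inputs` only raises while `parent_run_id` is `None`, re-executing the failed run completes:

from dagster import DagsterInstance, execute_pipeline, reexecute_pipeline

# hypothetical config; the exact root-input shape depends on the dagster version
run_config = {
    "solids": {
        "multiply_inputs": {"inputs": {"should_fail": {"value": True}}}
    }
}
instance = DagsterInstance.ephemeral()
failed = execute_pipeline(
    dynamic_pipeline, run_config=run_config, instance=instance, raise_on_error=False
)
assert not failed.success

# the re-execution has a parent_run_id, so multiply_inputs no longer raises
retried = reexecute_pipeline(
    dynamic_pipeline,
    parent_run_id=failed.run_id,
    run_config=run_config,
    instance=instance,
)
assert retried.success
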
Example #2
def test_fails_with_wrong_output():
    @solid(output_defs=[DynamicOutputDefinition()])
    def should_fail(_):
        yield Output(1)

    with pytest.raises(DagsterInvariantViolationError,
                       match="must yield DynamicOutput"):
        execute_solid(should_fail)

    @solid(output_defs=[DynamicOutputDefinition()])
    def should_also_fail(_):
        return 1

    with pytest.raises(DagsterInvariantViolationError,
                       match="must yield DynamicOutput"):
        execute_solid(should_also_fail)
Example #3
def test_dynamic(gcs_bucket):
    @solid(output_defs=[DynamicOutputDefinition()])
    def numbers(_):
        for i in range(3):
            yield DynamicOutput(i, mapping_key=str(i))

    @solid
    def echo(_, x):
        return x

    @pipeline(mode_defs=[
        ModeDefinition(resource_defs={
            "io_manager": gcs_pickle_io_manager,
            "gcs": gcs_resource
        })
    ])
    def dynamic():
        numbers().map(echo)

    result = execute_pipeline(dynamic,
                              run_config={
                                  "resources": {
                                      "io_manager": {
                                          "config": {
                                              "gcs_bucket": gcs_bucket
                                          }
                                      }
                                  }
                              })
    assert result.success
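
The same fan-out runs locally with the default in-process IO manager when no GCS bucket is at hand; a minimal sketch (solid names here are illustrative, not from the page):

from dagster import execute_pipeline, pipeline, solid
from dagster.experimental import DynamicOutput, DynamicOutputDefinition


@solid(output_defs=[DynamicOutputDefinition()])
def numbers_local(_):
    for i in range(3):
        yield DynamicOutput(i, mapping_key=str(i))


@solid
def echo_local(_, x):
    return x


@pipeline
def dynamic_local():
    numbers_local().map(echo_local)


assert execute_pipeline(dynamic_local).success
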
Example #4
def test_solid_outputs_access():
    called = {}

    @success_hook
    def my_success_hook(context):
        called[context.step_key] = context.solid_output_values

    @failure_hook
    def my_failure_hook(context):
        called[context.step_key] = context.solid_output_values

    @solid(output_defs=[
        OutputDefinition(name="one"),
        OutputDefinition(name="two"),
        OutputDefinition(name="three"),
    ])
    def a_solid(_):
        yield Output(1, "one")
        yield Output(2, "two")
        yield Output(3, "three")

    @solid(output_defs=[
        OutputDefinition(name="one"),
        OutputDefinition(name="two"),
    ])
    def failed_solid(_):
        yield Output(1, "one")
        raise SomeUserException()
        yield Output(3, "two")  # pylint: disable=unreachable

    @solid(output_defs=[DynamicOutputDefinition()])
    def dynamic_solid(_):
        yield DynamicOutput(1, mapping_key="mapping_1")
        yield DynamicOutput(2, mapping_key="mapping_2")

    @solid
    def echo(_, x):
        return x

    @my_success_hook
    @my_failure_hook
    @pipeline
    def a_pipeline():
        a_solid()
        failed_solid()
        dynamic_solid().map(echo)

    result = execute_pipeline(a_pipeline, raise_on_error=False)
    assert not result.success
    assert called.get("a_solid") == {"one": 1, "two": 2, "three": 3}
    assert called.get("failed_solid") == {"one": 1}
    assert called.get("dynamic_solid") == {
        "result": {
            "mapping_1": 1,
            "mapping_2": 2
        }
    }
    assert called.get("echo[mapping_1]") == {"result": 1}
    assert called.get("echo[mapping_2]") == {"result": 2}
Example #5
def test_fails_dupe_keys():
    @solid(output_defs=[DynamicOutputDefinition()])
    def should_fail(_):
        yield DynamicOutput(True, mapping_key="dunk")
        yield DynamicOutput(True, mapping_key="dunk")

    with pytest.raises(DagsterInvariantViolationError,
                       match='mapping_key "dunk" multiple times'):
        execute_solid(should_fail)
Example #6
def test_dynamic_output_solid():
    @solid(output_defs=[DynamicOutputDefinition()])
    def should_work(_):
        yield DynamicOutput(1, mapping_key="1")
        yield DynamicOutput(2, mapping_key="2")

    result = execute_in_process(should_work)
    assert result.success
    assert result.output_values["result"]["1"] == 1
    assert result.output_values["result"]["2"] == 2
Example #7
def test_dynamic_output_definition_single_partition_materialization():

    entry1 = EventMetadataEntry.int(123, "nrows")
    entry2 = EventMetadataEntry.float(3.21, "some value")

    @solid(output_defs=[
        OutputDefinition(name="output1", asset_key=AssetKey("table1"))
    ])
    def solid1(_):
        return Output(None, "output1", metadata_entries=[entry1])

    @solid(output_defs=[
        DynamicOutputDefinition(
            name="output2",
            asset_key=lambda context: AssetKey(context.mapping_key))
    ])
    def solid2(_, _input1):
        for i in range(4):
            yield DynamicOutput(
                7,
                mapping_key=str(i),
                output_name="output2",
                metadata_entries=[entry2],
            )

    @solid
    def do_nothing(_, _input1):
        pass

    @pipeline
    def my_pipeline():
        solid2(solid1()).map(do_nothing)

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 5

    check_materialization(materializations[0],
                          AssetKey(["table1"]),
                          metadata_entries=[entry1])
    seen_paths = set()
    for i in range(1, 5):
        path = materializations[i].asset_key.path
        seen_paths.add(tuple(path))
        check_materialization(
            materializations[i],
            AssetKey(path),
            metadata_entries=[entry2],
            parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
        )
    assert len(seen_paths) == 4
Example #8
def test_dynamic():
    @solid(output_defs=[DynamicOutputDefinition(dagster_type=int)])
    def dyn_desc(_) -> Iterator[DynamicOutput]:
        """
        Returns:
            numbers
        """
        yield DynamicOutput(4, "4")

    assert dyn_desc.output_defs[0].description == "numbers"
    assert dyn_desc.output_defs[0].is_dynamic
Example #9
def test_basic():
    @solid(output_defs=[DynamicOutputDefinition()])
    def should_work(_):
        yield DynamicOutput(1, mapping_key="1")
        yield DynamicOutput(2, mapping_key="2")

    result = execute_solid(should_work)

    assert result.success
    assert len(result.get_output_events_for_compute()) == 2
    assert len(result.compute_output_events_dict["result"]) == 2
    assert result.output_values == {"result": {"1": 1, "2": 2}}
    assert result.output_value() == {"1": 1, "2": 2}
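
Mapped results can also be fanned back in with `.collect()`, as Example #1 does; a self-contained sketch (illustrative names):

from dagster import execute_pipeline, pipeline, solid
from dagster.experimental import DynamicOutput, DynamicOutputDefinition


@solid(output_defs=[DynamicOutputDefinition(int)])
def fan_out(_):
    for i in range(3):
        yield DynamicOutput(i, mapping_key=str(i))


@solid
def double(_, x: int) -> int:
    return x * 2


@solid
def total(_, nums) -> int:
    # receives the list of all mapped outputs
    return sum(nums)


@pipeline
def fan_in_pipeline():
    total(fan_out().map(double).collect())


assert execute_pipeline(fan_in_pipeline).success
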
Example #10
def test_must_unpack_composite():
    with pytest.raises(
            DagsterInvalidDefinitionError,
            match="Dynamic output must be unpacked by invoking map",
    ):

        @composite_solid(output_defs=[DynamicOutputDefinition()])
        def composed():
            return dynamic_numbers()

        @pipeline
        def _should_fail():
            echo(composed())
Example #11
def test_multi_output():
    @solid(output_defs=[
        DynamicOutputDefinition(int, "numbers"),
        DynamicOutputDefinition(str, "letters"),
        OutputDefinition(str, "wildcard"),
    ])
    def should_work(_):
        yield DynamicOutput(1, output_name="numbers", mapping_key="1")
        yield DynamicOutput(2, output_name="numbers", mapping_key="2")
        yield DynamicOutput("a", output_name="letters", mapping_key="a")
        yield DynamicOutput("b", output_name="letters", mapping_key="b")
        yield DynamicOutput("c", output_name="letters", mapping_key="c")
        yield Output("*", "wildcard")

    result = execute_solid(should_work)

    assert result.success
    assert len(result.get_output_events_for_compute("numbers")) == 2
    assert len(result.get_output_events_for_compute("letters")) == 3
    assert result.get_output_event_for_compute("wildcard")
    assert len(result.compute_output_events_dict["numbers"]) == 2
    assert len(result.compute_output_events_dict["letters"]) == 3
    assert len(result.compute_output_events_dict["wildcard"]) == 1
    assert result.output_values == {
        "numbers": {
            "1": 1,
            "2": 2
        },
        "letters": {
            "a": "a",
            "b": "b",
            "c": "c"
        },
        "wildcard": "*",
    }
    assert result.output_value("numbers") == {"1": 1, "2": 2}
    assert result.output_value("letters") == {"a": "a", "b": "b", "c": "c"}
    assert result.output_value("wildcard") == "*"
Example #12
def test_multi_composite_out():
    with pytest.raises(
            DagsterInvalidDefinitionError,
            match="cannot be downstream of more than one dynamic output",
    ):

        @composite_solid(output_defs=[DynamicOutputDefinition()])
        def composed_echo():
            return dynamic_solid().map(echo)

        @pipeline
        def _should_fail():
            def _complex(item):
                composed_echo().map(lambda y: add(y, item))

            dynamic_solid().map(_complex)
Example #13
def test_composite_multi_out():
    @composite_solid(
        output_defs=[OutputDefinition(Any, "one"), DynamicOutputDefinition(Any, "numbers")]
    )
    def multi_out():
        one = emit_one()
        numbers = dynamic_numbers()
        return {"one": one, "numbers": numbers}

    @pipeline
    def composite_multi():
        one, numbers = multi_out()
        echo(one)
        numbers.map(echo)

    result = execute_pipeline(composite_multi)
    assert result.success
Example #14
def test_temp_fail_on_dep():
    # to be removed in upcoming diff

    @solid(output_defs=[DynamicOutputDefinition()])
    def should_work(_):
        yield DynamicOutput(1, mapping_key="1")
        yield DynamicOutput(2, mapping_key="2")

    @solid
    def echo(_, x):
        return x

    with pytest.raises(DagsterInvalidDefinitionError,
                       match="not yet supported"):

        @pipeline
        def _uh_oh():
            echo(should_work())
Example #15
def test_direct_dep():
    @solid(output_defs=[DynamicOutputDefinition()])
    def dynamic_add(_, x):
        yield DynamicOutput(x + 1, mapping_key="1")
        yield DynamicOutput(x + 2, mapping_key="2")

    @pipeline
    def _is_fine():
        def _add(item):
            dynamic_add(item)

        dynamic_solid().map(_add)

    with pytest.raises(
            DagsterInvalidDefinitionError,
            match="cannot be downstream of more than one dynamic output",
    ):

        @pipeline
        def _should_fail():
            def _add_echo(item):
                dynamic_add(item).map(echo)

            dynamic_solid().map(_add_echo)

    @pipeline
    def _is_fine():
        dynamic_solid().map(dynamic_add)

    with pytest.raises(
            DagsterInvalidDefinitionError,
            match="cannot be downstream of more than one dynamic output",
    ):

        @pipeline
        def _should_fail():
            echo(dynamic_solid().map(dynamic_add).collect())
Example #16
def fn_save_treated_local(context, df, file_path, mode="staging"):

    _file_path = file_path.format(mode=mode, filetype="csv")
    _file_path = Path(_file_path)
    _file_path.parent.mkdir(parents=True, exist_ok=True)
    _file_path = str(_file_path)
    context.log.info(f"Saving df to {_file_path}")
    df.to_csv(_file_path, index=False)
    return _file_path


@solid(
    output_defs=[DynamicOutputDefinition(dict)],
    retry_policy=RetryPolicy(max_retries=3, delay=30),
)
def get_runs(context, execution_date):
    execution_date = datetime.strptime(execution_date, "%Y-%m-%d")
    now = execution_date + timedelta(hours=11, minutes=30)
    this_time_yesterday = now - timedelta(days=1)
    min_timestamp = convert_datetime_to_unix_time(this_time_yesterday)
    max_timestamp = convert_datetime_to_unix_time(now)
    context.log.info(f"{execution_date} of type {type(execution_date)}")
    ftp_client = connect_ftp(os.getenv("FTPS_HOST"),
                             os.getenv("FTPS_USERNAME"), os.getenv("FTPS_PWD"))

    # Change to working directory
    ftp_client.cwd("/")
    for folder in ftp_client.mlsd():
Example #17
                    context.log.info("Success!")
                else:
                    context.log.info("View not found, skipping...")
            rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                   materialized_views)
    except Exception as e:
        try:
            materialization_lock.release()
        except:
            pass
        raise e


@solid(
    retry_policy=RetryPolicy(max_retries=3, delay=5),
    output_defs=[DynamicOutputDefinition(dict)],
)
def update_managed_views(
    context,
    blob_names,
    materialization_locked: bool,
    materialization_lock: Redlock,
):
    try:
        # Setup Redis and Redlock
        r = Redis(constants.REDIS_HOST.value)
        rp = RedisPal(constants.REDIS_HOST.value)
        views_lock = Redlock(
            key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
            masters=[r],
            auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
Example #18
@solid
def save_blob_to_tempfile(context, blob_path: str, bucket_name: str) -> str:
    tempfile_name: str = f"/tmp/{uuid4()}.zip"
    context.log.debug(
        f"Saving {blob_path} to temporary file with name {tempfile_name}")
    blob: Blob = get_blob(blob_path, bucket_name, mode="staging")
    with open(tempfile_name, "wb") as tempfile:
        tempfile.write(blob.download_as_bytes())
        tempfile.close()
    return tempfile_name


@solid(
    output_defs=[
        DynamicOutputDefinition(name="filename"),
    ],
)
def get_gtfs_files(context, original_filepath):
    feed_files = gk.list_feed(original_filepath)['file_name']
    for item in feed_files:
        filename = Path(item).stem
        yield DynamicOutput(filename,
                            mapping_key=filename,
                            output_name='filename')


@solid
def create_gtfs_version_partition(context, feed, original_filepath,
                                  bucket_name):
    # If feed_info.txt is available, use GTFS version as partition
    if feed.feed_info is not None:
Example #19
from typing import Iterator

from dagster import Any, Field, String, solid
from dagster.core.execution.context.compute import AbstractComputeExecutionContext
from dagster.experimental import DynamicOutput, DynamicOutputDefinition

from hca_orchestration.support.typing import HcaScratchDatasetName, MetadataTypeFanoutResult
from hca_manage.common import JobId


@solid(
    config_schema={
        "metadata_types": Field(Any, is_required=True),
        "prefix": Field(str, is_required=True),
    },
    output_defs=[
        DynamicOutputDefinition(
            name="table_fanout_result", dagster_type=MetadataTypeFanoutResult
        )
    ],
)
def ingest_metadata_type(
    context: AbstractComputeExecutionContext,
    result: list[JobId],
    scratch_dataset_name: HcaScratchDatasetName,
) -> Iterator[MetadataTypeFanoutResult]:
    """
    For each metadata type, return a dynamic output over which we can later map.
    This saves us from hardcoding solids for each type.
    """
    for metadata_type in context.solid_config["metadata_types"]:
        yield DynamicOutput(
            value=MetadataTypeFanoutResult(
                scratch_dataset_name,
                metadata_type.value,
                context.solid_config["prefix"],
            ),
            mapping_key=metadata_type.value,
            output_name="table_fanout_result",
        )
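
Downstream solids would typically map over this fanout; a hedged sketch reusing the imports above (`process_metadata_type` is hypothetical, not part of hca_orchestration):

@solid
def process_metadata_type(
    context: AbstractComputeExecutionContext,
    fanout_result: MetadataTypeFanoutResult,
) -> None:
    # placeholder body; a real solid would ingest the given metadata type
    context.log.info(f"processing {fanout_result}")


# wiring inside a pipeline/graph body (sketch):
# ingest_metadata_type(result, scratch_dataset_name).map(process_metadata_type)
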
Example #20
def test_tags_to_dynamic_plan():
    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {
                            "cpu": "500m",
                            "memory": "128Mi"
                        },
                        "limits": {
                            "cpu": "1000m",
                            "memory": "1Gi"
                        },
                    }
                }
            }
        })
    def multiply_inputs(_, x):
        return 2 * x

    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {
                            "cpu": "250m",
                            "memory": "64Mi"
                        },
                        "limits": {
                            "cpu": "500m",
                            "memory": "2560Mi"
                        },
                    }
                }
            }
        },
        output_defs=[DynamicOutputDefinition()],
    )
    def emit(_):
        for i in range(3):
            yield DynamicOutput(value=i, mapping_key=str(i))

    @pipeline
    def k8s_ready():
        return emit().map(multiply_inputs)

    known_state = KnownExecutionState(
        {},
        {
            emit.name: {
                "result": ["0", "1", "2"]
            },
        },
    )
    plan = create_execution_plan(k8s_ready, known_state=known_state)

    emit_step = plan.get_step_by_key(emit.name)
    user_defined_k8s_config = get_user_defined_k8s_config(emit_step.tags)

    assert user_defined_k8s_config.container_config
    assert user_defined_k8s_config.container_config["resources"]

    resources = user_defined_k8s_config.container_config["resources"]

    assert resources["requests"]["cpu"] == "250m"
    assert resources["requests"]["memory"] == "64Mi"
    assert resources["limits"]["cpu"] == "500m"
    assert resources["limits"]["memory"] == "2560Mi"

    for mapping_key in range(3):
        multiply_inputs_step = plan.get_step_by_key(
            f"{multiply_inputs.name}[{mapping_key}]")
        dynamic_step_user_defined_k8s_config = get_user_defined_k8s_config(
            multiply_inputs_step.tags)

        assert dynamic_step_user_defined_k8s_config.container_config
        assert dynamic_step_user_defined_k8s_config.container_config[
            "resources"]

        resources = dynamic_step_user_defined_k8s_config.container_config[
            "resources"]

        assert resources["requests"]["cpu"] == "500m"
        assert resources["requests"]["memory"] == "128Mi"
        assert resources["limits"]["cpu"] == "1000m"
        assert resources["limits"]["memory"] == "1Gi"
Example #21
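This snippet relies on `multiply_by_two`, defined elsewhere in the original file; the definition from Example #1 is repeated here so the snippet is self-contained:

@solid
def multiply_by_two(context, y):
    context.log.info("multiply_by_two is returning " + str(y * 2))
    return y * 2
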
@solid
def multiply_inputs(context, y, ten):
    # current_run = context.instance.get_run_by_id(context.run_id)
    # if y == 2 and current_run.parent_run_id is None:
    #     raise Exception()
    context.log.info("multiply_inputs is returning " + str(y * ten))
    return y * ten


@solid
def emit_ten(_):
    return 10


@solid(output_defs=[DynamicOutputDefinition()])
def emit(_):
    for i in range(3):
        yield DynamicOutput(value=i, mapping_key=str(i))


@pipeline
def dynamic_pipeline():
    # pylint: disable=no-member
    emit().map(lambda n: multiply_by_two(multiply_inputs(n, emit_ten())))


def test_map():
    result = execute_pipeline(dynamic_pipeline)
    assert result.success
# start_marker
import os
from typing import List

from dagster import Field, pipeline, solid
from dagster.experimental import DynamicOutput, DynamicOutputDefinition
from dagster.utils import file_relative_path


@solid(
    config_schema={
        "path": Field(str,
                      default_value=file_relative_path(__file__, "sample"))
    },
    output_defs=[DynamicOutputDefinition(str)],
)
def files_in_directory(context):
    path = context.solid_config["path"]
    dirname, _, filenames = next(os.walk(path))
    for file in filenames:
        yield DynamicOutput(
            value=os.path.join(dirname, file),
            # create a mapping key from the file name
            mapping_key=file.replace(".", "_").replace("-", "_"),
        )


@solid
def process_file(path: str) -> int:
    # simple example of calculating size
    return os.path.getsize(path)
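

The snippet cuts off here; given the unused `List` import above, the file plausibly continues with a fan-in step along these lines (`summarize_directory` and `process_directory` are assumed names, not from the page):

@solid
def summarize_directory(_, sizes: List[int]) -> int:
    return sum(sizes)


@pipeline
def process_directory():
    file_results = files_in_directory().map(process_file)
    summarize_directory(file_results.collect())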