Example #1
def test_materialized_assets():
    instance = DagsterInstance.ephemeral()
    res = execute_pipeline(materialization_pipeline, instance=instance)
    assert res.success
    asset_keys = instance.all_asset_keys()
    assert len(asset_keys) == 1
    assert asset_keys[0] == AssetKey(["dashboards", "analytics_dashboard"])
Example #2
def save(self, key, df):
    path = os.path.join(self.root_dir, key)
    df.to_parquet(path)
    return AssetMaterialization(
        asset_key=AssetKey(["local_metastore", key]),
        metadata_entries=[EventMetadataEntry.path(path, "on_disk")],
    )
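A minimal usage sketch for the save method above, assuming it lives on a resource bound to a hypothetical "metastore" key (the solid and resource names here are illustrative, not from the original source):

from dagster import Output, solid

@solid(required_resource_keys={"metastore"})
def persist_table(context, df):
    # save() returns an AssetMaterialization; yielding it records the asset
    # in the event log before the solid's actual output
    yield context.resources.metastore.save("events.parquet", df)
    yield Output(df)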
Example #3
def test_asset_materialization(conn_string):
    event_log_storage = PostgresEventLogStorage.create_clean_storage(
        conn_string)

    asset_key = AssetKey(["path", "to", "asset_one"])

    @solid
    def materialize_one(_):
        yield AssetMaterialization(
            asset_key=asset_key,
            metadata_entries=[
                EventMetadataEntry.text("hello", "text"),
                EventMetadataEntry.json({"hello": "world"}, "json"),
                EventMetadataEntry.float(1.0, "one"),
            ],
        )
        yield Output(1)

    def _solids():
        materialize_one()

    events_one, _ = synthesize_events(_solids)
    for event in events_one:
        event_log_storage.store_event(event)

    assert asset_key in set(event_log_storage.get_all_asset_keys())
    events = event_log_storage.get_asset_events(asset_key)
    assert len(events) == 1
    event = events[0]
    assert isinstance(event, DagsterEventRecord)
    assert event.dagster_event.event_type_value == DagsterEventType.STEP_MATERIALIZATION.value
Example #4
def test_multi_asset_with_compute_kind():
    @multi_asset(outs={"o1": Out(asset_key=AssetKey("o1"))},
                 compute_kind="sql")
    def my_asset(arg1):
        return arg1

    assert my_asset.op.tags == {"kind": "sql"}
Example #5
def migrate_asset_key_data(event_log_storage, print_fn=lambda _: None):
    """
    Utility method to build an asset key index from the data in existing event log records.
    Takes an event log storage and an optional print_fn for reporting progress.
    """
    from dagster.core.storage.event_log.sql_event_log import AssetAwareSqlEventLogStorage
    from .schema import AssetKeyTable, SqlEventLogStorageTable

    if not isinstance(event_log_storage, AssetAwareSqlEventLogStorage):
        return

    query = (db.select([
        SqlEventLogStorageTable.c.asset_key
    ]).where(SqlEventLogStorageTable.c.asset_key != None).group_by(
        SqlEventLogStorageTable.c.asset_key))
    with event_log_storage.connect() as conn:
        print_fn("Querying event logs.")
        to_insert = conn.execute(query).fetchall()
        print_fn("Found {} records to index".format(len(to_insert)))
        for (asset_key, ) in tqdm(to_insert):
            try:
                conn.execute(AssetKeyTable.insert().values(  # pylint: disable=no-value-for-parameter
                    asset_key=AssetKey.from_db_string(asset_key).to_string()))
            except db.exc.IntegrityError:
                # asset key already present
                pass
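A hypothetical invocation of the migration above, assuming a DagsterInstance whose event log storage is SQL-backed (for any other storage the function returns without doing anything):

from dagster import DagsterInstance

instance = DagsterInstance.get()
# build the asset key index, printing progress lines to stdout
migrate_asset_key_data(instance.event_log_storage, print_fn=print)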
Example #6
def test_backcompat_get_asset_records():
    src_dir = file_relative_path(
        __file__, "compat_tests/snapshot_0_11_0_asset_materialization")

    # should contain materialization events for asset keys a, b, c, d, e, f
    # events a and b have been wiped, but b has been rematerialized

    def _validate_materialization(asset_key, event, expected_tags):
        assert isinstance(event, EventLogEntry)
        assert event.dagster_event
        assert event.dagster_event.is_step_materialization
        assert event.dagster_event.step_materialization_data.materialization.asset_key == asset_key
        assert event.dagster_event.step_materialization_data.materialization.tags == expected_tags

    b = AssetKey("b")

    with copy_directory(src_dir) as test_dir:
        with DagsterInstance.from_ref(
                InstanceRef.from_dir(test_dir)) as instance:
            storage = instance.event_log_storage

            records = storage.get_asset_records([b])
            asset_entry = records[0].asset_entry
            assert asset_entry.asset_key == b
            _validate_materialization(b,
                                      asset_entry.last_materialization,
                                      expected_tags={})
Example #7
    def build_for_materialization(materialization):
        class DummyIOManager(IOManager):
            def __init__(self):
                self.values = {}

            def handle_output(self, context, obj):
                keys = tuple(context.get_output_identifier())
                self.values[keys] = obj

                context.add_output_metadata({"foo": "bar"})
                yield MetadataEntry("baz", value="baz")
                context.add_output_metadata({"bar": "bar"})
                yield materialization

            def load_input(self, context):
                keys = tuple(context.upstream_output.get_output_identifier())
                return self.values[keys]

        @op(out=Out(asset_key=AssetKey("key_on_out")))
        def the_op():
            return 5

        @graph
        def the_graph():
            the_op()

        return the_graph.execute_in_process(
            resources={"io_manager": DummyIOManager()})
Example #8
def test_source_asset():
    @asset
    def asset1(source1):
        assert source1 == 5
        return 1

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            assert context.resource_config["a"] == 7
            assert context.resources.subresource == 9
            assert context.upstream_output.resources.subresource == 9
            return 5

    @io_manager(config_schema={"a": int},
                required_resource_keys={"subresource"})
    def my_io_manager(_):
        return MyIOManager()

    job = build_assets_job(
        "a",
        [asset1],
        source_assets=[
            SourceAsset(AssetKey("source1"),
                        io_manager_key="special_io_manager")
        ],
        resource_defs={
            "special_io_manager": my_io_manager.configured({"a": 7}),
            "subresource": ResourceDefinition.hardcoded_resource(9),
        },
    )
    assert job.graph.node_defs == [asset1.op]
    assert job.execute_in_process().success
Example #9
def test_asset_group_source_asset():
    foo_fa = SourceAsset(key=AssetKey("foo"), io_manager_key="the_manager")

    @asset
    def asset_depends_on_source(foo):
        return foo

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            return 5

    @io_manager
    def the_manager():
        return MyIOManager()

    group = AssetGroup(
        assets=[asset_depends_on_source],
        source_assets=[foo_fa],
        resource_defs={"the_manager": the_manager},
    )

    @repository
    def the_repo():
        return [group]

    asset_group_underlying_job = the_repo.get_all_jobs()[0]
    assert asset_group_underlying_job.name == group.all_assets_job_name

    result = asset_group_underlying_job.execute_in_process()
    assert result.success
Example #10
def test_asset_group_from_list():
    @asset
    def asset_foo():
        return "foo"

    @asset
    def asset_bar():
        return "bar"

    @asset(ins={"asset_bar": AssetIn(asset_key=AssetKey("asset_foo"))})
    def last_asset(asset_bar):
        return asset_bar

    group = AssetGroup(assets=[asset_foo, asset_bar, last_asset])

    @repository
    def the_repo():
        return [group]

    assert len(the_repo.get_all_jobs()) == 1
    asset_group_underlying_job = the_repo.get_all_jobs()[0]
    assert asset_group_underlying_job.name == group.all_assets_job_name

    result = asset_group_underlying_job.execute_in_process()
    assert result.success
Example #11
    def toy_asset_sensor(context):
        events = context.instance.events_for_asset_key(
            AssetKey(["model"]),
            after_cursor=context.cursor,
            ascending=False,
            limit=1)

        if not events:
            return

        record_id, event = events[0]  # take the most recent materialization
        from_pipeline = event.pipeline_name

        yield RunRequest(
            run_key=str(record_id),
            run_config={
                "solids": {
                    "read_materialization": {
                        "config": {
                            "asset_key": ["model"],
                            "pipeline": from_pipeline
                        }
                    }
                }
            },
        )

        context.update_cursor(str(record_id))
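The body above presumably sits under Dagster's @sensor decorator; a minimal sketch of the same cursor pattern, with a hypothetical target pipeline name:

from dagster import AssetKey, RunRequest, sensor

@sensor(pipeline_name="read_materialization_pipeline")  # hypothetical name
def model_sensor(context):
    events = context.instance.events_for_asset_key(
        AssetKey(["model"]), after_cursor=context.cursor, ascending=False, limit=1)
    if not events:
        return
    record_id, _event = events[0]
    # the run_key dedupes repeated requests; the cursor keeps the query incremental
    yield RunRequest(run_key=str(record_id))
    context.update_cursor(str(record_id))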
Example #12
    def get_asset_materializations(
            self, dbt_output: DbtOutput) -> List[AssetMaterialization]:
        ret = []

        # dbt_output.result contains the parsed contents of the results.json file
        # Note that the JSON schema can change from version to version. This is written for
        # https://schemas.getdbt.com/dbt/run-results/v2.json (it also works with v1.json)
        for result in dbt_output.result["results"]:
            if result["status"] != "success":
                continue
            unique_id = result["unique_id"]

            # Here, we choose a naming scheme for our asset keys that will look something like
            # <asset prefix> / model / <dbt project> / <model name>, but this is pretty arbitrary
            asset_key = AssetKey(self._asset_key_prefix + unique_id.split("."))

            # create an AssetMaterialization with our key and metadata
            ret.append(
                AssetMaterialization(
                    description=f"dbt node: {unique_id}",
                    metadata_entries=self._get_metadata(result),
                    asset_key=asset_key,
                ))

        return ret
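To make the naming scheme in the comments concrete: for a hypothetical dbt node "model.jaffle_shop.customers" and an asset key prefix of ["dbt"], the code above produces:

from dagster import AssetKey

key = AssetKey(["dbt"] + "model.jaffle_shop.customers".split("."))
assert key == AssetKey(["dbt", "model", "jaffle_shop", "customers"])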
Example #13
    def handle_output(self, context, obj):
        file_path = os.path.join("my_base_dir", context.step_key, context.name)

        obj.to_csv(file_path)

        yield AssetMaterialization(asset_key=AssetKey(file_path),
                                   description="Persisted result to storage.")
Example #14
    def test_asset_materialization(self, storage):
        asset_key = AssetKey(["path", "to", "asset_one"])

        @solid
        def materialize_one(_):
            yield AssetMaterialization(
                asset_key=asset_key,
                metadata={
                    "text": "hello",
                    "json": {
                        "hello": "world"
                    },
                    "one_float": 1.0,
                    "one_int": 1,
                },
            )
            yield Output(1)

        def _solids():
            materialize_one()

        events_one, _ = _synthesize_events(_solids)
        for event in events_one:
            storage.store_event(event)

        assert asset_key in set(storage.all_asset_keys())
        events = storage.get_asset_events(asset_key)
        assert len(events) == 1
        event = events[0]
        assert isinstance(event, EventRecord)
        assert event.dagster_event.event_type_value == DagsterEventType.ASSET_MATERIALIZATION.value
Example #15
def test_multiple_definition_fails():
    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            # store asset
            return

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
            return AssetKey([context.step_key, context.name])

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    @solid(output_defs=[
        OutputDefinition(asset_key=AssetKey("x"),
                         io_manager_key="asset_io_manager"),
    ])
    def fail_solid(_):
        return 1

    @pipeline(mode_defs=[
        ModeDefinition(resource_defs={"asset_io_manager": my_io_manager})
    ])
    def my_pipeline():
        fail_solid()

    with pytest.raises(DagsterInvariantViolationError):
        execute_pipeline(my_pipeline)
Example #16
def test_asset_events(asset_aware_context):
    with asset_aware_context() as ctx:
        instance, event_log_storage = ctx
        execute_pipeline(pipeline_one, instance=instance)
        execute_pipeline(pipeline_two, instance=instance)
        asset_events = event_log_storage.get_asset_events(AssetKey("asset_1"))
        assert len(asset_events) == 2
        for event in asset_events:
            assert isinstance(event, EventRecord)
            assert event.is_dagster_event
            assert event.dagster_event.event_type == DagsterEventType.ASSET_MATERIALIZATION
            assert event.dagster_event.asset_key

        asset_events = event_log_storage.get_asset_events(
            AssetKey(["path", "to", "asset_3"]))
        assert len(asset_events) == 1
Example #17
def test_asset_run_ids(asset_aware_context):
    with asset_aware_context() as ctx:
        instance, event_log_storage = ctx
        one = execute_pipeline(pipeline_one, instance=instance)
        two = execute_pipeline(pipeline_two, instance=instance)
        run_ids = event_log_storage.get_asset_run_ids(AssetKey("asset_1"))
        assert set(run_ids) == set([one.run_id, two.run_id])
Example #18
def test_asset_key_structure():
    src_dir = file_relative_path(__file__, "compat_tests/snapshot_0_9_16_asset_key_structure")
    with copy_directory(src_dir) as test_dir:
        asset_storage = ConsolidatedSqliteEventLogStorage(test_dir)
        asset_keys = asset_storage.get_all_asset_keys()
        assert len(asset_keys) == 5

        # get a structured asset key
        asset_key = AssetKey(["dashboards", "cost_dashboard"])

        # check that backcompat events are read
        assert asset_storage.has_asset_key(asset_key)
        events = asset_storage.get_asset_events(asset_key)
        assert len(events) == 1
        run_ids = asset_storage.get_asset_run_ids(asset_key)
        assert len(run_ids) == 1

        # check that backcompat events are merged with newly stored events
        run_id = "fake_run_id"
        asset_storage.store_event(_materialization_event_record(run_id, asset_key))
        assert asset_storage.has_asset_key(asset_key)
        events = asset_storage.get_asset_events(asset_key)
        assert len(events) == 2
        run_ids = asset_storage.get_asset_run_ids(asset_key)
        assert len(run_ids) == 2
Example #19
def test_precedence():
    @solid(input_defs=[
        InputDefinition(
            "arg_b",
            dagster_type=str,
            default_value="hi",
            description="legit",
            metadata={"explicit": True},
            root_manager_key="rudy",
            asset_key=AssetKey("table_1"),
            asset_partitions={"0"},
        )
    ])
    def precedence(_context, arg_a: int, arg_b: int, arg_c: int):
        """
        Testing

        Args:
            arg_b: boo
        """
        return arg_a + arg_b + arg_c

    assert precedence.input_defs[0].name == "arg_b"
    assert (precedence.input_defs[0].dagster_type == InputDefinition(
        "test", dagster_type=str).dagster_type)
    assert precedence.input_defs[0].description == "legit"
    assert precedence.input_defs[0].default_value == "hi"
    assert precedence.input_defs[0].metadata["explicit"]
    assert precedence.input_defs[0].root_manager_key == "rudy"
    assert precedence.input_defs[0].get_asset_key(None) is not None
    assert precedence.input_defs[0].get_asset_partitions(None) is not None
Example #20
def test_same_asset_in_multiple_pipelines():
    @asset
    def asset1():
        return 1

    @pipeline
    def graph1():
        asset1()

    @pipeline
    def graph2():
        asset1()

    external_asset_nodes = external_asset_graph_from_defs(
        [graph1, graph2], foreign_assets_by_key={})

    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey("asset1"),
            dependencies=[],
            depended_by=[],
            op_name="asset1",
            op_description=None,
            job_names=["graph1", "graph2"],
        ),
    ]
Example #21
def test_input_namespace():
    @asset(ins={"arg1": AssetIn(namespace="abc")})
    def my_asset(arg1):
        assert arg1

    assert my_asset.op.input_defs[0].get_asset_key(None) == AssetKey(
        ["abc", "arg1"])
Example #22
def solid_asset_tags(_):
    yield AssetMaterialization(asset_key=AssetKey("asset_tags"),
                               tags={
                                   "foo": "FOO",
                                   "bar": "BAR"
                               })
    yield Output(1)
Example #23
def test_foreign_asset():
    @asset
    def asset1(source1):
        assert source1 == 5
        return 1

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            return 5

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    job = build_assets_job(
        "a",
        [asset1],
        source_assets=[
            ForeignAsset(AssetKey("source1"),
                         io_manager_key="special_io_manager")
        ],
        resource_defs={"special_io_manager": my_io_manager},
    )
    assert job.graph.node_defs == [asset1.op]
    assert job.execute_in_process().success
Example #24
def test_asset_materialization(conn_string):
    event_log_storage = PostgresEventLogStorage.create_clean_storage(
        conn_string)

    asset_key = AssetKey(['path', 'to', 'asset_one'])

    @solid
    def materialize_one(_):
        yield Materialization(
            label='one',
            asset_key=asset_key,
            metadata_entries=[
                EventMetadataEntry.text('hello', 'text'),
                EventMetadataEntry.json({'hello': 'world'}, 'json'),
                EventMetadataEntry.float(1.0, 'one'),
            ],
        )
        yield Output(1)

    def _solids():
        materialize_one()

    events_one, _ = synthesize_events(_solids)
    for event in events_one:
        event_log_storage.store_event(event)

    assert asset_key in set(event_log_storage.get_all_asset_keys())
    events = event_log_storage.get_asset_events(asset_key)
    assert len(events) == 1
    event = events[0]
    assert isinstance(event, DagsterEventRecord)
    assert event.dagster_event.event_type_value == DagsterEventType.STEP_MATERIALIZATION.value
Example #25
def get_output_asset_key(self, context):
    return AssetKey(
        [
            "my_database",
            context.metadata["table_name"],
        ]
    )
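This hook reads "table_name" from the output definition's metadata; a sketch of an op that would supply it (the op name, table name, and io_manager_key are assumptions):

from dagster import Out, op

@op(out=Out(metadata={"table_name": "users"},
            io_manager_key="warehouse_io_manager"))
def build_users_table():
    # the IO manager's get_output_asset_key above would key this output
    # as AssetKey(["my_database", "users"])
    return []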
Example #26
def backcompat_materialize(_):
    yield Materialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry("text", value="text is cool"),
            MetadataEntry(
                "url", value=MetadataValue.url("https://bigty.pe/neato")),
            MetadataEntry("path",
                          value=MetadataValue.path("/tmp/awesome")),
            MetadataEntry("json", value={"is_dope": True}),
            MetadataEntry(
                "python class",
                value=MetadataValue.python_artifact(MetadataEntry)),
            MetadataEntry(
                "python function",
                value=MetadataValue.python_artifact(file_relative_path)),
            MetadataEntry("float", value=1.2),
            MetadataEntry("int", value=1),
            MetadataEntry("float NaN", value=float("nan")),
            MetadataEntry("long int", value=LONG_INT),
            MetadataEntry("pipeline run",
                          value=MetadataValue.pipeline_run("fake_run_id")),
            MetadataEntry("my asset", value=AssetKey("my_asset")),
        ],
    )
    yield Output(None)
Example #27
    def test_asset_wipe(self, graphql_context):
        _create_run(graphql_context, "single_asset_pipeline")
        _create_run(graphql_context, "multi_asset_pipeline")

        asset_keys = graphql_context.instance.all_asset_keys()
        assert AssetKey("a") in asset_keys

        result = execute_dagster_graphql(
            graphql_context, WIPE_ASSETS, variables={"assetKeys": [{"path": ["a"]}]}
        )

        assert result.data
        assert result.data["wipeAssets"]
        assert result.data["wipeAssets"]["__typename"] == "AssetWipeSuccess"

        asset_keys = graphql_context.instance.all_asset_keys()
        assert AssetKey("a") not in asset_keys
Example #28
def test_asset_group_from_current_module():
    group = AssetGroup.from_current_module()
    assert {asset.op.name for asset in group.assets} == {"asset_in_current_module"}
    assert len(group.assets) == 1
    assert {source_asset.key for source_asset in group.source_assets} == {
        AssetKey("source_asset_in_current_module")
    }
    assert len(group.source_assets) == 1
Example #29
def test_input_asset_key_and_namespace():
    with pytest.raises(check.CheckError,
                       match="key and namespace cannot both be set"):

        @asset(
            ins={"arg1": AssetIn(asset_key=AssetKey("foo"), namespace="bar")})
        def my_asset(arg1):
            assert arg1
Example #30
def test_cross_pipeline_asset_dependency():
    @asset
    def asset1():
        return 1

    @asset
    def asset2(asset1):
        assert asset1 == 1

    @pipeline
    def asset1_graph():
        asset1()

    @pipeline
    def asset2_graph():
        asset2()  # pylint: disable=no-value-for-parameter

    external_asset_nodes = external_asset_graph_from_defs(
        [asset1_graph, asset2_graph], foreign_assets_by_key={})

    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey("asset1"),
            dependencies=[],
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey("asset2"),
                    input_name="asset1")
            ],
            op_name="asset1",
            op_description=None,
            job_names=["asset1_graph"],
        ),
        ExternalAssetNode(
            asset_key=AssetKey("asset2"),
            dependencies=[
                ExternalAssetDependency(upstream_asset_key=AssetKey("asset1"),
                                        input_name="asset1")
            ],
            depended_by=[],
            op_name="asset2",
            op_description=None,
            job_names=["asset2_graph"],
        ),
    ]