Example #1
def test_multi_asset_asset_materialization_planned_events():
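    # a multi_asset op with two outputs, each mapped to its own asset key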
    @multi_asset(
        outs={
            "my_out_name": Out(asset_key=AssetKey("my_asset_name")),
            "my_other_out_name": Out(asset_key=AssetKey("my_other_asset")),
        }
    )
    def my_asset():
        yield Output(1, "my_out_name")
        yield Output(2, "my_other_out_name")

    assets_job = build_assets_job("assets_job", [my_asset])

    with instance_for_test() as instance:
        result = assets_job.execute_in_process(instance=instance)
        records = instance.get_event_records(
            EventRecordsFilter(
                DagsterEventType.ASSET_MATERIALIZATION_PLANNED, AssetKey("my_asset_name")
            )
        )
        assert result.run_id == records[0].event_log_entry.run_id
        run_id = result.run_id

        assert instance.run_ids_for_asset_key(AssetKey("my_asset_name")) == [run_id]
        assert instance.run_ids_for_asset_key(AssetKey("my_other_asset")) == [run_id]
Example #2
def test_io_manager():
    df_value = pandas.DataFrame({"foo": ["bar", "baz"], "quux": [1, 2]})

    @asset(partitions_def=hourly_partitions)
    def pandas_df_asset():
        return df_value

    @asset(partitions_def=hourly_partitions)
    def spark_input_asset(pandas_df_asset: SparkDF):
        assert isinstance(pandas_df_asset, SparkDF)
        assert pandas_df_asset.count() == 2
        assert set(pandas_df_asset.columns) == {"foo", "quux"}
        return pandas_df_asset

    with tempfile.TemporaryDirectory() as temp_dir:
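        # the partitioned parquet IO manager persists the pandas output and loads it back as a Spark DataFrame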
        io_manager_test_job = build_assets_job(
            "io_manager_test_job",
            assets=[pandas_df_asset, spark_input_asset],
            resource_defs={
                "pyspark": pyspark_resource,
                "io_manager": local_partitioned_parquet_io_manager.configured(
                    {"base_path": temp_dir}
                ),
            },
        )

        expected_path = os.path.join(
            temp_dir, "pandas_df_asset-20220101160000_20220101170000.pq")
        res = io_manager_test_job.execute_in_process(
            partition_key="2022-01-01-16:00")
        assert res.success
        assert os.path.exists(expected_path)
        intermediate_df = pandas.read_parquet(expected_path)
        assert (intermediate_df == df_value).all().all()
Example #3
def test_asset_materialization_planned_event_yielded():
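    # asset_one always raises, so the downstream never_runs_asset is never materialized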
    @asset
    def asset_one():
        raise Exception("foo")

    @asset
    def never_runs_asset(asset_one):
        return asset_one

    asset_job = build_assets_job("asset_job", [asset_one, never_runs_asset])

    with instance_for_test() as instance:
        # test with only one asset selected
        result = asset_job.execute_in_process(
            instance=instance, raise_on_error=False, op_selection=["asset_one"]
        )
        run_id = result.run_id

        assert instance.run_ids_for_asset_key(AssetKey("asset_one")) == [run_id]
        assert instance.run_ids_for_asset_key(AssetKey("never_runs_asset")) == []

    with instance_for_test() as instance:  # fresh event log storage
        # test with both assets selected
        result = asset_job.execute_in_process(instance=instance, raise_on_error=False)
        run_id = result.run_id

        assert instance.run_ids_for_asset_key(AssetKey("asset_one")) == [run_id]
        assert instance.run_ids_for_asset_key(AssetKey("never_runs_asset")) == [run_id]
Example #4
def test_download():
    with tempfile.TemporaryDirectory() as temp_dir:
        test_job = build_assets_job(
            "test_job",
            assets=ASSETS,
            resource_defs={
                "io_manager": fs_io_manager,
                "partition_start": ResourceDefinition.string_resource(),
                "partition_end": ResourceDefinition.string_resource(),
                "parquet_io_manager": local_partitioned_parquet_io_manager.configured(
                    {"base_path": temp_dir}
                ),
                "warehouse_io_manager": mem_io_manager,
                "pyspark": pyspark_resource,
                "hn_client": hn_snapshot_client,
            },
        )
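        # materialize a single partition of every asset against the snapshot HN client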
        result = test_job.execute_in_process(partition_key="2020-12-30-00:00")

        assert result.success
Example #5
def define_assets_job():
    @asset
    def asset1():
        return 1

    @asset
    def asset2(asset1):
        return asset1 + 1

    return build_assets_job(
        name="assets",
        assets=[asset1, asset2],
        resource_defs={
            "io_manager": s3_pickle_asset_io_manager,
            "s3": s3_test_resource,
        },
    )
Example #6
def define_assets_job(bucket):
    @asset
    def asset1():
        return 1

    @asset
    def asset2(asset1):
        return asset1 + 1

    @asset(partitions_def=StaticPartitionsDefinition(["apple", "orange"]))
    def partitioned():
        return 8

    return build_assets_job(
        name="assets",
        assets=[asset1, asset2, partitioned],
        resource_defs={
            "io_manager": s3_pickle_asset_io_manager.configured({"s3_bucket": bucket}),
            "s3": s3_test_resource,
        },
    )
Example #7
def test_io_manager_single_partition_add_input_metadata():
    partitions_def = StaticPartitionsDefinition(["a", "b", "c"])

    @asset(partitions_def=partitions_def)
    def asset_1():
        return 1

    @asset(partitions_def=partitions_def)
    def asset_2(asset_1):
        return asset_1 + 1

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            context.add_input_metadata(metadata={"foo": "bar"}, description="hello world")
            return 1

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    assets_job = build_assets_job(
        "assets_job", [asset_1, asset_2], resource_defs={"io_manager": my_io_manager}
    )
    result = assets_job.execute_in_process(partition_key="a")
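    # the metadata added in load_input should surface as an ASSET_OBSERVATION event on asset_2's step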

    get_observation = lambda event: event.event_specific_data.asset_observation

    observations = [
        event for event in result.all_node_events if event.event_type_value == "ASSET_OBSERVATION"
    ]

    assert observations[0].step_key == "asset_2"
    assert get_observation(observations[0]) == AssetObservation(
        asset_key="asset_1", metadata={"foo": "bar"}, description="hello world", partition="a"
    )
Example #8
class LocalFileSystemIOManager(IOManager):
    """Translates between Pandas DataFrames and CSVs on the local filesystem."""
    def _get_fs_path(self, asset_key: AssetKey) -> str:
        rpath = os.path.join(*asset_key.path) + ".csv"
        return os.path.abspath(rpath)

    def handle_output(self, context, obj: DataFrame):
        """This saves the dataframe as a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        obj.to_csv(fpath)

    def load_input(self, context):
        """This reads a dataframe from a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        return pd.read_csv(fpath)


# io_manager_end

# build_assets_job_start
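# sfo_q2_weather_sample is a source asset: the job loads it as an input but never materializes it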
weather_job = build_assets_job(
    "weather",
    assets=[daily_temperature_highs, hottest_dates],
    source_assets=[sfo_q2_weather_sample],
    resource_defs={
        "io_manager": IOManagerDefinition.hardcoded_io_manager(LocalFileSystemIOManager()),
    },
)
# build_assets_job_end
Example #9
import random
from typing import Sequence

from dagster import AssetKey, asset, build_assets_job

N_ASSETS = 1000


def generate_big_honkin_assets() -> Sequence:
    random.seed(5438790)
    assets = []

    for i in range(N_ASSETS):
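        # each asset depends on up to three randomly chosen earlier assets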
        non_argument_deps = {
            AssetKey(f"asset_{j}")
            for j in random.sample(range(i), min(i, random.randint(0, 3)))
        }

        @asset(name=f"asset_{i}", non_argument_deps=non_argument_deps)
        def some_asset():
            pass

        assets.append(some_asset)

    return assets


big_honkin_assets_job = build_assets_job("big_honkin_assets_job",
                                         generate_big_honkin_assets())
Example #10
def test_assets(schema_prefix):

    ab_resource = airbyte_resource(
        build_init_resource_context(config={
            "host": "some_host",
            "port": "8000",
        }))
    destination_tables = ["foo", "bar"]
    if schema_prefix:
        destination_tables = [schema_prefix + t for t in destination_tables]
    ab_assets = build_airbyte_assets(
        "12345",
        destination_tables=destination_tables,
        asset_key_prefix=["some", "prefix"],
    )

    assert ab_assets[0].asset_keys == {
        AssetKey(["some", "prefix", t])
        for t in destination_tables
    }
    assert len(ab_assets[0].op.output_defs) == 2

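    # stub the Airbyte endpoints hit during the sync: connection details, sync trigger, and job status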
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/get",
        json=get_sample_connection_json(prefix=schema_prefix),
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/sync",
        json={"job": {
            "id": 1
        }},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/jobs/get",
        json=get_sample_job_json(schema_prefix=schema_prefix),
        status=200,
    )

    ab_job = build_assets_job(
        "ab_job",
        ab_assets,
        resource_defs={
            "airbyte": airbyte_resource.configured({
                "host": "some_host",
                "port": "8000",
            })
        },
    )

    res = ab_job.execute_in_process()

    materializations = [
        event.event_specific_data.materialization
        for event in res.events_for_node("airbyte_sync_12345")
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 3
    assert {m.asset_key
            for m in materializations} == {
                AssetKey(["some", "prefix", schema_prefix + "foo"]),
                AssetKey(["some", "prefix", schema_prefix + "bar"]),
                AssetKey(["some", "prefix", schema_prefix + "baz"]),
            }
    assert MetadataEntry("bytesEmitted",
                         value=1234) in materializations[0].metadata_entries
    assert MetadataEntry("recordsCommitted",
                         value=4321) in materializations[0].metadata_entries
    assert (MetadataEntry(
        "schema",
        value=TableSchema(columns=[
            TableColumn(name="a", type="str"),
            TableColumn(name="b", type="int"),
        ]),
    ) in materializations[0].metadata_entries)
Example #11
def test_assets():

    ab_resource = airbyte_resource(
        build_init_resource_context(config={
            "host": "some_host",
            "port": "8000",
        }))
    ab_assets = build_airbyte_assets("12345", ["foo", "bar"],
                                     asset_key_prefix=["some", "prefix"])

    assert len(ab_assets[0].op.output_defs) == 2

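    # stub /connections/get with an inline catalog of three selected streams (foo, bar, baz)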
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/get",
        json={
            "name": "xyz",
            "syncCatalog": {
                "streams": [
                    {
                        "stream": {
                            "name": "foo",
                            "jsonSchema": {
                                "properties": {
                                    "a": {
                                        "type": "str"
                                    },
                                    "b": {
                                        "type": "int"
                                    }
                                }
                            },
                        },
                        "config": {
                            "selected": True
                        },
                    },
                    {
                        "stream": {
                            "name": "bar",
                            "jsonSchema": {
                                "properties": {
                                    "c": {
                                        "type": "str"
                                    },
                                }
                            },
                        },
                        "config": {
                            "selected": True
                        },
                    },
                    {
                        "stream": {
                            "name": "baz",
                            "jsonSchema": {
                                "properties": {
                                    "d": {
                                        "type": "str"
                                    },
                                }
                            },
                        },
                        "config": {
                            "selected": True
                        },
                    },
                ]
            },
        },
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/sync",
        json={"job": {
            "id": 1
        }},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/jobs/get",
        json={
            "job": {
                "id": 1,
                "status": AirbyteState.SUCCEEDED
            },
            "attempts": [{
                "attempt": {
                    "streamStats": [
                        {
                            "streamName": "foo",
                            "stats": {
                                "bytesEmitted": 1234,
                                "recordsCommitted": 4321,
                            },
                        },
                        {
                            "streamName": "bar",
                            "stats": {
                                "bytesEmitted": 1234,
                                "recordsCommitted": 4321,
                            },
                        },
                        {
                            "streamName": "baz",
                            "stats": {
                                "bytesEmitted": 1111,
                                "recordsCommitted": 1111,
                            },
                        },
                    ]
                }
            }],
        },
        status=200,
    )

    ab_job = build_assets_job(
        "ab_job",
        ab_assets,
        resource_defs={
            "airbyte": airbyte_resource.configured({
                "host": "some_host",
                "port": "8000",
            })
        },
    )

    res = ab_job.execute_in_process()

    materializations = [
        event for event in res.events_for_node("airbyte_sync_12345")
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 3
    assert (MetadataEntry.text("a,b", "columns") in materializations[0].
            event_specific_data.materialization.metadata_entries)
    assert (MetadataEntry.int(1234, "bytesEmitted") in materializations[0].
            event_specific_data.materialization.metadata_entries)
    assert (MetadataEntry.int(4321, "recordsCommitted") in materializations[0].
            event_specific_data.materialization.metadata_entries)
Example #12
def test_fivetran_asset_run(tables, should_error):

    ft_resource = fivetran_resource.configured({
        "api_key": "foo",
        "api_secret": "bar"
    })
    final_data = {"succeeded_at": "2021-01-01T02:00:00.0Z"}
    api_prefix = f"{FIVETRAN_API_BASE}/{FIVETRAN_CONNECTOR_PATH}{DEFAULT_CONNECTOR_ID}"

    fivetran_assets = build_fivetran_assets(
        connector_id=DEFAULT_CONNECTOR_ID,
        destination_tables=tables,
        poll_interval=0.1,
        poll_timeout=10,
    )

    # expect the multi-asset to expose one asset key and one output per specified destination table
    assert fivetran_assets[0].asset_keys == {
        AssetKey(table.split("."))
        for table in tables
    }
    assert len(fivetran_assets[0].op.output_defs) == len(tables)

    fivetran_assets_job = build_assets_job(
        name="fivetran_assets_job",
        assets=fivetran_assets,
        resource_defs={"fivetran": ft_resource},
    )

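    # stub the Fivetran API: connector update, forced sync, schema config, and two status polls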
    with responses.RequestsMock() as rsps:
        rsps.add(rsps.PATCH, api_prefix, json=get_sample_update_response())
        rsps.add(rsps.POST,
                 f"{api_prefix}/force",
                 json=get_sample_sync_response())
        # connector schema
        rsps.add(
            rsps.GET,
            f"{api_prefix}/schemas",
            json=get_sample_connector_schema_config(tables=[
                ("schema1", "tracked"),
                ("schema1", "untracked"),
                ("schema2", "tracked"),
            ]),
        )
        # initial state
        rsps.add(rsps.GET, api_prefix, json=get_sample_connector_response())
        # final state will be updated
        rsps.add(rsps.GET,
                 api_prefix,
                 json=get_sample_connector_response(data=final_data))

        if should_error:
            with pytest.raises(DagsterStepOutputNotFoundError):
                fivetran_assets_job.execute_in_process()
        else:
            result = fivetran_assets_job.execute_in_process()
            assert result.success
            # make sure we only have outputs for the explicit asset keys
            outputs = [
                event for event in result.events_for_node(
                    f"fivetran_sync_{DEFAULT_CONNECTOR_ID}")
                if event.event_type_value == "STEP_OUTPUT"
            ]
            assert len(outputs) == len(tables)

            # make sure we have asset materializations for all the schemas/tables that were actually sync'd
            asset_materializations = [
                event for event in result.events_for_node(
                    f"fivetran_sync_{DEFAULT_CONNECTOR_ID}")
                if event.event_type_value == "ASSET_MATERIALIZATION"
            ]
            assert len(asset_materializations) == 3
            found_asset_keys = set(
                mat.event_specific_data.materialization.asset_key
                for mat in asset_materializations)
            assert found_asset_keys == {
                AssetKey(["schema1", "tracked"]),
                AssetKey(["schema1", "untracked"]),
                AssetKey(["schema2", "tracked"]),
            }
Example #13
                    "memory": "2Gi"
                },
            }
        },
    }
}

ASSETS = [id_range_for_time, items, comments, stories]

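# prod and staging variants of the download job share the same assets and a subsampling HN API client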
download_prod_job = build_assets_job(
    "hacker_news_api_download",
    assets=ASSETS,
    resource_defs={
        **{
            "hn_client": hn_api_subsample_client.configured({
                "sample_rate": 10
            })
        },
        **RESOURCES_PROD,
    },
    tags=DOWNLOAD_TAGS,
)

download_staging_job = build_assets_job(
    "hacker_news_api_download",
    assets=ASSETS,
    resource_defs={
        **{
            "hn_client": hn_api_subsample_client.configured({
                "sample_rate": 10
            })
        },
        **RESOURCES_STAGING,
    },
    tags=DOWNLOAD_TAGS,
)
Example #14
        "Rows": num_rows[0]
    }


# this list has one element per dbt model
assets = load_assets_from_dbt_manifest(
    json.load(open(os.path.join(DBT_PROJECT_DIR, "target", "manifest.json"))),
    runtime_metadata_fn=asset_metadata,
    io_manager_key="warehouse_io_manager",
)
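# staging and prod variants of the activity_stats job differ only in their resource set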
activity_stats_staging_job = build_assets_job(
    "activity_stats",
    assets,
    [],
    resource_defs={
        **RESOURCES_STAGING,
        **{
            "dbt": dbt_prod_resource
        }
    },
)

activity_stats_prod_job = build_assets_job(
    "activity_stats",
    assets,
    [],
    resource_defs={
        **RESOURCES_PROD,
        **{
            "dbt": dbt_prod_resource
        }
    },
)
Example #15
from hacker_news_assets.assets.comment_stories import comment_stories
from hacker_news_assets.assets.items import comments, stories
from hacker_news_assets.assets.recommender_model import component_top_stories, recommender_model
from hacker_news_assets.assets.user_story_matrix import user_story_matrix
from hacker_news_assets.assets.user_top_recommended_stories import user_top_recommended_stories
from hacker_news_assets.resources import RESOURCES_PROD, RESOURCES_STAGING

assets = [
    comment_stories,
    user_story_matrix,
    recommender_model,
    component_top_stories,
    user_top_recommended_stories,
]

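# comments and stories are consumed as source assets rather than materialized by these jobs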
source_assets = [comments, stories]

story_recommender_prod_job = build_assets_job(
    "story_recommender",
    assets=assets,
    source_assets=source_assets,
    resource_defs=RESOURCES_PROD,
)

story_recommender_staging_job = build_assets_job(
    "story_recommender",
    assets=assets,
    source_assets=source_assets,
    resource_defs=RESOURCES_STAGING,
)
Example #16
# pylint: disable=redefined-outer-name
from dagster import AssetIn, asset, build_assets_job

namespace1 = ["s3", "superdomain_1", "subdomain_1", "subsubdomain_1"]


@asset(namespace=namespace1)
def asset1():
    pass


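# asset2 lives under a different namespace and wires in asset1 through an explicit AssetIn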
@asset(
    namespace=["s3", "superdomain_2", "subdomain_2", "subsubdomain_2"],
    ins={"asset1": AssetIn(namespace=namespace1)},
)
def asset2(asset1):
    assert asset1 is None


long_asset_keys_job = build_assets_job("long_asset_keys_job", assets=[asset1, asset2])