def test_bollinger_analysis():
    bollinger_sda = AssetGroup(
        assets=[sp500_anomalous_events, sp500_bollinger_bands, sp500_prices],
        resource_defs={"io_manager": local_csv_io_manager},
    )
    result = bollinger_sda.build_job("test_job").execute_in_process()
    assert result.asset_materializations_for_node
def test_asset_io_manager(gcs_bucket):
    @asset
    def upstream():
        return 2

    @asset
    def downstream(upstream):
        return 1 + upstream

    @asset(partitions_def=StaticPartitionsDefinition(["apple", "orange"]))
    def partitioned():
        return 8

    fake_gcs_client = FakeGCSClient()
    asset_group = AssetGroup(
        [upstream, downstream, partitioned],
        resource_defs={
            "io_manager": gcs_pickle_asset_io_manager.configured(
                {"gcs_bucket": gcs_bucket, "gcs_prefix": "assets"}
            ),
            "gcs": ResourceDefinition.hardcoded_resource(fake_gcs_client),
        },
    )
    asset_job = asset_group.build_job(name="my_asset_job")
    result = asset_job.execute_in_process(partition_key="apple")

    assert result.success
    assert fake_gcs_client.get_all_blob_paths() == {
        f"{gcs_bucket}/assets/upstream",
        f"{gcs_bucket}/assets/downstream",
        f"{gcs_bucket}/assets/partitioned/apple",
    }
def foo():
    return [
        AssetGroup(dbt_assets_a, resource_defs={"dbt": dbt_cli_resource}).build_job("a"),
        AssetGroup(dbt_assets_b, resource_defs={"dbt": dbt_cli_resource}).build_job("b"),
    ]
def test_cereal_asset_group():
    group = AssetGroup(
        [
            nabisco_cereals,
            cereals,
            cereal_protein_fractions,
            highest_protein_nabisco_cereal,
        ]
    )
    result = group.materialize()
    assert result.success
    assert result.output_for_node("highest_protein_nabisco_cereal") == "100% Bran"
def make_activity_stats_job(asset_group: AssetGroup) -> JobDefinition:
    return asset_group.build_job(
        name="activity_stats",
        selection=[
            "comment_daily_stats",
            "story_daily_stats",
            "activity_daily_stats",
            "activity_forecast",
        ],
    )
def test_asset_io_manager(storage_account, file_system, credential):
    @asset
    def upstream():
        return 2

    @asset
    def downstream(upstream):
        assert upstream == 2
        return 1 + upstream

    asset_group = AssetGroup(
        [upstream, downstream],
        resource_defs={
            "io_manager": adls2_pickle_asset_io_manager,
            "adls2": adls2_resource,
        },
    )
    asset_job = asset_group.build_job(name="my_asset_job")
    run_config = {
        "resources": {
            "io_manager": {"config": {"adls2_file_system": file_system}},
            "adls2": {
                "config": {
                    "storage_account": storage_account,
                    "credential": {"key": credential},
                }
            },
        }
    }
    result = asset_job.execute_in_process(run_config=run_config)
    assert result.success
def make_download_job(asset_group: AssetGroup) -> JobDefinition:
    return asset_group.build_job(
        name="hacker_news_api_download",
        selection=["*comments", "*stories"],
        tags={
            "dagster-k8s/config": {
                "container_config": {
                    "resources": {
                        "requests": {"cpu": "500m", "memory": "2Gi"},
                    }
                },
            }
        },
    )
def test_download():
    with tempfile.TemporaryDirectory() as temp_dir:
        test_job = AssetGroup.from_package_name(
            "hacker_news_assets.assets",
            resource_defs={
                "io_manager": fs_io_manager,
                "partition_start": ResourceDefinition.string_resource(),
                "partition_end": ResourceDefinition.string_resource(),
                "parquet_io_manager": local_partitioned_parquet_io_manager.configured(
                    {"base_path": temp_dir}
                ),
                "warehouse_io_manager": mem_io_manager,
                "pyspark": pyspark_resource,
                "hn_client": hn_snapshot_client,
                "dbt": ResourceDefinition.none_resource(),
            },
        ).build_job(
            "test_job",
            selection=["*comments", "*stories"],
        )
        result = test_job.execute_in_process(partition_key="2020-12-30-00:00")
        assert result.success
def test_serial_asset_graph():
    result = AssetGroup.from_modules([serial_asset_graph]).materialize()
    assert result.success
    def load_input(self, context):
        assert context
        return DataFrame()


@asset
def daily_temperature_highs(sfo_q2_weather_sample: DataFrame) -> DataFrame:
    """Computes the temperature high for each day"""
    assert sfo_q2_weather_sample
    time.sleep(3)
    return DataFrame()


@asset
def hottest_dates(daily_temperature_highs: DataFrame) -> DataFrame:
    """Computes the 10 hottest dates"""
    assert daily_temperature_highs
    time.sleep(3)
    return DataFrame()


software_defined_assets = AssetGroup(
    assets=[daily_temperature_highs, hottest_dates],
    source_assets=[sfo_q2_weather_sample],
    resource_defs={
        "io_manager": IOManagerDefinition.hardcoded_io_manager(DummyIOManager())
    },
)
# pylint: disable=redefined-outer-name
# start_marker
from dagster import AssetGroup, asset, fs_asset_io_manager
from dagster_aws.s3 import s3_pickle_asset_io_manager, s3_resource


@asset
def upstream_asset():
    return [1, 2, 3]


@asset
def downstream_asset(upstream_asset):
    return upstream_asset + [4]


prod_asset_group = AssetGroup(
    [upstream_asset, downstream_asset],
    resource_defs={"io_manager": s3_pickle_asset_io_manager, "s3": s3_resource},
)

local_asset_group = AssetGroup(
    [upstream_asset, downstream_asset], resource_defs={"io_manager": fs_asset_io_manager}
)
# end_marker
def bollinger():
    return [
        AssetGroup.from_package_name(
            __name__, resource_defs={"io_manager": local_csv_io_manager}
        )
    ]
# pylint: disable=redefined-outer-name
# start_marker
from dagster import AssetGroup, AssetKey, SourceAsset, asset

my_source_asset = SourceAsset(key=AssetKey("my_source_asset"))


@asset
def my_derived_asset(my_source_asset):
    return my_source_asset + [4]


asset_group = AssetGroup(assets=[my_derived_asset], source_assets=[my_source_asset])
# end_marker
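# A minimal sketch (the repository name below is hypothetical): exposing the group
# above from a repository, following the same @repository pattern used in the other
# snippets in this section, so `my_derived_asset` can resolve `my_source_asset`.
from dagster import repository


@repository
def my_assets_repository():
    return [asset_group]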
    partitions_def=daily_partitions_def)
def downstream_daily_partitioned_asset(upstream_daily_partitioned_asset):
    assert upstream_daily_partitioned_asset is None


@asset(
    metadata={"owner": "*****@*****.**"},
    partitions_def=HourlyPartitionsDefinition(start_date=datetime(2022, 3, 12, 0, 0)),
)
def hourly_partitioned_asset():
    pass


@asset(
    metadata={
        "owner": "*****@*****.**",
        "text_metadata": "Text-based metadata about this asset",
        "path": MetadataValue.path("/unpartitioned/asset"),
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_asset"),
    },
)
def unpartitioned_asset():
    pass


partitioned_asset_group = AssetGroup.from_current_module()
        optimize.curve_fit(
            f=model_func,
            xdata=df.order_date.astype(np.int64),
            ydata=df.num_orders,
            p0=[10, 100],
        )[0]
    )


@asset(compute_kind="python", io_manager_key="pandas_io_manager")
def predicted_orders(
    daily_order_summary: pd.DataFrame, order_forecast_model: Tuple[float, float]
) -> pd.DataFrame:
    """Predicted orders for the next 30 days based on the fit parameters"""
    a, b = order_forecast_model
    start_date = daily_order_summary.order_date.max()
    future_dates = pd.date_range(start=start_date, end=start_date + pd.DateOffset(days=30))
    predicted_data = model_func(x=future_dates.astype(np.int64), a=a, b=b)
    return pd.DataFrame({"order_date": future_dates, "num_orders": predicted_data})


analytics_assets = AssetGroup(
    airbyte_assets + dbt_assets + [order_forecast_model, predicted_orders],
    resource_defs={
        "airbyte": airbyte_resource.configured(AIRBYTE_CONFIG),
        "dbt": dbt_cli_resource.configured(DBT_CONFIG),
        "pandas_io_manager": pandas_io_manager.configured(PANDAS_IO_CONFIG),
    },
).build_job("Assets")
from hacker_news_assets.resources import RESOURCES_LOCAL, RESOURCES_PROD, RESOURCES_STAGING

from dagster import AssetGroup, in_process_executor

prod_assets = AssetGroup.from_package_name(__name__, resource_defs=RESOURCES_PROD)

staging_assets = AssetGroup.from_package_name(__name__, resource_defs=RESOURCES_STAGING)

local_assets = AssetGroup.from_package_name(
    __name__, resource_defs=RESOURCES_LOCAL, executor_def=in_process_executor
)
"""isort:skip_file""" # start_asset_marker import csv import requests from dagster import asset @asset def cereals(): response = requests.get("https://docs.dagster.io/assets/cereal.csv") lines = response.text.split("\n") cereal_rows = [row for row in csv.DictReader(lines)] return cereal_rows # end_asset_marker # start_materialize_marker from dagster import AssetGroup if __name__ == "__main__": AssetGroup([cereals]).materialize() # end_materialize_marker
from dagster import AssetGroup

asset_group = AssetGroup.from_package_name(
    "docs_snippets.concepts.assets.package_with_assets"
)
def test_cereal():
    result = AssetGroup.from_modules([cereal]).materialize()
    assert result.success
def my_repo():
    return [AssetGroup(assets=[], source_assets=[foo, bar])]
    def _get_fs_path(self, asset_key: AssetKey) -> str:
        rpath = os.path.join(*asset_key.path) + ".csv"
        return os.path.abspath(rpath)

    def handle_output(self, context, obj: DataFrame):
        """This saves the dataframe as a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        obj.to_csv(fpath)

    def load_input(self, context):
        """This reads a dataframe from a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        return pd.read_csv(fpath)


# io_manager_end

# asset_group_start

# imports the module called "assets" from the package containing the current module
# the "assets" module contains the asset definitions
from . import assets

weather_assets = AssetGroup.from_modules(
    modules=[assets],
    resource_defs={
        "io_manager": IOManagerDefinition.hardcoded_io_manager(LocalFileSystemIOManager())
    },
)
# asset_group_end
def make_story_recommender_job(asset_group: AssetGroup) -> JobDefinition:
    return asset_group.build_job(
        name="story_recommender",
        selection=["comment_stories*"],
    )
# pylint: disable=redefined-outer-name
from dagster import AssetGroup, AssetIn, asset

namespace1 = ["s3", "superdomain_1", "subdomain_1", "subsubdomain_1"]


@asset(namespace=namespace1)
def asset1():
    pass


@asset(
    namespace=["s3", "superdomain_2", "subdomain_2", "subsubdomain_2"],
    ins={"asset1": AssetIn(namespace=namespace1)},
)
def asset2(asset1):
    assert asset1 is None


long_asset_keys_group = AssetGroup([asset1, asset2])
# pylint: disable=redefined-outer-name
# start_marker
from dagster import AssetGroup, asset, fs_asset_io_manager
from dagster_aws.s3 import s3_pickle_asset_io_manager, s3_resource


@asset(io_manager_key="s3_io_manager")
def upstream_asset():
    return [1, 2, 3]


@asset(io_manager_key="fs_io_manager")
def downstream_asset(upstream_asset):
    return upstream_asset + [4]


asset_group = AssetGroup(
    [upstream_asset, downstream_asset],
    resource_defs={
        "s3_io_manager": s3_pickle_asset_io_manager,
        "s3": s3_resource,
        "fs_io_manager": fs_asset_io_manager,
    },
)
# end_marker
# pylint: disable=redefined-outer-name
from dagster import AssetGroup, AssetKey, SourceAsset, asset, repository


@asset
def upstream_asset():
    return 5


upstream_asset_group = AssetGroup([upstream_asset])


@repository
def upstream_assets_repository():
    return [upstream_asset_group]


source_assets = [SourceAsset(AssetKey("upstream_asset"))]


@asset
def downstream_asset1(upstream_asset):
    assert upstream_asset


@asset
def downstream_asset2(upstream_asset):
    assert upstream_asset


downstream_asset_group1 = AssetGroup(assets=[downstream_asset1],
import random
from typing import Sequence

from dagster import AssetGroup, AssetKey, asset

N_ASSETS = 1000


def generate_big_honkin_assets() -> Sequence:
    random.seed(5438790)
    assets = []

    for i in range(N_ASSETS):
        non_argument_deps = {
            AssetKey(f"asset_{j}")
            for j in random.sample(range(i), min(i, random.randint(0, 3)))
        }

        @asset(name=f"asset_{i}", non_argument_deps=non_argument_deps)
        def some_asset():
            pass

        assets.append(some_asset)

    return assets


big_honkin_asset_group = AssetGroup(generate_big_honkin_assets())
# pylint: disable=redefined-outer-name
# start_marker
from dagster import AssetGroup, asset


@asset
def upstream():
    return [1, 2, 3]


@asset
def downstream_1(upstream):
    return upstream + [4]


@asset
def downstream_2(upstream):
    return len(upstream)


asset_group = AssetGroup([upstream, downstream_1, downstream_2])

all_assets = asset_group.build_job(name="my_asset_job")

downstream_assets = asset_group.build_job(
    name="my_asset_job", selection=["upstream", "downstream_1"]
)

upstream_and_downstream_1 = asset_group.build_job(name="my_asset_job", selection="*downstream_1")
# end_marker
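# A minimal sketch (reusing the names defined just above): executing one of the
# built jobs in-process, mirroring how the test snippets earlier in this section
# call execute_in_process on jobs returned by build_job.
if __name__ == "__main__":
    result = upstream_and_downstream_1.execute_in_process()
    assert result.success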