def dbt_cli_snapshot_freshness(context) -> Dict:
    """This solid executes ``dbt source snapshot-freshness`` via the dbt CLI."""
    # Every CLI setting is read from the solid config; only the "select",
    # "output" and "threads" flags are passed through to dbt.
    solid_config = context.solid_config

    cli_output = execute_cli(
        solid_config["dbt_executable"],
        command=("source", "snapshot-freshness"),
        flags_dict=passthrough_flags_only(solid_config, ("select", "output", "threads")),
        log=context.log,
        warn_error=solid_config["warn-error"],
        ignore_handled_error=solid_config["ignore_handled_error"],
    )

    # Record the CLI output as a materialization before emitting it as the output.
    cli_output_entry = EventMetadataEntry.json(cli_output, label="CLI Output")
    yield AssetMaterialization(
        asset_key="dbt_source_snapshot-freshness_cli_output",
        description="Output from the CLI execution of `dbt source snapshot-freshness`.",
        metadata_entries=[cli_output_entry],
    )

    yield Output(cli_output)
def dbt_cli_snapshot(context) -> Dict:
    """This solid executes ``dbt snapshot`` via the dbt CLI."""
    # Only the "threads", "models" and "exclude" flags are passed through to dbt.
    solid_config = context.solid_config

    cli_output = execute_cli(
        solid_config["dbt_executable"],
        command=("snapshot",),
        flags_dict=passthrough_flags_only(solid_config, ("threads", "models", "exclude")),
        log=context.log,
        warn_error=solid_config["warn-error"],
        ignore_handled_error=solid_config["ignore_handled_error"],
    )

    # The materialization is opt-in through the "yield_materializations" config key.
    if solid_config["yield_materializations"]:
        cli_output_entry = EventMetadataEntry.json(cli_output, label="CLI Output")
        yield AssetMaterialization(
            asset_key="dbt_snapshot_cli_output",
            description="Output from the CLI execution of `dbt snapshot`.",
            metadata_entries=[cli_output_entry],
        )

    yield Output(cli_output)
def materialize(_):
    # Emits a single materialization carrying one metadata entry per
    # EventMetadataEntry constructor used below, then a None output.
    all_entries = [
        EventMetadataEntry.text("text is cool", "text"),
        EventMetadataEntry.url("https://bigty.pe/neato", "url"),
        EventMetadataEntry.fspath("/tmp/awesome", "path"),
        EventMetadataEntry.json({"is_dope": True}, "json"),
        EventMetadataEntry.python_artifact(EventMetadataEntry, "python class"),
        EventMetadataEntry.python_artifact(file_relative_path, "python function"),
        EventMetadataEntry.float(1.2, "float"),
        EventMetadataEntry.int(1, "int"),
        EventMetadataEntry.float(float("nan"), "float NaN"),
        EventMetadataEntry.int(LONG_INT, "long int"),
        EventMetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
        EventMetadataEntry.asset(AssetKey("my_asset"), "my asset"),
    ]

    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=all_entries,
    )
    yield Output(None)
def test_access_partition_keys_from_context_only_one_asset_partitioned():
    """Run a partitioned upstream asset feeding an unpartitioned downstream asset.

    The IO manager and the upstream asset body assert what partition
    information their contexts expose when the job runs for partition "b".
    """
    upstream_partitions_def = StaticPartitionsDefinition(["a", "b", "c"])

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            op_name = context.op_def.name

            if op_name == "upstream_asset":
                assert context.asset_partition_key == "b"
                return

            if op_name == "downstream_asset":
                assert not context.has_asset_partitions
                with pytest.raises(Exception):  # TODO: better error message
                    assert context.asset_partition_key_range
                return

            # No other op is expected to write through this IO manager.
            assert False

        def load_input(self, context):
            assert not context.has_asset_partitions

    @asset(partitions_def=upstream_partitions_def)
    def upstream_asset(context):
        assert context.output_asset_partition_key() == "b"

    @asset
    def downstream_asset(upstream_asset):
        assert upstream_asset is None

    io_manager_def = IOManagerDefinition.hardcoded_io_manager(MyIOManager())
    my_job = build_assets_job(
        "my_job",
        assets=[upstream_asset, downstream_asset],
        resource_defs={"io_manager": io_manager_def},
    )

    result = my_job.execute_in_process(partition_key="b")

    upstream_materializations = result.asset_materializations_for_node("upstream_asset")
    assert upstream_materializations == [
        AssetMaterialization(asset_key=AssetKey(["upstream_asset"]), partition="b")
    ]
def dbt_cli_test(context) -> DbtCliOutput:
    """This solid executes ``dbt test`` via the dbt CLI."""
    solid_config = context.solid_config

    raw_cli_output = execute_cli(
        solid_config["dbt_executable"],
        command=("test",),
        flags_dict=passthrough_flags_only(
            solid_config,
            ("data", "schema", "fail-fast", "threads", "models", "exclude"),
        ),
        log=context.log,
        warn_error=solid_config["warn-error"],
        ignore_handled_error=solid_config["ignore_handled_error"],
    )

    # Merge the parsed run results with the CLI output; on a key collision the
    # CLI output wins because it is unpacked last.
    run_results = parse_run_results(solid_config["project-dir"])
    cli_output = {**run_results, **raw_cli_output}

    # The materialization is opt-in through the "yield_materializations" config key.
    if solid_config["yield_materializations"]:
        cli_output_entry = EventMetadataEntry.json(cli_output, label="CLI Output")
        yield AssetMaterialization(
            asset_key="dbt_test_cli_output",
            description="Output from the CLI execution of `dbt test`.",
            metadata_entries=[cli_output_entry],
        )

    yield Output(DbtCliOutput.from_dict(cli_output), output_name="dbt_output")
def _table_data_to_materialization(
    fivetran_output: FivetranOutput,
    asset_key_prefix: List[str],
    schema_name: str,
    table_data: Dict[str, Any],
) -> AssetMaterialization:
    """Build the materialization event for one table of a Fivetran sync.

    Args:
        fivetran_output: Sync output; its ``connector_details`` are used to
            build the connector URL metadata.
        asset_key_prefix: Leading components of the resulting asset key.
        schema_name: Schema name, used in the asset key and the description.
        table_data: Table description; reads ``name_in_destination``,
            ``enabled`` and, when present, ``columns``.

    Returns:
        The materialization for the table, or ``None`` when the table's
        ``enabled`` flag is falsy (despite the declared return annotation).
    """
    destination_table = table_data["name_in_destination"]
    table_asset_key = asset_key_prefix + [schema_name, destination_table]

    # Disabled tables produce no materialization.
    if not table_data["enabled"]:
        return None

    connector_url = get_fivetran_connector_url(fivetran_output.connector_details)
    metadata = {"connector_url": MetadataValue.url(connector_url)}

    # Column info is attached only when the table reports a non-empty value.
    columns = table_data.get("columns")
    if columns:
        metadata["column_info"] = MetadataValue.json(columns)

    return AssetMaterialization(
        asset_key=table_asset_key,
        description=f"Table generated via Fivetran sync: {schema_name}.{destination_table}",
        metadata=metadata,
    )
def many_table_materializations(_context):
    # Reads the markdown example that sits next to this file once, then yields
    # one "table_info" materialization for every entry of raw_tables.
    markdown_path = file_relative_path(__file__, MARKDOWN_EXAMPLE)
    with open(markdown_path, "r") as markdown_file:
        md_str = markdown_file.read()

    for table in raw_tables:
        table_metadata = {
            "table_name": table,
            "table_path": EventMetadata.path(f"/path/to/{table}"),
            "table_data": {"name": table},
            "table_name_big": EventMetadata.url(f"https://bigty.pe/{table}"),
            "table_blurb": EventMetadata.md(md_str),
            "big_int": 29119888133298982934829348,
            "float_nan": float("nan"),
        }
        yield AssetMaterialization(asset_key="table_info", metadata=table_metadata)
def many_materializations_and_passing_expectations(_context):
    # For each hard-coded table name, yields a "table_info" materialization
    # followed by a passing row-count expectation.
    table_names = (
        "users",
        "groups",
        "events",
        "friends",
        "pages",
        "fans",
        "event_admins",
        "group_admins",
    )

    for table in table_names:
        raw_path = EventMetadata.path(f"/path/to/{table}.raw")
        yield AssetMaterialization(
            asset_key="table_info",
            metadata={"table_path": raw_path},
        )
        yield ExpectationResult(
            success=True,
            label=f"{table}.row_count",
            description=f"Row count passed for {table}",
        )
def sort_by_calories(context, cereals):
    # Sorts the cereal rows by integer calorie count, logs the least and most
    # caloric names, writes the sorted rows to
    # output/calories_sorted_<run_id>.csv and yields a materialization that
    # points at that file, followed by a None output.
    #
    # `cereals` must be a non-empty sequence of dict rows with "calories" and
    # "name" keys; an empty sequence raises IndexError at the first lookup.
    sorted_cereals = sorted(cereals, key=lambda cereal: int(cereal["calories"]))

    least_caloric = sorted_cereals[0]["name"]
    most_caloric = sorted_cereals[-1]["name"]
    context.log.info(f"Least caloric cereal: {least_caloric}")
    context.log.info(f"Most caloric cereal: {most_caloric}")

    # The header row comes from the keys of the first sorted row.
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(f"output/calories_sorted_{context.run_id}.csv")
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)

    # newline="" lets the csv module control line endings itself; without it,
    # platforms that translate "\n" produce an extra blank line after each row.
    with open(sorted_cereals_csv_path, "w", newline="") as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)

    yield AssetMaterialization(
        asset_key="sorted_cereals_csv",
        description="Cereals data frame sorted by caloric content",
        metadata={"sorted_cereals_csv_path": EventMetadata.path(sorted_cereals_csv_path)},
    )
    yield Output(None)
def made_solid(context):
    # Simulates work for one partition: optionally sleeps in proportion to the
    # computed data size, optionally fails at random, and optionally yields a
    # materialization for the configured asset key.
    partition_date = datetime.strptime(context.solid_config["partition"], DEFAULT_DATE_FORMAT)

    if data_size_fn:
        data_size = data_size_fn(partition_date)
        time.sleep(sleep_factor * data_size)

    # The random draw happens on every call, even when no error rate is set.
    rand = random()
    if error_rate and rand < error_rate:
        raise IntentionalRandomFailure(f"random {rand} < error rate {error_rate}")

    if not asset_key:
        return

    # data_size is only bound when a data_size_fn was supplied.
    if data_size_fn:
        metadata = {"Data size (bytes)": data_size}
    else:
        metadata = None

    yield AssetMaterialization(
        asset_key=asset_key,
        metadata=metadata,
        partition=context.solid_config.get("partition"),
    )
def _materialization_for_stream(
    name: str,
    stream_info: Dict[str, Any],
    stream_stats: Dict[str, Any],
    asset_key_prefix: List[str],
) -> AssetMaterialization:
    """Build the materialization event for a single stream.

    Args:
        name: Stream name; appended to ``asset_key_prefix`` to form the key.
        stream_info: Stream description; the column definitions are read from
            ``stream_info["stream"]["jsonSchema"]["properties"]``.
        stream_stats: Extra metadata for the stream; entries whose value is
            ``None`` are left out.
        asset_key_prefix: Leading components of the resulting asset key.

    Returns:
        A materialization whose metadata holds the table schema, a
        comma-joined list of column names, and the non-``None`` stats.
    """
    properties = stream_info["stream"]["jsonSchema"]["properties"]

    schema_columns = [
        TableColumn(name=column_name, type=str(column_info["type"]))
        for column_name, column_info in properties.items()
    ]
    reported_stats = {
        stat_name: stat_value
        for stat_name, stat_value in stream_stats.items()
        if stat_value is not None
    }

    return AssetMaterialization(
        asset_key=asset_key_prefix + [name],
        metadata={
            "schema": MetadataValue.table_schema(TableSchema(columns=schema_columns)),
            "columns": ",".join(properties.keys()),
            # Stats are unpacked last, so a stat named "schema" or "columns"
            # overrides the entries above.
            **reported_stats,
        },
    )
def made_solid(context):
    # Simulates work for one partition: optionally sleeps in proportion to the
    # computed data size, optionally fails at random, and optionally yields a
    # materialization for the configured asset key.
    partition_date = datetime.strptime(context.solid_config["partition"], DEFAULT_DATE_FORMAT)

    if data_size_fn:
        data_size = data_size_fn(partition_date)
        sleep_time = sleep_factor * data_size
        time.sleep(sleep_time)

    if error_rate and random() < error_rate:
        raise Exception("blah")

    if asset_key:
        # Work on a copy: appending to the shared
        # materialization_metadata_entries list would add another
        # "Data size (bytes)" entry on every invocation of this solid.
        metadata_entries = list(materialization_metadata_entries or [])
        if data_size_fn:
            metadata_entries.append(EventMetadataEntry.float(data_size, "Data size (bytes)"))

        yield AssetMaterialization(
            asset_key=asset_key,
            # An empty list is passed on as None, as before.
            metadata_entries=metadata_entries or None,
            partition=context.solid_config.get("partition"),
        )
def read_file(context):
    # Stats the configured file and, when it exists, yields a materialization
    # describing it followed by the relative filename as the output. When the
    # file is missing, an error is logged and nothing is yielded.
    relative_filename = context.solid_config["filename"]
    directory = context.solid_config["directory"]
    filename = os.path.join(directory, relative_filename)

    # Only the stat call is guarded, so a FileNotFoundError raised anywhere
    # else is not misreported as this file being missing.
    try:
        fstats = os.stat(filename)
    except FileNotFoundError:
        context.log.error("No file found: {}".format(relative_filename))
        return

    context.log.info("Found file {}".format(relative_filename))
    yield AssetMaterialization(
        asset_key=AssetKey(["log_file", relative_filename]),
        metadata_entries=[
            EventMetadataEntry.fspath(filename),
            EventMetadataEntry.json(
                {
                    "size": fstats.st_size,
                    "ctime": fstats.st_ctime,
                    "mtime": fstats.st_mtime,
                },
                "File stats",
            ),
        ],
    )
    yield Output(relative_filename)
def dbt_cli_snapshot_freshness(context) -> DbtCliResult:
    """This solid executes ``dbt source snapshot-freshness`` via the dbt CLI."""
    solid_config = context.solid_config

    dbt_logs, shell_output, exit_code = execute_dbt(
        solid_config["dbt_executable"],
        command=("source", "snapshot-freshness"),
        flags_dict=passthrough_flags_only(solid_config, ("select", "output", "threads")),
        log=context.log,
        warn_error=solid_config["warn-error"],
        ignore_handled_error=solid_config["ignore_handled_error"],
    )

    # The return code is recorded as a float entry and the raw shell output
    # as a text entry.
    return_code_entry = EventMetadataEntry.float(
        label="return_code",
        value=float(exit_code),
        description="The return code of a shell exeuction of `dbt source snapshot-freshness`.",
    )
    raw_output_entry = EventMetadataEntry.text(
        label="raw_output",
        text=shell_output,
        description="The raw output of a shell execution of `dbt source snapshot-freshness`.",
    )

    yield AssetMaterialization(
        # TODO: Perhaps derive asset key from CLI flags?
        asset_key="dbt_cli_snapshot_freshness-shell_output",
        description="The output of a shell execution of `dbt source snapshot-freshness`.",
        metadata_entries=[return_code_entry, raw_output_entry],
    )

    yield Output(DbtCliResult(logs=dbt_logs, raw_output=shell_output, return_code=exit_code))
def asset_yields_observation():
    # Yields an observation and then a materialization for the same asset key,
    # followed by the output value 5.
    observation_metadata = {"text": "FOO"}
    yield AssetObservation(
        asset_key=AssetKey("asset_yields_observation"),
        metadata=observation_metadata,
    )

    yield AssetMaterialization(asset_key=AssetKey("asset_yields_observation"))

    yield Output(5)
def test_access_partition_keys_from_context_non_identity_partition_mapping():
    """Run two partitioned assets linked by a non-identity partition mapping.

    The job runs for partition "2". The IO manager and the asset bodies assert
    which partition key, or key range, their contexts expose; the downstream
    input is expected to cover the upstream range "1".."2".
    """
    upstream_partitions_def = StaticPartitionsDefinition(["1", "2", "3"])
    downstream_partitions_def = StaticPartitionsDefinition(["1", "2", "3"])

    class TrailingWindowPartitionMapping(PartitionMapping):
        """
        Maps each downstream partition to two partitions in the upstream asset: itself and the
        preceding partition.
        """

        def get_upstream_partitions_for_partition_range(
            self,
            downstream_partition_key_range: PartitionKeyRange,
            downstream_partitions_def: PartitionsDefinition,
            upstream_partitions_def: PartitionsDefinition,
        ) -> PartitionKeyRange:
            assert downstream_partitions_def
            assert upstream_partitions_def

            start, end = downstream_partition_key_range
            # Extend the range one partition back, clamped at partition "1".
            return PartitionKeyRange(str(max(1, int(start) - 1)), end)

        def get_downstream_partitions_for_partition_range(
            self,
            upstream_partition_key_range: PartitionKeyRange,
            downstream_partitions_def: PartitionsDefinition,
            upstream_partitions_def: PartitionsDefinition,
        ) -> PartitionKeyRange:
            raise NotImplementedError()

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            assert context.asset_partition_key == "2"

        def load_input(self, context):
            start, end = context.asset_partition_key_range
            # The tuple must be parenthesized: `assert start, end == (...)`
            # only checks that `start` is truthy and uses the comparison as
            # the failure message, so the range was never actually compared.
            assert (start, end) == ("1", "2")

    @asset(partitions_def=upstream_partitions_def)
    def upstream_asset(context):
        assert context.output_asset_partition_key() == "2"

    @asset(
        partitions_def=downstream_partitions_def,
        partition_mappings={"upstream_asset": TrailingWindowPartitionMapping()},
    )
    def downstream_asset(context, upstream_asset):
        assert context.output_asset_partition_key() == "2"
        assert upstream_asset is None

    my_job = build_assets_job(
        "my_job",
        assets=[upstream_asset, downstream_asset],
        resource_defs={"io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())},
    )
    result = my_job.execute_in_process(partition_key="2")

    assert result.asset_materializations_for_node("upstream_asset") == [
        AssetMaterialization(AssetKey(["upstream_asset"]), partition="2")
    ]
    assert result.asset_materializations_for_node("downstream_asset") == [
        AssetMaterialization(AssetKey(["downstream_asset"]), partition="2")
    ]
def yield_partition_materialization():
    # Yields a materialization for partition "c" of this asset key, followed
    # by the output value 5.
    partitioned_key = AssetKey("yield_partition_materialization")
    yield AssetMaterialization(asset_key=partitioned_key, partition="c")

    yield Output(5)
def save_to_file_materialization(_, cfg, value):
    # Writes str(value) to the file named by cfg["path"] and returns a
    # materialization that records where the value was written.
    #
    # `path` was previously referenced without ever being defined, so the
    # function raised NameError after writing the file; it is bound here.
    path = cfg["path"]

    with open(path, "w") as ff:
        ff.write(str(value))

    return AssetMaterialization(
        "path",
        "Wrote out value to {path}".format(path=path),
        metadata={"path": path},
    )
def test_out_of_pipeline_yield_event():
    """Check that Manager.yield_event returns an event equal to the one passed in."""
    manager = Manager()

    returned_event = manager.yield_event(AssetMaterialization("foo"))

    assert returned_event == AssetMaterialization("foo")
def solid_two(_):
    # Yields materializations for a single-part key and a multi-part key,
    # followed by the output value 1.
    flat_key = AssetKey("asset_2")
    yield AssetMaterialization(asset_key=flat_key)

    nested_key = AssetKey(["path", "to", "asset_3"])
    yield AssetMaterialization(asset_key=nested_key)

    yield Output(1)
def solid_normalization(_):
    # Yields a materialization whose asset key is given as a plain string
    # containing "/" and "-", followed by the output value 1.
    raw_key = "path/to-asset_4"
    yield AssetMaterialization(asset_key=raw_key)

    yield Output(1)
def solid_one(_):
    # Yields a materialization for "asset_1", followed by the output value 1.
    first_asset_key = AssetKey("asset_1")
    yield AssetMaterialization(asset_key=first_asset_key)

    yield Output(1)
def materialize_two(_):
    # Yields a materialization for the externally defined asset_key_two,
    # followed by the output value 1.
    materialization = AssetMaterialization(asset_key=asset_key_two)
    yield materialization

    yield Output(1)
def materialize_one(_):
    # Yields a materialization for the externally defined asset_key_one,
    # followed by the output value 1.
    materialization = AssetMaterialization(asset_key=asset_key_one)
    yield materialization

    yield Output(1)
def solid_asset_a(_):
    # Yields a materialization for asset "a", followed by the output value 1.
    materialization = AssetMaterialization(asset_key="a")
    yield materialization

    yield Output(1)
def solid_partitioned_asset(_):
    # Yields a materialization for partition "partition_1" of asset "a",
    # followed by the output value 1.
    materialization = AssetMaterialization(asset_key="a", partition="partition_1")
    yield materialization

    yield Output(1)
def solid_asset_b(_, num):
    # Yields materializations for assets "b" and "c" with a 0.1 second pause
    # between them, then passes `num` through as the output.
    yield AssetMaterialization(asset_key="b")

    pause_seconds = 0.1
    time.sleep(pause_seconds)

    yield AssetMaterialization(asset_key="c")

    yield Output(num)
def handle_output(self, context, obj):
    """Store ``obj`` under the run-scoped output identifier and yield two materializations.

    The identifier from ``context.get_run_scoped_output_identifier()`` is
    converted to a tuple and used as the key into ``self.values``.
    """
    identifier = tuple(context.get_run_scoped_output_identifier())
    self.values[identifier] = obj

    for yielded_key in ("yield_one", "yield_two"):
        yield AssetMaterialization(asset_key=yielded_key)
def tag_asset_solid(_):
    # Yields a materialization for asset "a" carrying a single tag, followed
    # by the output value 1.
    asset_tags = {"foo": "FOO"}
    yield AssetMaterialization(asset_key="a", tags=asset_tags)

    yield Output(1)
def _stats_records(run_id):
    """Build a fixed sequence of event records for steps A-D of one run.

    Timestamps are offsets back from the current time: A succeeds, B fails,
    C is skipped, and D emits three materializations and two expectation
    results before succeeding.
    """
    now = time.time()

    def _record(step_key, seconds_ago, event_type, *event_data):
        # Forwards to _event_record with the timestamp set `seconds_ago`
        # before `now`; event data is passed only when one was given.
        return _event_record(run_id, step_key, now - seconds_ago, event_type, *event_data)

    def _materialization(asset_key):
        return StepMaterializationData(AssetMaterialization(asset_key=asset_key))

    def _expectation(success, label):
        return StepExpectationResultData(ExpectationResult(success=success, label=label))

    return [
        _record("A", 325, DagsterEventType.STEP_START),
        _record("A", 225, DagsterEventType.STEP_SUCCESS, StepSuccessData(duration_ms=100000.0)),
        _record("B", 225, DagsterEventType.STEP_START),
        _record(
            "B",
            175,
            DagsterEventType.STEP_FAILURE,
            StepFailureData(error=None, user_failure_data=None),
        ),
        _record("C", 175, DagsterEventType.STEP_START),
        _record("C", 150, DagsterEventType.STEP_SKIPPED),
        _record("D", 150, DagsterEventType.STEP_START),
        _record("D", 125, DagsterEventType.ASSET_MATERIALIZATION, _materialization("mat_1")),
        _record("D", 100, DagsterEventType.STEP_EXPECTATION_RESULT, _expectation(True, "exp 1")),
        _record("D", 75, DagsterEventType.ASSET_MATERIALIZATION, _materialization("mat_2")),
        _record("D", 50, DagsterEventType.STEP_EXPECTATION_RESULT, _expectation(False, "exp 2")),
        _record("D", 25, DagsterEventType.ASSET_MATERIALIZATION, _materialization("mat_3")),
        _record("D", 0, DagsterEventType.STEP_SUCCESS, StepSuccessData(duration_ms=150000.0)),
    ]